From 8173dc600ad68872f4e488c753f59b8a1e2093aa Mon Sep 17 00:00:00 2001
From: Ivan Efremov <ivan@neon.tech>
Date: Thu, 28 Nov 2024 08:32:22 +0200
Subject: [PATCH 001/117] proxy: spawn cancellation checks in the background
 (#9918)

## Problem
For cancellation, a connection is open during all the cancel checks.
## Summary of changes
Spawn cancellation checks in the background, and close connection
immediately.
Use task_tracker for cancellation checks.
---
 proxy/src/cancellation.rs           | 15 ++++++++-----
 proxy/src/console_redirect_proxy.rs | 35 +++++++++++++++++++++--------
 proxy/src/proxy/mod.rs              | 35 +++++++++++++++++++++--------
 proxy/src/redis/notifications.rs    |  2 +-
 proxy/src/serverless/mod.rs         |  9 ++++++++
 proxy/src/serverless/websocket.rs   |  3 +++
 6 files changed, 75 insertions(+), 24 deletions(-)
diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs
index 74415f1ffe..91e198bf88 100644
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -99,16 +99,17 @@ impl<P: CancellationPublisher> CancellationHandler<P> {
     /// Try to cancel a running query for the corresponding connection.
     /// If the cancellation key is not found, it will be published to Redis.
     /// check_allowed - if true, check if the IP is allowed to cancel the query
+    /// return Result primarily for tests
     pub(crate) async fn cancel_session(
         &self,
         key: CancelKeyData,
         session_id: Uuid,
-        peer_addr: &IpAddr,
+        peer_addr: IpAddr,
         check_allowed: bool,
     ) -> Result<(), CancelError> {
         // TODO: check for unspecified address is only for backward compatibility, should be removed
         if !peer_addr.is_unspecified() {
-            let subnet_key = match *peer_addr {
+            let subnet_key = match peer_addr {
                 IpAddr::V4(ip) => IpNet::V4(Ipv4Net::new_assert(ip, 24).trunc()), // use defaut mask here
                 IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new_assert(ip, 64).trunc()),
             };
@@ -141,9 +142,11 @@ impl<P: CancellationPublisher> CancellationHandler<P> {
                 return Ok(());
             }
 
-            match self.client.try_publish(key, session_id, *peer_addr).await {
+            match self.client.try_publish(key, session_id, peer_addr).await {
                 Ok(()) => {} // do nothing
                 Err(e) => {
+                    // log it here since cancel_session could be spawned in a task
+                    tracing::error!("failed to publish cancellation key: {key}, error: {e}");
                     return Err(CancelError::IO(std::io::Error::new(
                         std::io::ErrorKind::Other,
                         e.to_string(),
@@ -154,8 +157,10 @@ impl<P: CancellationPublisher> CancellationHandler<P> {
         };
 
         if check_allowed
-            && !check_peer_addr_is_in_list(peer_addr, cancel_closure.ip_allowlist.as_slice())
+            && !check_peer_addr_is_in_list(&peer_addr, cancel_closure.ip_allowlist.as_slice())
         {
+            // log it here since cancel_session could be spawned in a task
+            tracing::warn!("IP is not allowed to cancel the query: {key}");
             return Err(CancelError::IpNotAllowed);
         }
 
@@ -306,7 +311,7 @@ mod tests {
                     cancel_key: 0,
                 },
                 Uuid::new_v4(),
-                &("127.0.0.1".parse().unwrap()),
+                "127.0.0.1".parse().unwrap(),
                 true,
             )
             .await
diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs
index b910b524b1..8f78df1964 100644
--- a/proxy/src/console_redirect_proxy.rs
+++ b/proxy/src/console_redirect_proxy.rs
@@ -35,6 +35,7 @@ pub async fn task_main(
     socket2::SockRef::from(&listener).set_keepalive(true)?;
 
     let connections = tokio_util::task::task_tracker::TaskTracker::new();
+    let cancellations = tokio_util::task::task_tracker::TaskTracker::new();
 
     while let Some(accept_result) =
         run_until_cancelled(listener.accept(), &cancellation_token).await
@@ -48,6 +49,7 @@ pub async fn task_main(
 
         let session_id = uuid::Uuid::new_v4();
         let cancellation_handler = Arc::clone(&cancellation_handler);
+        let cancellations = cancellations.clone();
 
         debug!(protocol = "tcp", %session_id, "accepted new TCP connection");
 
@@ -96,6 +98,7 @@ pub async fn task_main(
                 cancellation_handler,
                 socket,
                 conn_gauge,
+                cancellations,
             )
             .instrument(ctx.span())
             .boxed()
@@ -127,10 +130,12 @@ pub async fn task_main(
     }
 
     connections.close();
+    cancellations.close();
     drop(listener);
 
     // Drain connections
     connections.wait().await;
+    cancellations.wait().await;
 
     Ok(())
 }
@@ -142,6 +147,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     cancellation_handler: Arc<CancellationHandlerMain>,
     stream: S,
     conn_gauge: NumClientConnectionsGuard<'static>,
+    cancellations: tokio_util::task::task_tracker::TaskTracker,
 ) -> Result<Option<ProxyPassthrough<CancellationHandlerMainInternal, S>>, ClientRequestError> {
     debug!(
         protocol = %ctx.protocol(),
@@ -161,15 +167,26 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
         match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? {
             HandshakeData::Startup(stream, params) => (stream, params),
             HandshakeData::Cancel(cancel_key_data) => {
-                return Ok(cancellation_handler
-                    .cancel_session(
-                        cancel_key_data,
-                        ctx.session_id(),
-                        &ctx.peer_addr(),
-                        config.authentication_config.ip_allowlist_check_enabled,
-                    )
-                    .await
-                    .map(|()| None)?)
+                // spawn a task to cancel the session, but don't wait for it
+                cancellations.spawn({
+                    let cancellation_handler_clone = Arc::clone(&cancellation_handler);
+                    let session_id = ctx.session_id();
+                    let peer_ip = ctx.peer_addr();
+                    async move {
+                        drop(
+                            cancellation_handler_clone
+                                .cancel_session(
+                                    cancel_key_data,
+                                    session_id,
+                                    peer_ip,
+                                    config.authentication_config.ip_allowlist_check_enabled,
+                                )
+                                .await,
+                        );
+                    }
+                });
+
+                return Ok(None);
             }
         };
     drop(pause);
diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs
index 7fe67e43de..956036d29d 100644
--- a/proxy/src/proxy/mod.rs
+++ b/proxy/src/proxy/mod.rs
@@ -69,6 +69,7 @@ pub async fn task_main(
     socket2::SockRef::from(&listener).set_keepalive(true)?;
 
     let connections = tokio_util::task::task_tracker::TaskTracker::new();
+    let cancellations = tokio_util::task::task_tracker::TaskTracker::new();
 
     while let Some(accept_result) =
         run_until_cancelled(listener.accept(), &cancellation_token).await
@@ -82,6 +83,7 @@ pub async fn task_main(
 
         let session_id = uuid::Uuid::new_v4();
         let cancellation_handler = Arc::clone(&cancellation_handler);
+        let cancellations = cancellations.clone();
 
         debug!(protocol = "tcp", %session_id, "accepted new TCP connection");
         let endpoint_rate_limiter2 = endpoint_rate_limiter.clone();
@@ -133,6 +135,7 @@ pub async fn task_main(
                 ClientMode::Tcp,
                 endpoint_rate_limiter2,
                 conn_gauge,
+                cancellations,
             )
             .instrument(ctx.span())
             .boxed()
@@ -164,10 +167,12 @@ pub async fn task_main(
     }
 
     connections.close();
+    cancellations.close();
     drop(listener);
 
     // Drain connections
     connections.wait().await;
+    cancellations.wait().await;
 
     Ok(())
 }
@@ -250,6 +255,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     mode: ClientMode,
     endpoint_rate_limiter: Arc<EndpointRateLimiter>,
     conn_gauge: NumClientConnectionsGuard<'static>,
+    cancellations: tokio_util::task::task_tracker::TaskTracker,
 ) -> Result<Option<ProxyPassthrough<CancellationHandlerMainInternal, S>>, ClientRequestError> {
     debug!(
         protocol = %ctx.protocol(),
@@ -270,15 +276,26 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
         match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? {
             HandshakeData::Startup(stream, params) => (stream, params),
             HandshakeData::Cancel(cancel_key_data) => {
-                return Ok(cancellation_handler
-                    .cancel_session(
-                        cancel_key_data,
-                        ctx.session_id(),
-                        &ctx.peer_addr(),
-                        config.authentication_config.ip_allowlist_check_enabled,
-                    )
-                    .await
-                    .map(|()| None)?)
+                // spawn a task to cancel the session, but don't wait for it
+                cancellations.spawn({
+                    let cancellation_handler_clone = Arc::clone(&cancellation_handler);
+                    let session_id = ctx.session_id();
+                    let peer_ip = ctx.peer_addr();
+                    async move {
+                        drop(
+                            cancellation_handler_clone
+                                .cancel_session(
+                                    cancel_key_data,
+                                    session_id,
+                                    peer_ip,
+                                    config.authentication_config.ip_allowlist_check_enabled,
+                                )
+                                .await,
+                        );
+                    }
+                });
+
+                return Ok(None);
             }
         };
     drop(pause);
diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs
index 65008ae943..9ac07b7e90 100644
--- a/proxy/src/redis/notifications.rs
+++ b/proxy/src/redis/notifications.rs
@@ -149,7 +149,7 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
                     .cancel_session(
                         cancel_session.cancel_key_data,
                         uuid::Uuid::nil(),
-                        &peer_addr,
+                        peer_addr,
                         cancel_session.peer_addr.is_some(),
                     )
                     .await
diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs
index 77025f419d..80b42f9e55 100644
--- a/proxy/src/serverless/mod.rs
+++ b/proxy/src/serverless/mod.rs
@@ -132,6 +132,7 @@ pub async fn task_main(
     let connections = tokio_util::task::task_tracker::TaskTracker::new();
     connections.close(); // allows `connections.wait to complete`
 
+    let cancellations = tokio_util::task::task_tracker::TaskTracker::new();
     while let Some(res) = run_until_cancelled(ws_listener.accept(), &cancellation_token).await {
         let (conn, peer_addr) = res.context("could not accept TCP stream")?;
         if let Err(e) = conn.set_nodelay(true) {
@@ -160,6 +161,7 @@ pub async fn task_main(
         let connections2 = connections.clone();
         let cancellation_handler = cancellation_handler.clone();
         let endpoint_rate_limiter = endpoint_rate_limiter.clone();
+        let cancellations = cancellations.clone();
         connections.spawn(
             async move {
                 let conn_token2 = conn_token.clone();
@@ -188,6 +190,7 @@ pub async fn task_main(
                     config,
                     backend,
                     connections2,
+                    cancellations,
                     cancellation_handler,
                     endpoint_rate_limiter,
                     conn_token,
@@ -313,6 +316,7 @@ async fn connection_handler(
     config: &'static ProxyConfig,
     backend: Arc<PoolingBackend>,
     connections: TaskTracker,
+    cancellations: TaskTracker,
     cancellation_handler: Arc<CancellationHandlerMain>,
     endpoint_rate_limiter: Arc<EndpointRateLimiter>,
     cancellation_token: CancellationToken,
@@ -353,6 +357,7 @@ async fn connection_handler(
 
             // `request_handler` is not cancel safe. It expects to be cancelled only at specific times.
             // By spawning the future, we ensure it never gets cancelled until it decides to.
+            let cancellations = cancellations.clone();
             let handler = connections.spawn(
                 request_handler(
                     req,
@@ -364,6 +369,7 @@ async fn connection_handler(
                     conn_info2.clone(),
                     http_request_token,
                     endpoint_rate_limiter.clone(),
+                    cancellations,
                 )
                 .in_current_span()
                 .map_ok_or_else(api_error_into_response, |r| r),
@@ -411,6 +417,7 @@ async fn request_handler(
     // used to cancel in-flight HTTP requests. not used to cancel websockets
     http_cancellation_token: CancellationToken,
     endpoint_rate_limiter: Arc<EndpointRateLimiter>,
+    cancellations: TaskTracker,
 ) -> Result<Response<BoxBody<Bytes, hyper::Error>>, ApiError> {
     let host = request
         .headers()
@@ -436,6 +443,7 @@ async fn request_handler(
         let (response, websocket) = framed_websockets::upgrade::upgrade(&mut request)
             .map_err(|e| ApiError::BadRequest(e.into()))?;
 
+        let cancellations = cancellations.clone();
         ws_connections.spawn(
             async move {
                 if let Err(e) = websocket::serve_websocket(
@@ -446,6 +454,7 @@ async fn request_handler(
                     cancellation_handler,
                     endpoint_rate_limiter,
                     host,
+                    cancellations,
                 )
                 .await
                 {
diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs
index 4088fea835..bdb83fe6be 100644
--- a/proxy/src/serverless/websocket.rs
+++ b/proxy/src/serverless/websocket.rs
@@ -123,6 +123,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncBufRead for WebSocketRw<S> {
     }
 }
 
+#[allow(clippy::too_many_arguments)]
 pub(crate) async fn serve_websocket(
     config: &'static ProxyConfig,
     auth_backend: &'static crate::auth::Backend<'static, ()>,
@@ -131,6 +132,7 @@ pub(crate) async fn serve_websocket(
     cancellation_handler: Arc<CancellationHandlerMain>,
     endpoint_rate_limiter: Arc<EndpointRateLimiter>,
     hostname: Option<String>,
+    cancellations: tokio_util::task::task_tracker::TaskTracker,
 ) -> anyhow::Result<()> {
     let websocket = websocket.await?;
     let websocket = WebSocketServer::after_handshake(TokioIo::new(websocket));
@@ -149,6 +151,7 @@ pub(crate) async fn serve_websocket(
         ClientMode::Websockets { hostname },
         endpoint_rate_limiter,
         conn_gauge,
+        cancellations,
     ))
     .await;
 

From e82f7f0dfc1571ddbbb4ff37c1c94579a7101834 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Thu, 28 Nov 2024 10:11:08 +0000
Subject: [PATCH 002/117] remote_storage/abs: count 404 and 304 for get as ok
 for metrics (#9912)

## Problem

We currently see elevated levels of errors for GetBlob requests. This is
because 404 and 304 are counted as errors for metric reporting.

## Summary of Changes

Bring the implementation in line with the S3 client and treat 404 and
304 responses as ok for metric purposes.

Related: https://github.com/neondatabase/cloud/issues/20666
---
 libs/remote_storage/src/azure_blob.rs | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs
index ae0a94295c..840917ef68 100644
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -220,6 +220,11 @@ impl AzureBlobStorage {
         let started_at = ScopeGuard::into_inner(started_at);
         let outcome = match &download {
             Ok(_) => AttemptOutcome::Ok,
+            // At this level in the stack 404 and 304 responses do not indicate an error.
+            // There's expected cases when a blob may not exist or hasn't been modified since
+            // the last get (e.g. probing for timeline indices and heatmap downloads).
+            // Callers should handle errors if they are unexpected.
+            Err(DownloadError::NotFound | DownloadError::Unmodified) => AttemptOutcome::Ok,
             Err(_) => AttemptOutcome::Err,
         };
         crate::metrics::BUCKET_METRICS

From 70780e310c9640650eeb8b5cb0838bebd1c6c0ff Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Thu, 28 Nov 2024 16:48:18 +0100
Subject: [PATCH 003/117] Makefile: build pg_visibility (#9922)

Build the `pg_visibility` extension for use with `neon_local`. This is
useful to inspect the visibility map for debugging.

Touches #9914.
---
 Makefile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Makefile b/Makefile
index dc67b87239..9cffc74508 100644
--- a/Makefile
+++ b/Makefile
@@ -147,6 +147,8 @@ postgres-%: postgres-configure-% \
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_prewarm install
 	+@echo "Compiling pg_buffercache $*"
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache install
+	+@echo "Compiling pg_visibility $*"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_visibility install
 	+@echo "Compiling pageinspect $*"
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install
 	+@echo "Compiling amcheck $*"

From eb5d832e6fea0e1c3c14b9e6024fce916c3f1c32 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Thu, 28 Nov 2024 16:49:30 +0100
Subject: [PATCH 004/117] Update rust to 1.83.0, also update cargo adjacent
 tools (#9926)

We keep the practice of keeping the compiler up to date, pointing to the
latest release. This is done by many other projects in the Rust
ecosystem as well.

[Release notes](https://releases.rs/docs/1.83.0/).

Also update `cargo-hakari`, `cargo-deny`, `cargo-hack` and
`cargo-nextest` to their latest versions.

Prior update was in #9445.
---
 build-tools.Dockerfile | 18 +++++++++---------
 rust-toolchain.toml    |  2 +-
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile
index 4f491afec5..2671702697 100644
--- a/build-tools.Dockerfile
+++ b/build-tools.Dockerfile
@@ -57,9 +57,9 @@ RUN mkdir -p /pgcopydb/bin && \
     mkdir -p /pgcopydb/lib && \
     chmod -R 755 /pgcopydb && \
     chown -R nonroot:nonroot /pgcopydb
-        
-COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/pgcopydb 
-COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5 
+
+COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/pgcopydb
+COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5
 
 # System deps
 #
@@ -258,14 +258,14 @@ WORKDIR /home/nonroot
 
 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.82.0
+ENV RUSTC_VERSION=1.83.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 ARG RUSTFILT_VERSION=0.2.1
-ARG CARGO_HAKARI_VERSION=0.9.30
-ARG CARGO_DENY_VERSION=0.16.1
-ARG CARGO_HACK_VERSION=0.6.31
-ARG CARGO_NEXTEST_VERSION=0.9.72
+ARG CARGO_HAKARI_VERSION=0.9.33
+ARG CARGO_DENY_VERSION=0.16.2
+ARG CARGO_HACK_VERSION=0.6.33
+ARG CARGO_NEXTEST_VERSION=0.9.85
 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
 	chmod +x rustup-init && \
 	./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \
@@ -289,7 +289,7 @@ RUN whoami \
     && cargo --version --verbose \
     && rustup --version --verbose \
     && rustc --version --verbose \
-    && clang --version 
+    && clang --version
 
 RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
     LD_LIBRARY_PATH=/pgcopydb/lib /pgcopydb/bin/pgcopydb --version; \
diff --git a/rust-toolchain.toml b/rust-toolchain.toml
index 92b7929c7f..f0661a32e0 100644
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -1,5 +1,5 @@
 [toolchain]
-channel = "1.82.0"
+channel = "1.83.0"
 profile = "default"
 # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
 # https://rust-lang.github.io/rustup/concepts/profiles.html

From eb520a14ce12dc16f33f39964632982f6c14b9f3 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Thu, 28 Nov 2024 17:38:47 +0000
Subject: [PATCH 005/117] pageserver: return correct LSN for interpreted proto
 keep alive responses (#9928)

## Problem

For the interpreted proto the pageserver is not returning the correct
LSN
in replies to keep alive requests. This is because the interpreted
protocol arm
was not updating `last_rec_lsn`.

## Summary of changes

* Return correct LSN in keep-alive responses
* Fix shard field in wal sender traces
---
 .../tenant/timeline/walreceiver/walreceiver_connection.rs    | 4 ++++
 safekeeper/src/handler.rs                                    | 5 +++--
 safekeeper/src/wal_service.rs                                | 2 +-
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
index 31cf1b6307..d90ffbfa2c 100644
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -454,6 +454,10 @@ pub(super) async fn handle_walreceiver_connection(
                     timeline.get_last_record_lsn()
                 );
 
+                if let Some(lsn) = next_record_lsn {
+                    last_rec_lsn = lsn;
+                }
+
                 Some(streaming_lsn)
             }
 
diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs
index 22f33b17e0..8dd2929a03 100644
--- a/safekeeper/src/handler.rs
+++ b/safekeeper/src/handler.rs
@@ -212,8 +212,9 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
                 );
 
             if let Some(shard) = self.shard.as_ref() {
-                tracing::Span::current()
-                    .record("shard", tracing::field::display(shard.shard_slug()));
+                if let Some(slug) = shard.shard_slug().strip_prefix("-") {
+                    tracing::Span::current().record("shard", tracing::field::display(slug));
+                }
             }
 
             Ok(())
diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs
index 1ab54d4cce..5248d545db 100644
--- a/safekeeper/src/wal_service.rs
+++ b/safekeeper/src/wal_service.rs
@@ -44,7 +44,7 @@ pub async fn task_main(
                     error!("connection handler exited: {}", err);
                 }
             }
-            .instrument(info_span!("", cid = %conn_id, ttid = field::Empty, application_name = field::Empty)),
+            .instrument(info_span!("", cid = %conn_id, ttid = field::Empty, application_name = field::Empty, shard = field::Empty)),
         );
     }
 }

From e04dd3be0b6bd6702fa6e3301c9b7202d72ccc1c Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Thu, 28 Nov 2024 19:02:57 +0000
Subject: [PATCH 006/117] test_runner: rerun all failed tests (#9917)

## Problem

Currently, we rerun only known flaky tests. This approach was chosen to
reduce the number of tests that go unnoticed (by forcing people to take
a look at failed tests and rerun the job manually), but it has some
drawbacks:
- In PRs, people tend to push new changes without checking failed tests
(that's ok)
- In the main, tests are just restarted without checking
(understandable)
- Parametrised tests become flaky one by one, i.e. if `test[1]` is flaky
`, test[2]` is not marked as flaky automatically (which may or may not
be the case).

I suggest rerunning all failed tests to increase the stability of GitHub
jobs and using the Grafana Dashboard with flaky tests for deeper
analysis.

## Summary of changes
- Rerun all failed tests twice at max
---
 .../actions/run-python-test-set/action.yml    |  17 +-
 .github/workflows/_build-and-test-locally.yml |   2 +-
 poetry.lock                                   |  12 +-
 pyproject.toml                                |   2 +-
 scripts/flaky_tests.py                        | 147 ------------------
 test_runner/conftest.py                       |   2 +-
 test_runner/fixtures/flaky.py                 |  78 ----------
 test_runner/fixtures/paths.py                 |   2 +-
 test_runner/fixtures/reruns.py                |  31 ++++
 9 files changed, 46 insertions(+), 247 deletions(-)
 delete mode 100755 scripts/flaky_tests.py
 delete mode 100644 test_runner/fixtures/flaky.py
 create mode 100644 test_runner/fixtures/reruns.py

diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml
index 275f161019..1159627302 100644
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -36,8 +36,8 @@ inputs:
     description: 'Region name for real s3 tests'
     required: false
     default: ''
-  rerun_flaky:
-    description: 'Whether to rerun flaky tests'
+  rerun_failed:
+    description: 'Whether to rerun failed tests'
     required: false
     default: 'false'
   pg_version:
@@ -108,7 +108,7 @@ runs:
         COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
         ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage')
         ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
-        RERUN_FLAKY: ${{ inputs.rerun_flaky }}
+        RERUN_FAILED: ${{ inputs.rerun_failed }}
         PG_VERSION: ${{ inputs.pg_version }}
       shell: bash -euxo pipefail {0}
       run: |
@@ -154,15 +154,8 @@ runs:
           EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS"
         fi
 
-        if [ "${RERUN_FLAKY}" == "true" ]; then
-          mkdir -p $TEST_OUTPUT
-          poetry run ./scripts/flaky_tests.py "${TEST_RESULT_CONNSTR}" \
-                                              --days 7 \
-                                              --output "$TEST_OUTPUT/flaky.json" \
-                                              --pg-version "${DEFAULT_PG_VERSION}" \
-                                              --build-type "${BUILD_TYPE}"
-
-          EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS"
+        if [ "${RERUN_FAILED}" == "true" ]; then
+          EXTRA_PARAMS="--reruns 2 $EXTRA_PARAMS"
         fi
 
         # We use pytest-split plugin to run benchmarks in parallel on different CI runners
diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml
index bdf7c07c6a..42c32a23e3 100644
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -293,7 +293,7 @@ jobs:
           run_with_real_s3: true
           real_s3_bucket: neon-github-ci-tests
           real_s3_region: eu-central-1
-          rerun_flaky: true
+          rerun_failed: true
           pg_version: ${{ matrix.pg_version }}
         env:
           TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
diff --git a/poetry.lock b/poetry.lock
index e2fca7be47..59ae5cf1ca 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -2563,18 +2563,18 @@ pytest = "*"
 
 [[package]]
 name = "pytest-rerunfailures"
-version = "13.0"
+version = "15.0"
 description = "pytest plugin to re-run tests to eliminate flaky failures"
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.9"
 files = [
-    {file = "pytest-rerunfailures-13.0.tar.gz", hash = "sha256:e132dbe420bc476f544b96e7036edd0a69707574209b6677263c950d19b09199"},
-    {file = "pytest_rerunfailures-13.0-py3-none-any.whl", hash = "sha256:34919cb3fcb1f8e5d4b940aa75ccdea9661bade925091873b7c6fa5548333069"},
+    {file = "pytest-rerunfailures-15.0.tar.gz", hash = "sha256:2d9ac7baf59f4c13ac730b47f6fa80e755d1ba0581da45ce30b72fb3542b4474"},
+    {file = "pytest_rerunfailures-15.0-py3-none-any.whl", hash = "sha256:dd150c4795c229ef44320adc9a0c0532c51b78bb7a6843a8c53556b9a611df1a"},
 ]
 
 [package.dependencies]
 packaging = ">=17.1"
-pytest = ">=7"
+pytest = ">=7.4,<8.2.2 || >8.2.2"
 
 [[package]]
 name = "pytest-split"
@@ -3524,4 +3524,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "21debe1116843e5d14bdf37d6e265c68c63a98a64ba04ec8b8a02af2e8d9f486"
+content-hash = "426c385df93f578ba3537c40a269535e27fbcca1978b3cf266096ecbc298c6a9"
diff --git a/pyproject.toml b/pyproject.toml
index ccd3ab1864..01d15ee6bb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -33,7 +33,7 @@ types-psutil = "^5.9.5.12"
 types-toml = "^0.10.8.6"
 pytest-httpserver = "^1.0.8"
 aiohttp = "3.10.11"
-pytest-rerunfailures = "^13.0"
+pytest-rerunfailures = "^15.0"
 types-pytest-lazy-fixture = "^0.6.3.3"
 pytest-split = "^0.8.1"
 zstandard = "^0.21.0"
diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py
deleted file mode 100755
index 3fb668ed2d..0000000000
--- a/scripts/flaky_tests.py
+++ /dev/null
@@ -1,147 +0,0 @@
-#! /usr/bin/env python3
-
-from __future__ import annotations
-
-import argparse
-import json
-import logging
-import os
-from collections import defaultdict
-from typing import TYPE_CHECKING
-
-import psycopg2
-import psycopg2.extras
-import toml
-
-if TYPE_CHECKING:
-    from typing import Any
-
-FLAKY_TESTS_QUERY = """
-    SELECT
-        DISTINCT parent_suite, suite, name
-    FROM results
-    WHERE
-        started_at > CURRENT_DATE - INTERVAL '%s' day
-        AND (
-            (status IN ('failed', 'broken') AND reference = 'refs/heads/main')
-            OR flaky
-        )
-    ;
-"""
-
-
-def main(args: argparse.Namespace):
-    connstr = args.connstr
-    interval_days = args.days
-    output = args.output
-
-    build_type = args.build_type
-    pg_version = args.pg_version
-
-    res: defaultdict[str, defaultdict[str, dict[str, bool]]]
-    res = defaultdict(lambda: defaultdict(dict))
-
-    try:
-        logging.info("connecting to the database...")
-        with psycopg2.connect(connstr, connect_timeout=30) as conn:
-            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
-                logging.info("fetching flaky tests...")
-                cur.execute(FLAKY_TESTS_QUERY, (interval_days,))
-                rows = cur.fetchall()
-    except psycopg2.OperationalError as exc:
-        logging.error("cannot fetch flaky tests from the DB due to an error", exc)
-        rows = []
-
-    # If a test run has non-default PAGESERVER_VIRTUAL_FILE_IO_ENGINE (i.e. not empty, not tokio-epoll-uring),
-    # use it to parametrize test name along with build_type and pg_version
-    #
-    # See test_runner/fixtures/parametrize.py for details
-    if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in (
-        "",
-        "tokio-epoll-uring",
-    ):
-        pageserver_virtual_file_io_engine_parameter = f"-{io_engine}"
-    else:
-        pageserver_virtual_file_io_engine_parameter = ""
-
-    # re-use existing records of flaky tests from before parametrization by compaction_algorithm
-    def get_pageserver_default_tenant_config_compaction_algorithm() -> dict[str, Any] | None:
-        """Duplicated from parametrize.py"""
-        toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM")
-        if toml_table is None:
-            return None
-        v = toml.loads(toml_table)
-        assert isinstance(v, dict)
-        return v
-
-    pageserver_default_tenant_config_compaction_algorithm_parameter = ""
-    if (
-        explicit_default := get_pageserver_default_tenant_config_compaction_algorithm()
-    ) is not None:
-        pageserver_default_tenant_config_compaction_algorithm_parameter = (
-            f"-{explicit_default['kind']}"
-        )
-
-    for row in rows:
-        # We don't want to automatically rerun tests in a performance suite
-        if row["parent_suite"] != "test_runner.regress":
-            continue
-
-        if row["name"].endswith("]"):
-            parametrized_test = row["name"].replace(
-                "[",
-                f"[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}{pageserver_default_tenant_config_compaction_algorithm_parameter}-",
-            )
-        else:
-            parametrized_test = f"{row['name']}[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}{pageserver_default_tenant_config_compaction_algorithm_parameter}]"
-
-        res[row["parent_suite"]][row["suite"]][parametrized_test] = True
-
-        logging.info(
-            f"\t{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{parametrized_test}"
-        )
-
-    logging.info(f"saving results to {output.name}")
-    json.dump(res, output, indent=2)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Detect flaky tests in the last N days")
-    parser.add_argument(
-        "--output",
-        type=argparse.FileType("w"),
-        default="flaky.json",
-        help="path to output json file (default: flaky.json)",
-    )
-    parser.add_argument(
-        "--days",
-        required=False,
-        default=10,
-        type=int,
-        help="how many days to look back for flaky tests (default: 10)",
-    )
-    parser.add_argument(
-        "--build-type",
-        required=True,
-        type=str,
-        help="for which build type to create list of flaky tests (debug or release)",
-    )
-    parser.add_argument(
-        "--pg-version",
-        required=True,
-        type=int,
-        help="for which Postgres version to create list of flaky tests (14, 15, etc.)",
-    )
-    parser.add_argument(
-        "connstr",
-        help="connection string to the test results database",
-    )
-    args = parser.parse_args()
-
-    level = logging.INFO
-    logging.basicConfig(
-        format="%(message)s",
-        level=level,
-    )
-
-    main(args)
diff --git a/test_runner/conftest.py b/test_runner/conftest.py
index 84eda52d33..887bfef478 100644
--- a/test_runner/conftest.py
+++ b/test_runner/conftest.py
@@ -13,5 +13,5 @@ pytest_plugins = (
     "fixtures.pg_stats",
     "fixtures.compare_fixtures",
     "fixtures.slow",
-    "fixtures.flaky",
+    "fixtures.reruns",
 )
diff --git a/test_runner/fixtures/flaky.py b/test_runner/fixtures/flaky.py
deleted file mode 100644
index 01634a29c5..0000000000
--- a/test_runner/fixtures/flaky.py
+++ /dev/null
@@ -1,78 +0,0 @@
-from __future__ import annotations
-
-import json
-from collections.abc import MutableMapping
-from pathlib import Path
-from typing import TYPE_CHECKING, cast
-
-import pytest
-from _pytest.config import Config
-from _pytest.config.argparsing import Parser
-from allure_commons.types import LabelType
-from allure_pytest.utils import allure_name, allure_suite_labels
-
-from fixtures.log_helper import log
-
-if TYPE_CHECKING:
-    from collections.abc import MutableMapping
-    from typing import Any
-
-
-"""
-The plugin reruns flaky tests.
-It uses `pytest.mark.flaky` provided by `pytest-rerunfailures` plugin and flaky tests detected by `scripts/flaky_tests.py`
-
-Note: the logic of getting flaky tests is extracted to a separate script to avoid running it for each of N xdist workers
-"""
-
-
-def pytest_addoption(parser: Parser):
-    parser.addoption(
-        "--flaky-tests-json",
-        action="store",
-        type=Path,
-        help="Path to json file with flaky tests generated by scripts/flaky_tests.py",
-    )
-
-
-def pytest_collection_modifyitems(config: Config, items: list[pytest.Item]):
-    if not config.getoption("--flaky-tests-json"):
-        return
-
-    # Any error with getting flaky tests aren't critical, so just do not rerun any tests
-    flaky_json = config.getoption("--flaky-tests-json")
-    if not flaky_json.exists():
-        return
-
-    content = flaky_json.read_text()
-    try:
-        flaky_tests = json.loads(content)
-    except ValueError:
-        log.error(f"Can't parse {content} as json")
-        return
-
-    for item in items:
-        # Use the same logic for constructing test name as Allure does (we store allure-provided data in DB)
-        # Ref https://github.com/allure-framework/allure-python/blob/2.13.1/allure-pytest/src/listener.py#L98-L100
-        allure_labels = dict(allure_suite_labels(item))
-        parent_suite = str(allure_labels.get(LabelType.PARENT_SUITE))
-        suite = str(allure_labels.get(LabelType.SUITE))
-        params = item.callspec.params if hasattr(item, "callspec") else {}
-        name = allure_name(item, params)
-
-        if flaky_tests.get(parent_suite, {}).get(suite, {}).get(name, False):
-            # Rerun 3 times = 1 original run + 2 reruns
-            log.info(f"Marking {item.nodeid} as flaky. It will be rerun up to 3 times")
-            item.add_marker(pytest.mark.flaky(reruns=2))
-
-            # pytest-rerunfailures is not compatible with pytest-timeout (timeout is not set for reruns),
-            #   we can workaround it by setting `timeout_func_only` to True[1].
-            # Unfortunately, setting `timeout_func_only = True` globally in pytest.ini is broken[2],
-            #   but we still can do it using pytest marker.
-            #
-            # - [1] https://github.com/pytest-dev/pytest-rerunfailures/issues/99
-            # - [2] https://github.com/pytest-dev/pytest-timeout/issues/142
-            timeout_marker = item.get_closest_marker("timeout")
-            if timeout_marker is not None:
-                kwargs = cast("MutableMapping[str, Any]", timeout_marker.kwargs)
-                kwargs["func_only"] = True
diff --git a/test_runner/fixtures/paths.py b/test_runner/fixtures/paths.py
index 1c71abea19..80777d65e9 100644
--- a/test_runner/fixtures/paths.py
+++ b/test_runner/fixtures/paths.py
@@ -30,7 +30,7 @@ def get_test_dir(request: FixtureRequest, top_output_dir: Path, prefix: str | No
     test_name = request.node.name
     test_dir = top_output_dir / f"{prefix or ''}{test_name.replace('/', '-')}"
 
-    # We rerun flaky tests multiple times, use a separate directory for each run.
+    # We rerun failed tests multiple times, use a separate directory for each run.
     if (suffix := getattr(request.node, "execution_count", None)) is not None:
         test_dir = test_dir.parent / f"{test_dir.name}-{suffix}"
 
diff --git a/test_runner/fixtures/reruns.py b/test_runner/fixtures/reruns.py
new file mode 100644
index 0000000000..f2a25ae8f6
--- /dev/null
+++ b/test_runner/fixtures/reruns.py
@@ -0,0 +1,31 @@
+from __future__ import annotations
+
+from collections.abc import MutableMapping
+from typing import TYPE_CHECKING, cast
+
+import pytest
+
+if TYPE_CHECKING:
+    from collections.abc import MutableMapping
+    from typing import Any
+
+    from _pytest.config import Config
+
+
+def pytest_collection_modifyitems(config: Config, items: list[pytest.Item]):
+    # pytest-rerunfailures is not compatible with pytest-timeout (timeout is not set for reruns),
+    #   we can workaround it by setting `timeout_func_only` to True[1].
+    # Unfortunately, setting `timeout_func_only = True` globally in pytest.ini is broken[2],
+    #   but we still can do it using pytest marker.
+    #
+    # - [1] https://github.com/pytest-dev/pytest-rerunfailures/issues/99
+    # - [2] https://github.com/pytest-dev/pytest-timeout/issues/142
+
+    if not config.getoption("--reruns"):
+        return
+
+    for item in items:
+        timeout_marker = item.get_closest_marker("timeout")
+        if timeout_marker is not None:
+            kwargs = cast("MutableMapping[str, Any]", timeout_marker.kwargs)
+            kwargs["func_only"] = True

From 42fb3c4d30bf93ad0ad85bbd636a4262d205f673 Mon Sep 17 00:00:00 2001
From: Alexey Kondratov <kondratov.aleksey@gmail.com>
Date: Thu, 28 Nov 2024 22:38:30 +0100
Subject: [PATCH 007/117] fix(compute_ctl): Allow usage of DB names with
 whitespaces (#9919)

## Problem

We used `set_path()` to replace the database name in the connection
string. It automatically does url-safe encoding if the path is not
already encoded, but it does it as per the URL standard, which assumes
that tabs can be safely removed from the path without changing the
meaning of the URL. See, e.g.,
https://url.spec.whatwg.org/#concept-basic-url-parser. It also breaks
for DBs with properly %-encoded names, like with `%20`, as they are kept
intact, but actually should be escaped.

Yet, this is not true for Postgres, where it's completely valid to have
trailing tabs in the database name.

I think this is the PR that caused this regression
https://github.com/neondatabase/neon/pull/9717, as it switched from
`postgres::config::Config` back to `set_path()`.

This was fixed a while ago already [1], btw, I just haven't added a test
to catch this regression back then :(

## Summary of changes

This commit changes the code back to use
`postgres/tokio_postgres::Config` everywhere.

While on it, also do some changes around, as I had to touch this code:
1. Bump some logging from `debug` to `info` in the spec apply path. We
do not use `debug` in prod, and it was tricky to understand what was
going on with this bug in prod.
2. Refactor configuration concurrency calculation code so it was
reusable. Yet, still keep `1` in the case of reconfiguration. The
database can be actively used at this moment, so we cannot guarantee
that there will be enough spare connection slots, and the underlying
code won't handle connection errors properly.
3. Simplify the installed extensions code. It was spawning a blocking
task inside async function, which doesn't make much sense. Instead, just
have a main sync function and call it with `spawn_blocking` in the API
code -- the only place we need it to be async.
4. Add regression python test to cover this and related problems in the
future. Also, add more extensive testing of schema dump and DBs and
roles listing API.

[1]:
https://github.com/neondatabase/neon/commit/4d1e48f3b9a4b7064787513fd2c455f0001f6e18
[2]:
https://www.postgresql.org/message-id/flat/20151023003445.931.91267%40wrigleys.postgresql.org

Resolves neondatabase/cloud#20869
---
 compute_tools/src/catalog.rs                |  39 ++++-
 compute_tools/src/compute.rs                | 153 +++++++++++---------
 compute_tools/src/http/api.rs               |   7 +-
 compute_tools/src/installed_extensions.rs   | 105 +++++---------
 compute_tools/src/pg_helpers.rs             |  11 ++
 test_runner/fixtures/endpoint/http.py       |   6 +-
 test_runner/fixtures/neon_fixtures.py       |  29 ++++
 test_runner/regress/test_compute_catalog.py | 111 +++++++++++++-
 8 files changed, 318 insertions(+), 143 deletions(-)

diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs
index 2f6f82dd39..08ae8bf44d 100644
--- a/compute_tools/src/catalog.rs
+++ b/compute_tools/src/catalog.rs
@@ -1,4 +1,3 @@
-use compute_api::responses::CatalogObjects;
 use futures::Stream;
 use postgres::NoTls;
 use std::{path::Path, process::Stdio, result::Result, sync::Arc};
@@ -13,7 +12,8 @@ use tokio_util::codec::{BytesCodec, FramedRead};
 use tracing::warn;
 
 use crate::compute::ComputeNode;
-use crate::pg_helpers::{get_existing_dbs_async, get_existing_roles_async};
+use crate::pg_helpers::{get_existing_dbs_async, get_existing_roles_async, postgres_conf_for_db};
+use compute_api::responses::CatalogObjects;
 
 pub async fn get_dbs_and_roles(compute: &Arc<ComputeNode>) -> anyhow::Result<CatalogObjects> {
     let connstr = compute.connstr.clone();
@@ -43,6 +43,8 @@ pub enum SchemaDumpError {
     DatabaseDoesNotExist,
     #[error("Failed to execute pg_dump.")]
     IO(#[from] std::io::Error),
+    #[error("Unexpected error.")]
+    Unexpected,
 }
 
 // It uses the pg_dump utility to dump the schema of the specified database.
@@ -60,11 +62,38 @@ pub async fn get_database_schema(
     let pgbin = &compute.pgbin;
     let basepath = Path::new(pgbin).parent().unwrap();
     let pgdump = basepath.join("pg_dump");
-    let mut connstr = compute.connstr.clone();
-    connstr.set_path(dbname);
+
+    // Replace the DB in the connection string and disable it to parts.
+    // This is the only option to handle DBs with special characters.
+    let conf =
+        postgres_conf_for_db(&compute.connstr, dbname).map_err(|_| SchemaDumpError::Unexpected)?;
+    let host = conf
+        .get_hosts()
+        .first()
+        .ok_or(SchemaDumpError::Unexpected)?;
+    let host = match host {
+        tokio_postgres::config::Host::Tcp(ip) => ip.to_string(),
+        #[cfg(unix)]
+        tokio_postgres::config::Host::Unix(path) => path.to_string_lossy().to_string(),
+    };
+    let port = conf
+        .get_ports()
+        .first()
+        .ok_or(SchemaDumpError::Unexpected)?;
+    let user = conf.get_user().ok_or(SchemaDumpError::Unexpected)?;
+    let dbname = conf.get_dbname().ok_or(SchemaDumpError::Unexpected)?;
+
     let mut cmd = Command::new(pgdump)
+        // XXX: this seems to be the only option to deal with DBs with `=` in the name
+        // See <https://www.postgresql.org/message-id/flat/20151023003445.931.91267%40wrigleys.postgresql.org>
+        .env("PGDATABASE", dbname)
+        .arg("--host")
+        .arg(host)
+        .arg("--port")
+        .arg(port.to_string())
+        .arg("--username")
+        .arg(user)
         .arg("--schema-only")
-        .arg(connstr.as_str())
         .stdout(Stdio::piped())
         .stderr(Stdio::piped())
         .kill_on_drop(true)
diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 4f67425ba8..1a026a4014 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -34,9 +34,8 @@ use utils::measured_stream::MeasuredReader;
 use nix::sys::signal::{kill, Signal};
 use remote_storage::{DownloadError, RemotePath};
 use tokio::spawn;
-use url::Url;
 
-use crate::installed_extensions::get_installed_extensions_sync;
+use crate::installed_extensions::get_installed_extensions;
 use crate::local_proxy;
 use crate::pg_helpers::*;
 use crate::spec::*;
@@ -816,30 +815,32 @@ impl ComputeNode {
         Ok(())
     }
 
-    async fn get_maintenance_client(url: &Url) -> Result<tokio_postgres::Client> {
-        let mut connstr = url.clone();
+    async fn get_maintenance_client(
+        conf: &tokio_postgres::Config,
+    ) -> Result<tokio_postgres::Client> {
+        let mut conf = conf.clone();
 
-        connstr
-            .query_pairs_mut()
-            .append_pair("application_name", "apply_config");
+        conf.application_name("apply_config");
 
-        let (client, conn) = match tokio_postgres::connect(connstr.as_str(), NoTls).await {
+        let (client, conn) = match conf.connect(NoTls).await {
+            // If connection fails, it may be the old node with `zenith_admin` superuser.
+            //
+            // In this case we need to connect with old `zenith_admin` name
+            // and create new user. We cannot simply rename connected user,
+            // but we can create a new one and grant it all privileges.
             Err(e) => match e.code() {
                 Some(&SqlState::INVALID_PASSWORD)
                 | Some(&SqlState::INVALID_AUTHORIZATION_SPECIFICATION) => {
-                    // connect with zenith_admin if cloud_admin could not authenticate
+                    // Connect with zenith_admin if cloud_admin could not authenticate
                     info!(
                         "cannot connect to postgres: {}, retrying with `zenith_admin` username",
                         e
                     );
-                    let mut zenith_admin_connstr = connstr.clone();
-
-                    zenith_admin_connstr
-                        .set_username("zenith_admin")
-                        .map_err(|_| anyhow::anyhow!("invalid connstr"))?;
+                    let mut zenith_admin_conf = postgres::config::Config::from(conf.clone());
+                    zenith_admin_conf.user("zenith_admin");
 
                     let mut client =
-                        Client::connect(zenith_admin_connstr.as_str(), NoTls)
+                        zenith_admin_conf.connect(NoTls)
                             .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?;
 
                     // Disable forwarding so that users don't get a cloud_admin role
@@ -853,8 +854,8 @@ impl ComputeNode {
 
                     drop(client);
 
-                    // reconnect with connstring with expected name
-                    tokio_postgres::connect(connstr.as_str(), NoTls).await?
+                    // Reconnect with connstring with expected name
+                    conf.connect(NoTls).await?
                 }
                 _ => return Err(e.into()),
             },
@@ -885,7 +886,7 @@ impl ComputeNode {
     pub fn apply_spec_sql(
         &self,
         spec: Arc<ComputeSpec>,
-        url: Arc<Url>,
+        conf: Arc<tokio_postgres::Config>,
         concurrency: usize,
     ) -> Result<()> {
         let rt = tokio::runtime::Builder::new_multi_thread()
@@ -897,7 +898,7 @@ impl ComputeNode {
 
         rt.block_on(async {
             // Proceed with post-startup configuration. Note, that order of operations is important.
-            let client = Self::get_maintenance_client(&url).await?;
+            let client = Self::get_maintenance_client(&conf).await?;
             let spec = spec.clone();
 
             let databases = get_existing_dbs_async(&client).await?;
@@ -931,7 +932,7 @@ impl ComputeNode {
                 RenameAndDeleteDatabases,
                 CreateAndAlterDatabases,
             ] {
-                debug!("Applying phase {:?}", &phase);
+                info!("Applying phase {:?}", &phase);
                 apply_operations(
                     spec.clone(),
                     ctx.clone(),
@@ -942,6 +943,7 @@ impl ComputeNode {
                 .await?;
             }
 
+            info!("Applying RunInEachDatabase phase");
             let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency));
 
             let db_processes = spec
@@ -955,7 +957,7 @@ impl ComputeNode {
                     let spec = spec.clone();
                     let ctx = ctx.clone();
                     let jwks_roles = jwks_roles.clone();
-                    let mut url = url.as_ref().clone();
+                    let mut conf = conf.as_ref().clone();
                     let concurrency_token = concurrency_token.clone();
                     let db = db.clone();
 
@@ -964,14 +966,14 @@ impl ComputeNode {
                     match &db {
                         DB::SystemDB => {}
                         DB::UserDB(db) => {
-                            url.set_path(db.name.as_str());
+                            conf.dbname(db.name.as_str());
                         }
                     }
 
-                    let url = Arc::new(url);
+                    let conf = Arc::new(conf);
                     let fut = Self::apply_spec_sql_db(
                         spec.clone(),
-                        url,
+                        conf,
                         ctx.clone(),
                         jwks_roles.clone(),
                         concurrency_token.clone(),
@@ -1017,7 +1019,7 @@ impl ComputeNode {
     /// semaphore.  The caller has to make sure the semaphore isn't exhausted.
     async fn apply_spec_sql_db(
         spec: Arc<ComputeSpec>,
-        url: Arc<Url>,
+        conf: Arc<tokio_postgres::Config>,
         ctx: Arc<tokio::sync::RwLock<MutableApplyContext>>,
         jwks_roles: Arc<HashSet<String>>,
         concurrency_token: Arc<tokio::sync::Semaphore>,
@@ -1046,7 +1048,7 @@ impl ComputeNode {
                 // that database.
                 || async {
                     if client_conn.is_none() {
-                        let db_client = Self::get_maintenance_client(&url).await?;
+                        let db_client = Self::get_maintenance_client(&conf).await?;
                         client_conn.replace(db_client);
                     }
                     let client = client_conn.as_ref().unwrap();
@@ -1061,34 +1063,16 @@ impl ComputeNode {
         Ok::<(), anyhow::Error>(())
     }
 
-    /// Do initial configuration of the already started Postgres.
-    #[instrument(skip_all)]
-    pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> {
-        // If connection fails,
-        // it may be the old node with `zenith_admin` superuser.
-        //
-        // In this case we need to connect with old `zenith_admin` name
-        // and create new user. We cannot simply rename connected user,
-        // but we can create a new one and grant it all privileges.
-        let mut url = self.connstr.clone();
-        url.query_pairs_mut()
-            .append_pair("application_name", "apply_config");
-
-        let url = Arc::new(url);
-        let spec = Arc::new(
-            compute_state
-                .pspec
-                .as_ref()
-                .expect("spec must be set")
-                .spec
-                .clone(),
-        );
-
-        // Choose how many concurrent connections to use for applying the spec changes.
-        // If the cluster is not currently Running we don't have to deal with user connections,
+    /// Choose how many concurrent connections to use for applying the spec changes.
+    pub fn max_service_connections(
+        &self,
+        compute_state: &ComputeState,
+        spec: &ComputeSpec,
+    ) -> usize {
+        // If the cluster is in Init state we don't have to deal with user connections,
         // and can thus use all `max_connections` connection slots. However, that's generally not
         // very efficient, so we generally still limit it to a smaller number.
-        let max_concurrent_connections = if compute_state.status != ComputeStatus::Running {
+        if compute_state.status == ComputeStatus::Init {
             // If the settings contain 'max_connections', use that as template
             if let Some(config) = spec.cluster.settings.find("max_connections") {
                 config.parse::<usize>().ok()
@@ -1144,10 +1128,29 @@ impl ComputeNode {
                 .map(|val| if val > 1 { val - 1 } else { 1 })
                 .last()
                 .unwrap_or(3)
-        };
+        }
+    }
+
+    /// Do initial configuration of the already started Postgres.
+    #[instrument(skip_all)]
+    pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> {
+        let mut conf = tokio_postgres::Config::from_str(self.connstr.as_str()).unwrap();
+        conf.application_name("apply_config");
+
+        let conf = Arc::new(conf);
+        let spec = Arc::new(
+            compute_state
+                .pspec
+                .as_ref()
+                .expect("spec must be set")
+                .spec
+                .clone(),
+        );
+
+        let max_concurrent_connections = self.max_service_connections(compute_state, &spec);
 
         // Merge-apply spec & changes to PostgreSQL state.
-        self.apply_spec_sql(spec.clone(), url.clone(), max_concurrent_connections)?;
+        self.apply_spec_sql(spec.clone(), conf.clone(), max_concurrent_connections)?;
 
         if let Some(ref local_proxy) = &spec.clone().local_proxy_config {
             info!("configuring local_proxy");
@@ -1156,12 +1159,11 @@ impl ComputeNode {
 
         // Run migrations separately to not hold up cold starts
         thread::spawn(move || {
-            let mut connstr = url.as_ref().clone();
-            connstr
-                .query_pairs_mut()
-                .append_pair("application_name", "migrations");
+            let conf = conf.as_ref().clone();
+            let mut conf = postgres::config::Config::from(conf);
+            conf.application_name("migrations");
 
-            let mut client = Client::connect(connstr.as_str(), NoTls)?;
+            let mut client = conf.connect(NoTls)?;
             handle_migrations(&mut client).context("apply_config handle_migrations")
         });
 
@@ -1222,21 +1224,28 @@ impl ComputeNode {
         let pgdata_path = Path::new(&self.pgdata);
         let postgresql_conf_path = pgdata_path.join("postgresql.conf");
         config::write_postgres_conf(&postgresql_conf_path, &spec, None)?;
-        // temporarily reset max_cluster_size in config
+
+        // TODO(ololobus): We need a concurrency during reconfiguration as well,
+        // but DB is already running and used by user. We can easily get out of
+        // `max_connections` limit, and the current code won't handle that.
+        // let compute_state = self.state.lock().unwrap().clone();
+        // let max_concurrent_connections = self.max_service_connections(&compute_state, &spec);
+        let max_concurrent_connections = 1;
+
+        // Temporarily reset max_cluster_size in config
         // to avoid the possibility of hitting the limit, while we are reconfiguring:
-        // creating new extensions, roles, etc...
+        // creating new extensions, roles, etc.
         config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || {
             self.pg_reload_conf()?;
 
             if spec.mode == ComputeMode::Primary {
-                let mut url = self.connstr.clone();
-                url.query_pairs_mut()
-                    .append_pair("application_name", "apply_config");
-                let url = Arc::new(url);
+                let mut conf = tokio_postgres::Config::from_str(self.connstr.as_str()).unwrap();
+                conf.application_name("apply_config");
+                let conf = Arc::new(conf);
 
                 let spec = Arc::new(spec.clone());
 
-                self.apply_spec_sql(spec, url, 1)?;
+                self.apply_spec_sql(spec, conf, max_concurrent_connections)?;
             }
 
             Ok(())
@@ -1362,7 +1371,17 @@ impl ComputeNode {
 
             let connstr = self.connstr.clone();
             thread::spawn(move || {
-                get_installed_extensions_sync(connstr).context("get_installed_extensions")
+                let res = get_installed_extensions(&connstr);
+                match res {
+                    Ok(extensions) => {
+                        info!(
+                            "[NEON_EXT_STAT] {}",
+                            serde_json::to_string(&extensions)
+                                .expect("failed to serialize extensions list")
+                        );
+                    }
+                    Err(err) => error!("could not get installed extensions: {err:?}"),
+                }
             });
         }
 
diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs
index 8a047634df..a6c6cff20a 100644
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -296,7 +296,12 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
             }
 
             let connstr = compute.connstr.clone();
-            let res = crate::installed_extensions::get_installed_extensions(connstr).await;
+            let res = task::spawn_blocking(move || {
+                installed_extensions::get_installed_extensions(&connstr)
+            })
+            .await
+            .unwrap();
+
             match res {
                 Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())),
                 Err(e) => render_json_error(
diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs
index 79d8b2ca04..f473c29a55 100644
--- a/compute_tools/src/installed_extensions.rs
+++ b/compute_tools/src/installed_extensions.rs
@@ -2,17 +2,16 @@ use compute_api::responses::{InstalledExtension, InstalledExtensions};
 use metrics::proto::MetricFamily;
 use std::collections::HashMap;
 use std::collections::HashSet;
-use tracing::info;
-use url::Url;
 
 use anyhow::Result;
 use postgres::{Client, NoTls};
-use tokio::task;
 
 use metrics::core::Collector;
 use metrics::{register_uint_gauge_vec, UIntGaugeVec};
 use once_cell::sync::Lazy;
 
+use crate::pg_helpers::postgres_conf_for_db;
+
 /// We don't reuse get_existing_dbs() just for code clarity
 /// and to make database listing query here more explicit.
 ///
@@ -42,75 +41,51 @@ fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
 ///
 /// Same extension can be installed in multiple databases with different versions,
 /// we only keep the highest and lowest version across all databases.
-pub async fn get_installed_extensions(connstr: Url) -> Result<InstalledExtensions> {
-    let mut connstr = connstr.clone();
+pub fn get_installed_extensions(connstr: &url::Url) -> Result<InstalledExtensions> {
+    let mut client = Client::connect(connstr.as_str(), NoTls)?;
+    let databases: Vec<String> = list_dbs(&mut client)?;
 
-    task::spawn_blocking(move || {
-        let mut client = Client::connect(connstr.as_str(), NoTls)?;
-        let databases: Vec<String> = list_dbs(&mut client)?;
+    let mut extensions_map: HashMap<String, InstalledExtension> = HashMap::new();
+    for db in databases.iter() {
+        let config = postgres_conf_for_db(connstr, db)?;
+        let mut db_client = config.connect(NoTls)?;
+        let extensions: Vec<(String, String)> = db_client
+            .query(
+                "SELECT extname, extversion FROM pg_catalog.pg_extension;",
+                &[],
+            )?
+            .iter()
+            .map(|row| (row.get("extname"), row.get("extversion")))
+            .collect();
 
-        let mut extensions_map: HashMap<String, InstalledExtension> = HashMap::new();
-        for db in databases.iter() {
-            connstr.set_path(db);
-            let mut db_client = Client::connect(connstr.as_str(), NoTls)?;
-            let extensions: Vec<(String, String)> = db_client
-                .query(
-                    "SELECT extname, extversion FROM pg_catalog.pg_extension;",
-                    &[],
-                )?
-                .iter()
-                .map(|row| (row.get("extname"), row.get("extversion")))
-                .collect();
+        for (extname, v) in extensions.iter() {
+            let version = v.to_string();
 
-            for (extname, v) in extensions.iter() {
-                let version = v.to_string();
+            // increment the number of databases where the version of extension is installed
+            INSTALLED_EXTENSIONS
+                .with_label_values(&[extname, &version])
+                .inc();
 
-                // increment the number of databases where the version of extension is installed
-                INSTALLED_EXTENSIONS
-                    .with_label_values(&[extname, &version])
-                    .inc();
-
-                extensions_map
-                    .entry(extname.to_string())
-                    .and_modify(|e| {
-                        e.versions.insert(version.clone());
-                        // count the number of databases where the extension is installed
-                        e.n_databases += 1;
-                    })
-                    .or_insert(InstalledExtension {
-                        extname: extname.to_string(),
-                        versions: HashSet::from([version.clone()]),
-                        n_databases: 1,
-                    });
-            }
+            extensions_map
+                .entry(extname.to_string())
+                .and_modify(|e| {
+                    e.versions.insert(version.clone());
+                    // count the number of databases where the extension is installed
+                    e.n_databases += 1;
+                })
+                .or_insert(InstalledExtension {
+                    extname: extname.to_string(),
+                    versions: HashSet::from([version.clone()]),
+                    n_databases: 1,
+                });
         }
+    }
 
-        let res = InstalledExtensions {
-            extensions: extensions_map.values().cloned().collect(),
-        };
+    let res = InstalledExtensions {
+        extensions: extensions_map.values().cloned().collect(),
+    };
 
-        Ok(res)
-    })
-    .await?
-}
-
-// Gather info about installed extensions
-pub fn get_installed_extensions_sync(connstr: Url) -> Result<()> {
-    let rt = tokio::runtime::Builder::new_current_thread()
-        .enable_all()
-        .build()
-        .expect("failed to create runtime");
-    let result = rt
-        .block_on(crate::installed_extensions::get_installed_extensions(
-            connstr,
-        ))
-        .expect("failed to get installed extensions");
-
-    info!(
-        "[NEON_EXT_STAT] {}",
-        serde_json::to_string(&result).expect("failed to serialize extensions list")
-    );
-    Ok(())
+    Ok(res)
 }
 
 static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs
index 4a1e5ee0e8..e03b410699 100644
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -6,6 +6,7 @@ use std::io::{BufRead, BufReader};
 use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
 use std::process::Child;
+use std::str::FromStr;
 use std::thread::JoinHandle;
 use std::time::{Duration, Instant};
 
@@ -13,8 +14,10 @@ use anyhow::{bail, Result};
 use futures::StreamExt;
 use ini::Ini;
 use notify::{RecursiveMode, Watcher};
+use postgres::config::Config;
 use tokio::io::AsyncBufReadExt;
 use tokio::time::timeout;
+use tokio_postgres;
 use tokio_postgres::NoTls;
 use tracing::{debug, error, info, instrument};
 
@@ -542,3 +545,11 @@ async fn handle_postgres_logs_async(stderr: tokio::process::ChildStderr) -> Resu
 
     Ok(())
 }
+
+/// `Postgres::config::Config` handles database names with whitespaces
+/// and special characters properly.
+pub fn postgres_conf_for_db(connstr: &url::Url, dbname: &str) -> Result<Config> {
+    let mut conf = Config::from_str(connstr.as_str())?;
+    conf.dbname(dbname);
+    Ok(conf)
+}
diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py
index db3723b7cc..1cd9158c68 100644
--- a/test_runner/fixtures/endpoint/http.py
+++ b/test_runner/fixtures/endpoint/http.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+import urllib.parse
+
 import requests
 from requests.adapters import HTTPAdapter
 
@@ -20,7 +22,9 @@ class EndpointHttpClient(requests.Session):
         return res.json()
 
     def database_schema(self, database: str):
-        res = self.get(f"http://localhost:{self.port}/database_schema?database={database}")
+        res = self.get(
+            f"http://localhost:{self.port}/database_schema?database={urllib.parse.quote(database, safe='')}"
+        )
         res.raise_for_status()
         return res.text
 
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index a45a311dc2..1f4d2aa5ec 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -3934,6 +3934,35 @@ class Endpoint(PgProtocol, LogUtils):
             log.info(json.dumps(dict(data_dict, **kwargs)))
             json.dump(dict(data_dict, **kwargs), file, indent=4)
 
+    def respec_deep(self, **kwargs: Any) -> None:
+        """
+        Update the endpoint.json file taking into account nested keys.
+        It does one level deep update. Should enough for most cases.
+        Distinct method from respec() to do not break existing functionality.
+        NOTE: This method also updates the spec.json file, not endpoint.json.
+        We need it because neon_local also writes to spec.json, so intended
+        use-case is i) start endpoint with some config, ii) respec_deep(),
+        iii) call reconfigure() to apply the changes.
+        """
+        config_path = os.path.join(self.endpoint_path(), "spec.json")
+        with open(config_path) as f:
+            data_dict: dict[str, Any] = json.load(f)
+
+        log.info("Current compute spec: %s", json.dumps(data_dict, indent=4))
+
+        for key, value in kwargs.items():
+            if isinstance(value, dict):
+                if key not in data_dict:
+                    data_dict[key] = value
+                else:
+                    data_dict[key] = {**data_dict[key], **value}
+            else:
+                data_dict[key] = value
+
+        with open(config_path, "w") as file:
+            log.info("Updating compute spec to: %s", json.dumps(data_dict, indent=4))
+            json.dump(data_dict, file, indent=4)
+
     # Please note: Migrations only run if pg_skip_catalog_updates is false
     def wait_for_migrations(self, num_migrations: int = 11):
         with self.cursor() as cur:
diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py
index d43c71ceac..b3719a45ed 100644
--- a/test_runner/regress/test_compute_catalog.py
+++ b/test_runner/regress/test_compute_catalog.py
@@ -3,13 +3,60 @@ from __future__ import annotations
 import requests
 from fixtures.neon_fixtures import NeonEnv
 
+TEST_DB_NAMES = [
+    {
+        "name": "neondb",
+        "owner": "cloud_admin",
+    },
+    {
+        "name": "db with spaces",
+        "owner": "cloud_admin",
+    },
+    {
+        "name": "db with%20spaces ",
+        "owner": "cloud_admin",
+    },
+    {
+        "name": "db with whitespaces	",
+        "owner": "cloud_admin",
+    },
+    {
+        "name": "injective db with spaces'; SELECT pg_sleep(10);",
+        "owner": "cloud_admin",
+    },
+    {
+        "name": "db with #pound-sign and &ampersands=true",
+        "owner": "cloud_admin",
+    },
+    {
+        "name": "db with emoji 🌍",
+        "owner": "cloud_admin",
+    },
+]
+
 
 def test_compute_catalog(neon_simple_env: NeonEnv):
+    """
+    Create a bunch of databases with tricky names and test that we can list them
+    and dump via API.
+    """
     env = neon_simple_env
 
-    endpoint = env.endpoints.create_start("main", config_lines=["log_min_messages=debug1"])
-    client = endpoint.http_client()
+    endpoint = env.endpoints.create_start("main")
 
+    # Update the spec.json file to include new databases
+    # and reconfigure the endpoint to create some test databases.
+    endpoint.respec_deep(
+        **{
+            "skip_pg_catalog_updates": False,
+            "cluster": {
+                "databases": TEST_DB_NAMES,
+            },
+        }
+    )
+    endpoint.reconfigure()
+
+    client = endpoint.http_client()
     objects = client.dbs_and_roles()
 
     # Assert that 'cloud_admin' role exists in the 'roles' list
@@ -22,9 +69,24 @@ def test_compute_catalog(neon_simple_env: NeonEnv):
         db["name"] == "postgres" for db in objects["databases"]
     ), "The 'postgres' database is missing"
 
-    ddl = client.database_schema(database="postgres")
+    # Check other databases
+    for test_db in TEST_DB_NAMES:
+        db = next((db for db in objects["databases"] if db["name"] == test_db["name"]), None)
+        assert db is not None, f"The '{test_db['name']}' database is missing"
+        assert (
+            db["owner"] == test_db["owner"]
+        ), f"The '{test_db['name']}' database has incorrect owner"
 
-    assert "-- PostgreSQL database dump" in ddl
+        ddl = client.database_schema(database=test_db["name"])
+
+        # Check that it looks like a valid PostgreSQL dump
+        assert "-- PostgreSQL database dump" in ddl
+
+        # Check that it doesn't contain health_check and migration traces.
+        # They are only created in system `postgres` database, so by checking
+        # that we ensure that we dump right databases.
+        assert "health_check" not in ddl, f"The '{test_db['name']}' database contains health_check"
+        assert "migration" not in ddl, f"The '{test_db['name']}' database contains migrations data"
 
     try:
         client.database_schema(database="nonexistentdb")
@@ -33,3 +95,44 @@ def test_compute_catalog(neon_simple_env: NeonEnv):
         assert (
             e.response.status_code == 404
         ), f"Expected 404 status code, but got {e.response.status_code}"
+
+
+def test_compute_create_databases(neon_simple_env: NeonEnv):
+    """
+    Test that compute_ctl can create and work with databases with special
+    characters (whitespaces, %, tabs, etc.) in the name.
+    """
+    env = neon_simple_env
+
+    # Create and start endpoint so that neon_local put all the generated
+    # stuff into the spec.json file.
+    endpoint = env.endpoints.create_start("main")
+
+    # Update the spec.json file to include new databases
+    # and reconfigure the endpoint to apply the changes.
+    endpoint.respec_deep(
+        **{
+            "skip_pg_catalog_updates": False,
+            "cluster": {
+                "databases": TEST_DB_NAMES,
+            },
+        }
+    )
+    endpoint.reconfigure()
+
+    for db in TEST_DB_NAMES:
+        # Check that database has a correct name in the system catalog
+        with endpoint.cursor() as cursor:
+            cursor.execute("SELECT datname FROM pg_database WHERE datname = %s", (db["name"],))
+            catalog_db = cursor.fetchone()
+            assert catalog_db is not None
+            assert len(catalog_db) == 1
+            assert catalog_db[0] == db["name"]
+
+        # Check that we can connect to this database without any issues
+        with endpoint.cursor(dbname=db["name"]) as cursor:
+            cursor.execute("SELECT * FROM current_database()")
+            curr_db = cursor.fetchone()
+            assert curr_db is not None
+            assert len(curr_db) == 1
+            assert curr_db[0] == db["name"]

From 3ffe6de0b9a4f49cf18f6a2ebf0fc2c6274dfccd Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Fri, 29 Nov 2024 10:40:08 +0100
Subject: [PATCH 008/117] test_runner/performance: add logical message ingest
 benchmark (#9749)

Adds a benchmark for logical message WAL ingestion throughput
end-to-end. Logical messages are essentially noops, and thus ignored by
the Pageserver.

Example results from my MacBook, with fsync enabled:

```
postgres_ingest: 14.445 s
safekeeper_ingest: 29.948 s
pageserver_ingest: 30.013 s
pageserver_recover_ingest: 8.633 s
wal_written: 10,340 MB
message_count: 1310720 messages
postgres_throughput: 715 MB/s
safekeeper_throughput: 345 MB/s
pageserver_throughput: 344 MB/s
pageserver_recover_throughput: 1197 MB/s
```

See
https://github.com/neondatabase/neon/issues/9642#issuecomment-2475995205
for running analysis.

Touches #9642.
---
 test_runner/fixtures/neon_fixtures.py         |  31 ++++++
 .../test_ingest_logical_message.py            | 101 ++++++++++++++++++
 2 files changed, 132 insertions(+)
 create mode 100644 test_runner/performance/test_ingest_logical_message.py

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 1f4d2aa5ec..e3c88e9965 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -4404,6 +4404,10 @@ class Safekeeper(LogUtils):
         log.info(f"sk {self.id} flush LSN: {flush_lsn}")
         return flush_lsn
 
+    def get_commit_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn:
+        timeline_status = self.http_client().timeline_status(tenant_id, timeline_id)
+        return timeline_status.commit_lsn
+
     def pull_timeline(
         self, srcs: list[Safekeeper], tenant_id: TenantId, timeline_id: TimelineId
     ) -> dict[str, Any]:
@@ -4949,6 +4953,33 @@ def wait_for_last_flush_lsn(
     return min(results)
 
 
+def wait_for_commit_lsn(
+    env: NeonEnv,
+    tenant: TenantId,
+    timeline: TimelineId,
+    lsn: Lsn,
+) -> Lsn:
+    # TODO: it would be better to poll this in the compute, but there's no API for it. See:
+    # https://github.com/neondatabase/neon/issues/9758
+    "Wait for the given LSN to be committed on any Safekeeper"
+
+    max_commit_lsn = Lsn(0)
+    for i in range(1000):
+        for sk in env.safekeepers:
+            commit_lsn = sk.get_commit_lsn(tenant, timeline)
+            if commit_lsn >= lsn:
+                log.info(f"{tenant}/{timeline} at commit_lsn {commit_lsn}")
+                return commit_lsn
+            max_commit_lsn = max(max_commit_lsn, commit_lsn)
+
+        if i % 10 == 0:
+            log.info(
+                f"{tenant}/{timeline} waiting for commit_lsn to reach {lsn}, now {max_commit_lsn}"
+            )
+        time.sleep(0.1)
+    raise Exception(f"timed out while waiting for commit_lsn to reach {lsn}, was {max_commit_lsn}")
+
+
 def flush_ep_to_pageserver(
     env: NeonEnv,
     ep: Endpoint,
diff --git a/test_runner/performance/test_ingest_logical_message.py b/test_runner/performance/test_ingest_logical_message.py
new file mode 100644
index 0000000000..d3118eb15a
--- /dev/null
+++ b/test_runner/performance/test_ingest_logical_message.py
@@ -0,0 +1,101 @@
+from __future__ import annotations
+
+import pytest
+from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
+from fixtures.common_types import Lsn
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnvBuilder,
+    wait_for_commit_lsn,
+    wait_for_last_flush_lsn,
+)
+from fixtures.pageserver.utils import wait_for_last_record_lsn
+
+
+@pytest.mark.timeout(600)
+@pytest.mark.parametrize("size", [1024, 8192, 131072])
+@pytest.mark.parametrize("fsync", [True, False], ids=["fsync", "nofsync"])
+def test_ingest_logical_message(
+    request: pytest.FixtureRequest,
+    neon_env_builder: NeonEnvBuilder,
+    zenbenchmark: NeonBenchmarker,
+    fsync: bool,
+    size: int,
+):
+    """
+    Benchmarks ingestion of 10 GB of logical message WAL. These are essentially noops, and don't
+    incur any pageserver writes.
+    """
+
+    VOLUME = 10 * 1024**3
+    count = VOLUME // size
+
+    neon_env_builder.safekeepers_enable_fsync = fsync
+
+    env = neon_env_builder.init_start()
+    endpoint = env.endpoints.create_start(
+        "main",
+        config_lines=[
+            f"fsync = {fsync}",
+            # Disable backpressure. We don't want to block on pageserver.
+            "max_replication_apply_lag = 0",
+            "max_replication_flush_lag = 0",
+            "max_replication_write_lag = 0",
+        ],
+    )
+    client = env.pageserver.http_client()
+
+    # Wait for the timeline to be propagated to the pageserver.
+    wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
+
+    # Ingest data and measure durations.
+    start_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0])
+
+    with endpoint.cursor() as cur:
+        cur.execute("set statement_timeout = 0")
+
+        # Postgres will return once the logical messages have been written to its local WAL, without
+        # waiting for Safekeeper commit. We measure ingestion time both for Postgres, Safekeeper,
+        # and Pageserver to detect bottlenecks.
+        log.info("Ingesting data")
+        with zenbenchmark.record_duration("pageserver_ingest"):
+            with zenbenchmark.record_duration("safekeeper_ingest"):
+                with zenbenchmark.record_duration("postgres_ingest"):
+                    cur.execute(f"""
+                        select pg_logical_emit_message(false, '', repeat('x', {size}))
+                        from generate_series(1, {count})
+                    """)
+
+                    end_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0])
+
+                # Wait for Safekeeper.
+                log.info("Waiting for Safekeeper to catch up")
+                wait_for_commit_lsn(env, env.initial_tenant, env.initial_timeline, end_lsn)
+
+            # Wait for Pageserver.
+            log.info("Waiting for Pageserver to catch up")
+            wait_for_last_record_lsn(client, env.initial_tenant, env.initial_timeline, end_lsn)
+
+    # Now that all data is ingested, delete and recreate the tenant in the pageserver. This will
+    # reingest all the WAL from the safekeeper without any other constraints. This gives us a
+    # baseline of how fast the pageserver can ingest this WAL in isolation.
+    status = env.storage_controller.inspect(tenant_shard_id=env.initial_tenant)
+    assert status is not None
+
+    client.tenant_delete(env.initial_tenant)
+    env.pageserver.tenant_create(tenant_id=env.initial_tenant, generation=status[0])
+
+    with zenbenchmark.record_duration("pageserver_recover_ingest"):
+        log.info("Recovering WAL into pageserver")
+        client.timeline_create(env.pg_version, env.initial_tenant, env.initial_timeline)
+        wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
+
+    # Emit metrics.
+    wal_written_mb = round((end_lsn - start_lsn) / (1024 * 1024))
+    zenbenchmark.record("wal_written", wal_written_mb, "MB", MetricReport.TEST_PARAM)
+    zenbenchmark.record("message_count", count, "messages", MetricReport.TEST_PARAM)
+
+    props = {p["name"]: p["value"] for _, p in request.node.user_properties}
+    for name in ("postgres", "safekeeper", "pageserver", "pageserver_recover"):
+        throughput = int(wal_written_mb / props[f"{name}_ingest"])
+        zenbenchmark.record(f"{name}_throughput", throughput, "MB/s", MetricReport.HIGHER_IS_BETTER)

From 1d642d6a57dd1cd1645a34aba5a2dd6e06a6c651 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Fri, 29 Nov 2024 11:08:01 +0000
Subject: [PATCH 009/117] chore(proxy): vendor a subset of rust-postgres
 (#9930)

Our rust-postgres fork is getting messy. Mostly because proxy wants more
control over the raw protocol than tokio-postgres provides. As such,
it's diverging more and more. Storage and compute also make use of
rust-postgres, but in more normal usage, thus they don't need our crazy
changes.

Idea:
* proxy maintains their subset
* other teams use a minimal patch set against upstream rust-postgres

Reviewing this code will be difficult. To implement it, I
1. Copied tokio-postgres, postgres-protocol and postgres-types from
https://github.com/neondatabase/rust-postgres/tree/00940fcdb57a8e99e805297b75839e7c4c7b1796
2. Updated their package names with the `2` suffix to make them compile
in the workspace.
3. Updated proxy to use those packages
4. Copied in the code from tokio-postgres-rustls 0.13 (with some patches
applied https://github.com/jbg/tokio-postgres-rustls/pull/32
https://github.com/jbg/tokio-postgres-rustls/pull/33)
5. Removed as much dead code as I could find in the vendored libraries
6. Updated the tokio-postgres-rustls code to use our existing channel
binding implementation
---
 .config/hakari.toml                           |    3 +
 Cargo.lock                                    |   56 +-
 Cargo.toml                                    |    3 +
 libs/proxy/README.md                          |    6 +
 libs/proxy/postgres-protocol2/Cargo.toml      |   21 +
 .../src/authentication/mod.rs                 |   37 +
 .../src/authentication/sasl.rs                |  516 +++++
 .../postgres-protocol2/src/escape/mod.rs      |   93 +
 .../postgres-protocol2/src/escape/test.rs     |   17 +
 libs/proxy/postgres-protocol2/src/lib.rs      |   78 +
 .../postgres-protocol2/src/message/backend.rs |  766 ++++++++
 .../src/message/frontend.rs                   |  297 +++
 .../postgres-protocol2/src/message/mod.rs     |    8 +
 .../postgres-protocol2/src/password/mod.rs    |  107 ++
 .../postgres-protocol2/src/password/test.rs   |   19 +
 .../proxy/postgres-protocol2/src/types/mod.rs |  294 +++
 .../postgres-protocol2/src/types/test.rs      |   87 +
 libs/proxy/postgres-types2/Cargo.toml         |   10 +
 libs/proxy/postgres-types2/src/lib.rs         |  477 +++++
 libs/proxy/postgres-types2/src/private.rs     |   34 +
 libs/proxy/postgres-types2/src/type_gen.rs    | 1524 +++++++++++++++
 libs/proxy/tokio-postgres2/Cargo.toml         |   21 +
 .../proxy/tokio-postgres2/src/cancel_query.rs |   40 +
 .../tokio-postgres2/src/cancel_query_raw.rs   |   29 +
 .../proxy/tokio-postgres2/src/cancel_token.rs |   62 +
 libs/proxy/tokio-postgres2/src/client.rs      |  439 +++++
 libs/proxy/tokio-postgres2/src/codec.rs       |  109 ++
 libs/proxy/tokio-postgres2/src/config.rs      |  897 +++++++++
 libs/proxy/tokio-postgres2/src/connect.rs     |  112 ++
 libs/proxy/tokio-postgres2/src/connect_raw.rs |  359 ++++
 .../tokio-postgres2/src/connect_socket.rs     |   65 +
 libs/proxy/tokio-postgres2/src/connect_tls.rs |   48 +
 libs/proxy/tokio-postgres2/src/connection.rs  |  323 ++++
 libs/proxy/tokio-postgres2/src/error/mod.rs   |  501 +++++
 .../tokio-postgres2/src/error/sqlstate.rs     | 1670 +++++++++++++++++
 .../tokio-postgres2/src/generic_client.rs     |   64 +
 libs/proxy/tokio-postgres2/src/lib.rs         |  148 ++
 .../tokio-postgres2/src/maybe_tls_stream.rs   |   77 +
 libs/proxy/tokio-postgres2/src/prepare.rs     |  262 +++
 libs/proxy/tokio-postgres2/src/query.rs       |  340 ++++
 libs/proxy/tokio-postgres2/src/row.rs         |  300 +++
 .../proxy/tokio-postgres2/src/simple_query.rs |  142 ++
 libs/proxy/tokio-postgres2/src/statement.rs   |  157 ++
 libs/proxy/tokio-postgres2/src/tls.rs         |  162 ++
 .../proxy/tokio-postgres2/src/to_statement.rs |   57 +
 libs/proxy/tokio-postgres2/src/transaction.rs |   74 +
 .../src/transaction_builder.rs                |  113 ++
 libs/proxy/tokio-postgres2/src/types.rs       |    6 +
 proxy/Cargo.toml                              |    6 +-
 proxy/src/compute.rs                          |    5 +-
 proxy/src/context/mod.rs                      |    1 +
 proxy/src/lib.rs                              |    1 +
 proxy/src/postgres_rustls/mod.rs              |  158 ++
 proxy/src/proxy/tests/mod.rs                  |    2 +-
 proxy/src/serverless/backend.rs               |    2 +-
 proxy/src/serverless/conn_pool.rs             |    5 +-
 proxy/src/serverless/local_conn_pool.rs       |   11 +-
 workspace_hack/Cargo.toml                     |    4 +-
 58 files changed, 11199 insertions(+), 26 deletions(-)
 create mode 100644 libs/proxy/README.md
 create mode 100644 libs/proxy/postgres-protocol2/Cargo.toml
 create mode 100644 libs/proxy/postgres-protocol2/src/authentication/mod.rs
 create mode 100644 libs/proxy/postgres-protocol2/src/authentication/sasl.rs
 create mode 100644 libs/proxy/postgres-protocol2/src/escape/mod.rs
 create mode 100644 libs/proxy/postgres-protocol2/src/escape/test.rs
 create mode 100644 libs/proxy/postgres-protocol2/src/lib.rs
 create mode 100644 libs/proxy/postgres-protocol2/src/message/backend.rs
 create mode 100644 libs/proxy/postgres-protocol2/src/message/frontend.rs
 create mode 100644 libs/proxy/postgres-protocol2/src/message/mod.rs
 create mode 100644 libs/proxy/postgres-protocol2/src/password/mod.rs
 create mode 100644 libs/proxy/postgres-protocol2/src/password/test.rs
 create mode 100644 libs/proxy/postgres-protocol2/src/types/mod.rs
 create mode 100644 libs/proxy/postgres-protocol2/src/types/test.rs
 create mode 100644 libs/proxy/postgres-types2/Cargo.toml
 create mode 100644 libs/proxy/postgres-types2/src/lib.rs
 create mode 100644 libs/proxy/postgres-types2/src/private.rs
 create mode 100644 libs/proxy/postgres-types2/src/type_gen.rs
 create mode 100644 libs/proxy/tokio-postgres2/Cargo.toml
 create mode 100644 libs/proxy/tokio-postgres2/src/cancel_query.rs
 create mode 100644 libs/proxy/tokio-postgres2/src/cancel_query_raw.rs
 create mode 100644 libs/proxy/tokio-postgres2/src/cancel_token.rs
 create mode 100644 libs/proxy/tokio-postgres2/src/client.rs
 create mode 100644 libs/proxy/tokio-postgres2/src/codec.rs
 create mode 100644 libs/proxy/tokio-postgres2/src/config.rs
 create mode 100644 libs/proxy/tokio-postgres2/src/connect.rs
 create mode 100644 libs/proxy/tokio-postgres2/src/connect_raw.rs
 create mode 100644 libs/proxy/tokio-postgres2/src/connect_socket.rs
 create mode 100644 libs/proxy/tokio-postgres2/src/connect_tls.rs
 create mode 100644 libs/proxy/tokio-postgres2/src/connection.rs
 create mode 100644 libs/proxy/tokio-postgres2/src/error/mod.rs
 create mode 100644 libs/proxy/tokio-postgres2/src/error/sqlstate.rs
 create mode 100644 libs/proxy/tokio-postgres2/src/generic_client.rs
 create mode 100644 libs/proxy/tokio-postgres2/src/lib.rs
 create mode 100644 libs/proxy/tokio-postgres2/src/maybe_tls_stream.rs
 create mode 100644 libs/proxy/tokio-postgres2/src/prepare.rs
 create mode 100644 libs/proxy/tokio-postgres2/src/query.rs
 create mode 100644 libs/proxy/tokio-postgres2/src/row.rs
 create mode 100644 libs/proxy/tokio-postgres2/src/simple_query.rs
 create mode 100644 libs/proxy/tokio-postgres2/src/statement.rs
 create mode 100644 libs/proxy/tokio-postgres2/src/tls.rs
 create mode 100644 libs/proxy/tokio-postgres2/src/to_statement.rs
 create mode 100644 libs/proxy/tokio-postgres2/src/transaction.rs
 create mode 100644 libs/proxy/tokio-postgres2/src/transaction_builder.rs
 create mode 100644 libs/proxy/tokio-postgres2/src/types.rs
 create mode 100644 proxy/src/postgres_rustls/mod.rs

diff --git a/.config/hakari.toml b/.config/hakari.toml
index b5990d090e..3b6d9d8822 100644
--- a/.config/hakari.toml
+++ b/.config/hakari.toml
@@ -46,6 +46,9 @@ workspace-members = [
     "utils",
     "wal_craft",
     "walproposer",
+    "postgres-protocol2",
+    "postgres-types2",
+    "tokio-postgres2",
 ]
 
 # Write out exact versions rather than a semver range. (Defaults to false.)
diff --git a/Cargo.lock b/Cargo.lock
index 43a46fb1eb..f05c6311dd 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4162,6 +4162,23 @@ dependencies = [
  "tokio",
 ]
 
+[[package]]
+name = "postgres-protocol2"
+version = "0.1.0"
+dependencies = [
+ "base64 0.20.0",
+ "byteorder",
+ "bytes",
+ "fallible-iterator",
+ "hmac",
+ "md-5",
+ "memchr",
+ "rand 0.8.5",
+ "sha2",
+ "stringprep",
+ "tokio",
+]
+
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
@@ -4170,8 +4187,15 @@ dependencies = [
  "bytes",
  "fallible-iterator",
  "postgres-protocol",
- "serde",
- "serde_json",
+]
+
+[[package]]
+name = "postgres-types2"
+version = "0.1.0"
+dependencies = [
+ "bytes",
+ "fallible-iterator",
+ "postgres-protocol2",
 ]
 
 [[package]]
@@ -4501,7 +4525,7 @@ dependencies = [
  "parquet_derive",
  "pbkdf2",
  "pin-project-lite",
- "postgres-protocol",
+ "postgres-protocol2",
  "postgres_backend",
  "pq_proto",
  "prometheus",
@@ -4536,8 +4560,7 @@ dependencies = [
  "tikv-jemalloc-ctl",
  "tikv-jemallocator",
  "tokio",
- "tokio-postgres",
- "tokio-postgres-rustls",
+ "tokio-postgres2",
  "tokio-rustls 0.26.0",
  "tokio-tungstenite",
  "tokio-util",
@@ -6421,6 +6444,7 @@ dependencies = [
  "libc",
  "mio",
  "num_cpus",
+ "parking_lot 0.12.1",
  "pin-project-lite",
  "signal-hook-registry",
  "socket2",
@@ -6502,6 +6526,26 @@ dependencies = [
  "x509-certificate",
 ]
 
+[[package]]
+name = "tokio-postgres2"
+version = "0.1.0"
+dependencies = [
+ "async-trait",
+ "byteorder",
+ "bytes",
+ "fallible-iterator",
+ "futures-util",
+ "log",
+ "parking_lot 0.12.1",
+ "percent-encoding",
+ "phf",
+ "pin-project-lite",
+ "postgres-protocol2",
+ "postgres-types2",
+ "tokio",
+ "tokio-util",
+]
+
 [[package]]
 name = "tokio-rustls"
 version = "0.24.0"
@@ -7597,7 +7641,6 @@ dependencies = [
  "num-traits",
  "once_cell",
  "parquet",
- "postgres-types",
  "prettyplease",
  "proc-macro2",
  "prost",
@@ -7622,7 +7665,6 @@ dependencies = [
  "time",
  "time-macros",
  "tokio",
- "tokio-postgres",
  "tokio-rustls 0.26.0",
  "tokio-stream",
  "tokio-util",
diff --git a/Cargo.toml b/Cargo.toml
index e3dc5b97f8..742201d0f5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -35,6 +35,9 @@ members = [
     "libs/walproposer",
     "libs/wal_decoder",
     "libs/postgres_initdb",
+    "libs/proxy/postgres-protocol2",
+    "libs/proxy/postgres-types2",
+    "libs/proxy/tokio-postgres2",
 ]
 
 [workspace.package]
diff --git a/libs/proxy/README.md b/libs/proxy/README.md
new file mode 100644
index 0000000000..2ae6210e46
--- /dev/null
+++ b/libs/proxy/README.md
@@ -0,0 +1,6 @@
+This directory contains libraries that are specific for proxy.
+
+Currently, it contains a signficant fork/refactoring of rust-postgres that no longer reflects the API
+of the original library. Since it was so significant, it made sense to upgrade it to it's own set of libraries.
+
+Proxy needs unique access to the protocol, which explains why such heavy modifications were necessary.
diff --git a/libs/proxy/postgres-protocol2/Cargo.toml b/libs/proxy/postgres-protocol2/Cargo.toml
new file mode 100644
index 0000000000..284a632954
--- /dev/null
+++ b/libs/proxy/postgres-protocol2/Cargo.toml
@@ -0,0 +1,21 @@
+[package]
+name = "postgres-protocol2"
+version = "0.1.0"
+edition = "2018"
+license = "MIT/Apache-2.0"
+
+[dependencies]
+base64 = "0.20"
+byteorder.workspace = true
+bytes.workspace = true
+fallible-iterator.workspace = true
+hmac.workspace = true
+md-5 = "0.10"
+memchr = "2.0"
+rand.workspace = true
+sha2.workspace = true
+stringprep = "0.1"
+tokio = { workspace = true, features = ["rt"] }
+
+[dev-dependencies]
+tokio = { workspace = true, features = ["full"] }
diff --git a/libs/proxy/postgres-protocol2/src/authentication/mod.rs b/libs/proxy/postgres-protocol2/src/authentication/mod.rs
new file mode 100644
index 0000000000..71afa4b9b6
--- /dev/null
+++ b/libs/proxy/postgres-protocol2/src/authentication/mod.rs
@@ -0,0 +1,37 @@
+//! Authentication protocol support.
+use md5::{Digest, Md5};
+
+pub mod sasl;
+
+/// Hashes authentication information in a way suitable for use in response
+/// to an `AuthenticationMd5Password` message.
+///
+/// The resulting string should be sent back to the database in a
+/// `PasswordMessage` message.
+#[inline]
+pub fn md5_hash(username: &[u8], password: &[u8], salt: [u8; 4]) -> String {
+    let mut md5 = Md5::new();
+    md5.update(password);
+    md5.update(username);
+    let output = md5.finalize_reset();
+    md5.update(format!("{:x}", output));
+    md5.update(salt);
+    format!("md5{:x}", md5.finalize())
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn md5() {
+        let username = b"md5_user";
+        let password = b"password";
+        let salt = [0x2a, 0x3d, 0x8f, 0xe0];
+
+        assert_eq!(
+            md5_hash(username, password, salt),
+            "md562af4dd09bbb41884907a838a3233294"
+        );
+    }
+}
diff --git a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs
new file mode 100644
index 0000000000..19aa3c1e9a
--- /dev/null
+++ b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs
@@ -0,0 +1,516 @@
+//! SASL-based authentication support.
+
+use hmac::{Hmac, Mac};
+use rand::{self, Rng};
+use sha2::digest::FixedOutput;
+use sha2::{Digest, Sha256};
+use std::fmt::Write;
+use std::io;
+use std::iter;
+use std::mem;
+use std::str;
+use tokio::task::yield_now;
+
+const NONCE_LENGTH: usize = 24;
+
+/// The identifier of the SCRAM-SHA-256 SASL authentication mechanism.
+pub const SCRAM_SHA_256: &str = "SCRAM-SHA-256";
+/// The identifier of the SCRAM-SHA-256-PLUS SASL authentication mechanism.
+pub const SCRAM_SHA_256_PLUS: &str = "SCRAM-SHA-256-PLUS";
+
+// since postgres passwords are not required to exclude saslprep-prohibited
+// characters or even be valid UTF8, we run saslprep if possible and otherwise
+// return the raw password.
+fn normalize(pass: &[u8]) -> Vec<u8> {
+    let pass = match str::from_utf8(pass) {
+        Ok(pass) => pass,
+        Err(_) => return pass.to_vec(),
+    };
+
+    match stringprep::saslprep(pass) {
+        Ok(pass) => pass.into_owned().into_bytes(),
+        Err(_) => pass.as_bytes().to_vec(),
+    }
+}
+
+pub(crate) async fn hi(str: &[u8], salt: &[u8], iterations: u32) -> [u8; 32] {
+    let mut hmac =
+        Hmac::<Sha256>::new_from_slice(str).expect("HMAC is able to accept all key sizes");
+    hmac.update(salt);
+    hmac.update(&[0, 0, 0, 1]);
+    let mut prev = hmac.finalize().into_bytes();
+
+    let mut hi = prev;
+
+    for i in 1..iterations {
+        let mut hmac = Hmac::<Sha256>::new_from_slice(str).expect("already checked above");
+        hmac.update(&prev);
+        prev = hmac.finalize().into_bytes();
+
+        for (hi, prev) in hi.iter_mut().zip(prev) {
+            *hi ^= prev;
+        }
+        // yield every ~250us
+        // hopefully reduces tail latencies
+        if i % 1024 == 0 {
+            yield_now().await
+        }
+    }
+
+    hi.into()
+}
+
+enum ChannelBindingInner {
+    Unrequested,
+    Unsupported,
+    TlsServerEndPoint(Vec<u8>),
+}
+
+/// The channel binding configuration for a SCRAM authentication exchange.
+pub struct ChannelBinding(ChannelBindingInner);
+
+impl ChannelBinding {
+    /// The server did not request channel binding.
+    pub fn unrequested() -> ChannelBinding {
+        ChannelBinding(ChannelBindingInner::Unrequested)
+    }
+
+    /// The server requested channel binding but the client is unable to provide it.
+    pub fn unsupported() -> ChannelBinding {
+        ChannelBinding(ChannelBindingInner::Unsupported)
+    }
+
+    /// The server requested channel binding and the client will use the `tls-server-end-point`
+    /// method.
+    pub fn tls_server_end_point(signature: Vec<u8>) -> ChannelBinding {
+        ChannelBinding(ChannelBindingInner::TlsServerEndPoint(signature))
+    }
+
+    fn gs2_header(&self) -> &'static str {
+        match self.0 {
+            ChannelBindingInner::Unrequested => "y,,",
+            ChannelBindingInner::Unsupported => "n,,",
+            ChannelBindingInner::TlsServerEndPoint(_) => "p=tls-server-end-point,,",
+        }
+    }
+
+    fn cbind_data(&self) -> &[u8] {
+        match self.0 {
+            ChannelBindingInner::Unrequested | ChannelBindingInner::Unsupported => &[],
+            ChannelBindingInner::TlsServerEndPoint(ref buf) => buf,
+        }
+    }
+}
+
+/// A pair of keys for the SCRAM-SHA-256 mechanism.
+/// See <https://datatracker.ietf.org/doc/html/rfc5802#section-3> for details.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct ScramKeys<const N: usize> {
+    /// Used by server to authenticate client.
+    pub client_key: [u8; N],
+    /// Used by client to verify server's signature.
+    pub server_key: [u8; N],
+}
+
+/// Password or keys which were derived from it.
+enum Credentials<const N: usize> {
+    /// A regular password as a vector of bytes.
+    Password(Vec<u8>),
+    /// A precomputed pair of keys.
+    Keys(Box<ScramKeys<N>>),
+}
+
+enum State {
+    Update {
+        nonce: String,
+        password: Credentials<32>,
+        channel_binding: ChannelBinding,
+    },
+    Finish {
+        server_key: [u8; 32],
+        auth_message: String,
+    },
+    Done,
+}
+
+/// A type which handles the client side of the SCRAM-SHA-256/SCRAM-SHA-256-PLUS authentication
+/// process.
+///
+/// During the authentication process, if the backend sends an `AuthenticationSASL` message which
+/// includes `SCRAM-SHA-256` as an authentication mechanism, this type can be used.
+///
+/// After a `ScramSha256` is constructed, the buffer returned by the `message()` method should be
+/// sent to the backend in a `SASLInitialResponse` message along with the mechanism name.
+///
+/// The server will reply with an `AuthenticationSASLContinue` message. Its contents should be
+/// passed to the `update()` method, after which the buffer returned by the `message()` method
+/// should be sent to the backend in a `SASLResponse` message.
+///
+/// The server will reply with an `AuthenticationSASLFinal` message. Its contents should be passed
+/// to the `finish()` method, after which the authentication process is complete.
+pub struct ScramSha256 {
+    message: String,
+    state: State,
+}
+
+fn nonce() -> String {
+    // rand 0.5's ThreadRng is cryptographically secure
+    let mut rng = rand::thread_rng();
+    (0..NONCE_LENGTH)
+        .map(|_| {
+            let mut v = rng.gen_range(0x21u8..0x7e);
+            if v == 0x2c {
+                v = 0x7e
+            }
+            v as char
+        })
+        .collect()
+}
+
+impl ScramSha256 {
+    /// Constructs a new instance which will use the provided password for authentication.
+    pub fn new(password: &[u8], channel_binding: ChannelBinding) -> ScramSha256 {
+        let password = Credentials::Password(normalize(password));
+        ScramSha256::new_inner(password, channel_binding, nonce())
+    }
+
+    /// Constructs a new instance which will use the provided key pair for authentication.
+    pub fn new_with_keys(keys: ScramKeys<32>, channel_binding: ChannelBinding) -> ScramSha256 {
+        let password = Credentials::Keys(keys.into());
+        ScramSha256::new_inner(password, channel_binding, nonce())
+    }
+
+    fn new_inner(
+        password: Credentials<32>,
+        channel_binding: ChannelBinding,
+        nonce: String,
+    ) -> ScramSha256 {
+        ScramSha256 {
+            message: format!("{}n=,r={}", channel_binding.gs2_header(), nonce),
+            state: State::Update {
+                nonce,
+                password,
+                channel_binding,
+            },
+        }
+    }
+
+    /// Returns the message which should be sent to the backend in an `SASLResponse` message.
+    pub fn message(&self) -> &[u8] {
+        if let State::Done = self.state {
+            panic!("invalid SCRAM state");
+        }
+        self.message.as_bytes()
+    }
+
+    /// Updates the state machine with the response from the backend.
+    ///
+    /// This should be called when an `AuthenticationSASLContinue` message is received.
+    pub async fn update(&mut self, message: &[u8]) -> io::Result<()> {
+        let (client_nonce, password, channel_binding) =
+            match mem::replace(&mut self.state, State::Done) {
+                State::Update {
+                    nonce,
+                    password,
+                    channel_binding,
+                } => (nonce, password, channel_binding),
+                _ => return Err(io::Error::new(io::ErrorKind::Other, "invalid SCRAM state")),
+            };
+
+        let message =
+            str::from_utf8(message).map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;
+
+        let parsed = Parser::new(message).server_first_message()?;
+
+        if !parsed.nonce.starts_with(&client_nonce) {
+            return Err(io::Error::new(io::ErrorKind::InvalidInput, "invalid nonce"));
+        }
+
+        let (client_key, server_key) = match password {
+            Credentials::Password(password) => {
+                let salt = match base64::decode(parsed.salt) {
+                    Ok(salt) => salt,
+                    Err(e) => return Err(io::Error::new(io::ErrorKind::InvalidInput, e)),
+                };
+
+                let salted_password = hi(&password, &salt, parsed.iteration_count).await;
+
+                let make_key = |name| {
+                    let mut hmac = Hmac::<Sha256>::new_from_slice(&salted_password)
+                        .expect("HMAC is able to accept all key sizes");
+                    hmac.update(name);
+
+                    let mut key = [0u8; 32];
+                    key.copy_from_slice(hmac.finalize().into_bytes().as_slice());
+                    key
+                };
+
+                (make_key(b"Client Key"), make_key(b"Server Key"))
+            }
+            Credentials::Keys(keys) => (keys.client_key, keys.server_key),
+        };
+
+        let mut hash = Sha256::default();
+        hash.update(client_key);
+        let stored_key = hash.finalize_fixed();
+
+        let mut cbind_input = vec![];
+        cbind_input.extend(channel_binding.gs2_header().as_bytes());
+        cbind_input.extend(channel_binding.cbind_data());
+        let cbind_input = base64::encode(&cbind_input);
+
+        self.message.clear();
+        write!(&mut self.message, "c={},r={}", cbind_input, parsed.nonce).unwrap();
+
+        let auth_message = format!("n=,r={},{},{}", client_nonce, message, self.message);
+
+        let mut hmac = Hmac::<Sha256>::new_from_slice(&stored_key)
+            .expect("HMAC is able to accept all key sizes");
+        hmac.update(auth_message.as_bytes());
+        let client_signature = hmac.finalize().into_bytes();
+
+        let mut client_proof = client_key;
+        for (proof, signature) in client_proof.iter_mut().zip(client_signature) {
+            *proof ^= signature;
+        }
+
+        write!(&mut self.message, ",p={}", base64::encode(client_proof)).unwrap();
+
+        self.state = State::Finish {
+            server_key,
+            auth_message,
+        };
+        Ok(())
+    }
+
+    /// Finalizes the authentication process.
+    ///
+    /// This should be called when the backend sends an `AuthenticationSASLFinal` message.
+    /// Authentication has only succeeded if this method returns `Ok(())`.
+    pub fn finish(&mut self, message: &[u8]) -> io::Result<()> {
+        let (server_key, auth_message) = match mem::replace(&mut self.state, State::Done) {
+            State::Finish {
+                server_key,
+                auth_message,
+            } => (server_key, auth_message),
+            _ => return Err(io::Error::new(io::ErrorKind::Other, "invalid SCRAM state")),
+        };
+
+        let message =
+            str::from_utf8(message).map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;
+
+        let parsed = Parser::new(message).server_final_message()?;
+
+        let verifier = match parsed {
+            ServerFinalMessage::Error(e) => {
+                return Err(io::Error::new(
+                    io::ErrorKind::Other,
+                    format!("SCRAM error: {}", e),
+                ));
+            }
+            ServerFinalMessage::Verifier(verifier) => verifier,
+        };
+
+        let verifier = match base64::decode(verifier) {
+            Ok(verifier) => verifier,
+            Err(e) => return Err(io::Error::new(io::ErrorKind::InvalidInput, e)),
+        };
+
+        let mut hmac = Hmac::<Sha256>::new_from_slice(&server_key)
+            .expect("HMAC is able to accept all key sizes");
+        hmac.update(auth_message.as_bytes());
+        hmac.verify_slice(&verifier)
+            .map_err(|_| io::Error::new(io::ErrorKind::InvalidInput, "SCRAM verification error"))
+    }
+}
+
+struct Parser<'a> {
+    s: &'a str,
+    it: iter::Peekable<str::CharIndices<'a>>,
+}
+
+impl<'a> Parser<'a> {
+    fn new(s: &'a str) -> Parser<'a> {
+        Parser {
+            s,
+            it: s.char_indices().peekable(),
+        }
+    }
+
+    fn eat(&mut self, target: char) -> io::Result<()> {
+        match self.it.next() {
+            Some((_, c)) if c == target => Ok(()),
+            Some((i, c)) => {
+                let m = format!(
+                    "unexpected character at byte {}: expected `{}` but got `{}",
+                    i, target, c
+                );
+                Err(io::Error::new(io::ErrorKind::InvalidInput, m))
+            }
+            None => Err(io::Error::new(
+                io::ErrorKind::UnexpectedEof,
+                "unexpected EOF",
+            )),
+        }
+    }
+
+    fn take_while<F>(&mut self, f: F) -> io::Result<&'a str>
+    where
+        F: Fn(char) -> bool,
+    {
+        let start = match self.it.peek() {
+            Some(&(i, _)) => i,
+            None => return Ok(""),
+        };
+
+        loop {
+            match self.it.peek() {
+                Some(&(_, c)) if f(c) => {
+                    self.it.next();
+                }
+                Some(&(i, _)) => return Ok(&self.s[start..i]),
+                None => return Ok(&self.s[start..]),
+            }
+        }
+    }
+
+    fn printable(&mut self) -> io::Result<&'a str> {
+        self.take_while(|c| matches!(c, '\x21'..='\x2b' | '\x2d'..='\x7e'))
+    }
+
+    fn nonce(&mut self) -> io::Result<&'a str> {
+        self.eat('r')?;
+        self.eat('=')?;
+        self.printable()
+    }
+
+    fn base64(&mut self) -> io::Result<&'a str> {
+        self.take_while(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '/' | '+' | '='))
+    }
+
+    fn salt(&mut self) -> io::Result<&'a str> {
+        self.eat('s')?;
+        self.eat('=')?;
+        self.base64()
+    }
+
+    fn posit_number(&mut self) -> io::Result<u32> {
+        let n = self.take_while(|c| c.is_ascii_digit())?;
+        n.parse()
+            .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))
+    }
+
+    fn iteration_count(&mut self) -> io::Result<u32> {
+        self.eat('i')?;
+        self.eat('=')?;
+        self.posit_number()
+    }
+
+    fn eof(&mut self) -> io::Result<()> {
+        match self.it.peek() {
+            Some(&(i, _)) => Err(io::Error::new(
+                io::ErrorKind::InvalidInput,
+                format!("unexpected trailing data at byte {}", i),
+            )),
+            None => Ok(()),
+        }
+    }
+
+    fn server_first_message(&mut self) -> io::Result<ServerFirstMessage<'a>> {
+        let nonce = self.nonce()?;
+        self.eat(',')?;
+        let salt = self.salt()?;
+        self.eat(',')?;
+        let iteration_count = self.iteration_count()?;
+        self.eof()?;
+
+        Ok(ServerFirstMessage {
+            nonce,
+            salt,
+            iteration_count,
+        })
+    }
+
+    fn value(&mut self) -> io::Result<&'a str> {
+        self.take_while(|c| matches!(c, '\0' | '=' | ','))
+    }
+
+    fn server_error(&mut self) -> io::Result<Option<&'a str>> {
+        match self.it.peek() {
+            Some(&(_, 'e')) => {}
+            _ => return Ok(None),
+        }
+
+        self.eat('e')?;
+        self.eat('=')?;
+        self.value().map(Some)
+    }
+
+    fn verifier(&mut self) -> io::Result<&'a str> {
+        self.eat('v')?;
+        self.eat('=')?;
+        self.base64()
+    }
+
+    fn server_final_message(&mut self) -> io::Result<ServerFinalMessage<'a>> {
+        let message = match self.server_error()? {
+            Some(error) => ServerFinalMessage::Error(error),
+            None => ServerFinalMessage::Verifier(self.verifier()?),
+        };
+        self.eof()?;
+        Ok(message)
+    }
+}
+
+struct ServerFirstMessage<'a> {
+    nonce: &'a str,
+    salt: &'a str,
+    iteration_count: u32,
+}
+
+enum ServerFinalMessage<'a> {
+    Error(&'a str),
+    Verifier(&'a str),
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn parse_server_first_message() {
+        let message = "r=fyko+d2lbbFgONRv9qkxdawL3rfcNHYJY1ZVvWVs7j,s=QSXCR+Q6sek8bf92,i=4096";
+        let message = Parser::new(message).server_first_message().unwrap();
+        assert_eq!(message.nonce, "fyko+d2lbbFgONRv9qkxdawL3rfcNHYJY1ZVvWVs7j");
+        assert_eq!(message.salt, "QSXCR+Q6sek8bf92");
+        assert_eq!(message.iteration_count, 4096);
+    }
+
+    // recorded auth exchange from psql
+    #[tokio::test]
+    async fn exchange() {
+        let password = "foobar";
+        let nonce = "9IZ2O01zb9IgiIZ1WJ/zgpJB";
+
+        let client_first = "n,,n=,r=9IZ2O01zb9IgiIZ1WJ/zgpJB";
+        let server_first =
+            "r=9IZ2O01zb9IgiIZ1WJ/zgpJBjx/oIRLs02gGSHcw1KEty3eY,s=fs3IXBy7U7+IvVjZ,i\
+             =4096";
+        let client_final =
+            "c=biws,r=9IZ2O01zb9IgiIZ1WJ/zgpJBjx/oIRLs02gGSHcw1KEty3eY,p=AmNKosjJzS3\
+             1NTlQYNs5BTeQjdHdk7lOflDo5re2an8=";
+        let server_final = "v=U+ppxD5XUKtradnv8e2MkeupiA8FU87Sg8CXzXHDAzw=";
+
+        let mut scram = ScramSha256::new_inner(
+            Credentials::Password(normalize(password.as_bytes())),
+            ChannelBinding::unsupported(),
+            nonce.to_string(),
+        );
+        assert_eq!(str::from_utf8(scram.message()).unwrap(), client_first);
+
+        scram.update(server_first.as_bytes()).await.unwrap();
+        assert_eq!(str::from_utf8(scram.message()).unwrap(), client_final);
+
+        scram.finish(server_final.as_bytes()).unwrap();
+    }
+}
diff --git a/libs/proxy/postgres-protocol2/src/escape/mod.rs b/libs/proxy/postgres-protocol2/src/escape/mod.rs
new file mode 100644
index 0000000000..0ba7efdcac
--- /dev/null
+++ b/libs/proxy/postgres-protocol2/src/escape/mod.rs
@@ -0,0 +1,93 @@
+//! Provides functions for escaping literals and identifiers for use
+//! in SQL queries.
+//!
+//! Prefer parameterized queries where possible. Do not escape
+//! parameters in a parameterized query.
+
+#[cfg(test)]
+mod test;
+
+/// Escape a literal and surround result with single quotes. Not
+/// recommended in most cases.
+///
+/// If input contains backslashes, result will be of the form `
+/// E'...'` so it is safe to use regardless of the setting of
+/// standard_conforming_strings.
+pub fn escape_literal(input: &str) -> String {
+    escape_internal(input, false)
+}
+
+/// Escape an identifier and surround result with double quotes.
+pub fn escape_identifier(input: &str) -> String {
+    escape_internal(input, true)
+}
+
+// Translation of PostgreSQL libpq's PQescapeInternal(). Does not
+// require a connection because input string is known to be valid
+// UTF-8.
+//
+// Escape arbitrary strings.  If as_ident is true, we escape the
+// result as an identifier; if false, as a literal.  The result is
+// returned in a newly allocated buffer.  If we fail due to an
+// encoding violation or out of memory condition, we return NULL,
+// storing an error message into conn.
+fn escape_internal(input: &str, as_ident: bool) -> String {
+    let mut num_backslashes = 0;
+    let mut num_quotes = 0;
+    let quote_char = if as_ident { '"' } else { '\'' };
+
+    // Scan the string for characters that must be escaped.
+    for ch in input.chars() {
+        if ch == quote_char {
+            num_quotes += 1;
+        } else if ch == '\\' {
+            num_backslashes += 1;
+        }
+    }
+
+    // Allocate output String.
+    let mut result_size = input.len() + num_quotes + 3; // two quotes, plus a NUL
+    if !as_ident && num_backslashes > 0 {
+        result_size += num_backslashes + 2;
+    }
+
+    let mut output = String::with_capacity(result_size);
+
+    // If we are escaping a literal that contains backslashes, we use
+    // the escape string syntax so that the result is correct under
+    // either value of standard_conforming_strings.  We also emit a
+    // leading space in this case, to guard against the possibility
+    // that the result might be interpolated immediately following an
+    // identifier.
+    if !as_ident && num_backslashes > 0 {
+        output.push(' ');
+        output.push('E');
+    }
+
+    // Opening quote.
+    output.push(quote_char);
+
+    // Use fast path if possible.
+    //
+    // We've already verified that the input string is well-formed in
+    // the current encoding.  If it contains no quotes and, in the
+    // case of literal-escaping, no backslashes, then we can just copy
+    // it directly to the output buffer, adding the necessary quotes.
+    //
+    // If not, we must rescan the input and process each character
+    // individually.
+    if num_quotes == 0 && (num_backslashes == 0 || as_ident) {
+        output.push_str(input);
+    } else {
+        for ch in input.chars() {
+            if ch == quote_char || (!as_ident && ch == '\\') {
+                output.push(ch);
+            }
+            output.push(ch);
+        }
+    }
+
+    output.push(quote_char);
+
+    output
+}
diff --git a/libs/proxy/postgres-protocol2/src/escape/test.rs b/libs/proxy/postgres-protocol2/src/escape/test.rs
new file mode 100644
index 0000000000..4816a103b7
--- /dev/null
+++ b/libs/proxy/postgres-protocol2/src/escape/test.rs
@@ -0,0 +1,17 @@
+use crate::escape::{escape_identifier, escape_literal};
+
+#[test]
+fn test_escape_idenifier() {
+    assert_eq!(escape_identifier("foo"), String::from("\"foo\""));
+    assert_eq!(escape_identifier("f\\oo"), String::from("\"f\\oo\""));
+    assert_eq!(escape_identifier("f'oo"), String::from("\"f'oo\""));
+    assert_eq!(escape_identifier("f\"oo"), String::from("\"f\"\"oo\""));
+}
+
+#[test]
+fn test_escape_literal() {
+    assert_eq!(escape_literal("foo"), String::from("'foo'"));
+    assert_eq!(escape_literal("f\\oo"), String::from(" E'f\\\\oo'"));
+    assert_eq!(escape_literal("f'oo"), String::from("'f''oo'"));
+    assert_eq!(escape_literal("f\"oo"), String::from("'f\"oo'"));
+}
diff --git a/libs/proxy/postgres-protocol2/src/lib.rs b/libs/proxy/postgres-protocol2/src/lib.rs
new file mode 100644
index 0000000000..947f2f835d
--- /dev/null
+++ b/libs/proxy/postgres-protocol2/src/lib.rs
@@ -0,0 +1,78 @@
+//! Low level Postgres protocol APIs.
+//!
+//! This crate implements the low level components of Postgres's communication
+//! protocol, including message and value serialization and deserialization.
+//! It is designed to be used as a building block by higher level APIs such as
+//! `rust-postgres`, and should not typically be used directly.
+//!
+//! # Note
+//!
+//! This library assumes that the `client_encoding` backend parameter has been
+//! set to `UTF8`. It will most likely not behave properly if that is not the case.
+#![doc(html_root_url = "https://docs.rs/postgres-protocol/0.6")]
+#![warn(missing_docs, rust_2018_idioms, clippy::all)]
+
+use byteorder::{BigEndian, ByteOrder};
+use bytes::{BufMut, BytesMut};
+use std::io;
+
+pub mod authentication;
+pub mod escape;
+pub mod message;
+pub mod password;
+pub mod types;
+
+/// A Postgres OID.
+pub type Oid = u32;
+
+/// A Postgres Log Sequence Number (LSN).
+pub type Lsn = u64;
+
+/// An enum indicating if a value is `NULL` or not.
+pub enum IsNull {
+    /// The value is `NULL`.
+    Yes,
+    /// The value is not `NULL`.
+    No,
+}
+
+fn write_nullable<F, E>(serializer: F, buf: &mut BytesMut) -> Result<(), E>
+where
+    F: FnOnce(&mut BytesMut) -> Result<IsNull, E>,
+    E: From<io::Error>,
+{
+    let base = buf.len();
+    buf.put_i32(0);
+    let size = match serializer(buf)? {
+        IsNull::No => i32::from_usize(buf.len() - base - 4)?,
+        IsNull::Yes => -1,
+    };
+    BigEndian::write_i32(&mut buf[base..], size);
+
+    Ok(())
+}
+
+trait FromUsize: Sized {
+    fn from_usize(x: usize) -> Result<Self, io::Error>;
+}
+
+macro_rules! from_usize {
+    ($t:ty) => {
+        impl FromUsize for $t {
+            #[inline]
+            fn from_usize(x: usize) -> io::Result<$t> {
+                if x > <$t>::MAX as usize {
+                    Err(io::Error::new(
+                        io::ErrorKind::InvalidInput,
+                        "value too large to transmit",
+                    ))
+                } else {
+                    Ok(x as $t)
+                }
+            }
+        }
+    };
+}
+
+from_usize!(i16);
+from_usize!(i32);
diff --git a/libs/proxy/postgres-protocol2/src/message/backend.rs b/libs/proxy/postgres-protocol2/src/message/backend.rs
new file mode 100644
index 0000000000..356d142f3f
--- /dev/null
+++ b/libs/proxy/postgres-protocol2/src/message/backend.rs
@@ -0,0 +1,766 @@
+#![allow(missing_docs)]
+
+use byteorder::{BigEndian, ByteOrder, ReadBytesExt};
+use bytes::{Bytes, BytesMut};
+use fallible_iterator::FallibleIterator;
+use memchr::memchr;
+use std::cmp;
+use std::io::{self, Read};
+use std::ops::Range;
+use std::str;
+
+use crate::Oid;
+
+// top-level message tags
+const PARSE_COMPLETE_TAG: u8 = b'1';
+const BIND_COMPLETE_TAG: u8 = b'2';
+const CLOSE_COMPLETE_TAG: u8 = b'3';
+pub const NOTIFICATION_RESPONSE_TAG: u8 = b'A';
+const COPY_DONE_TAG: u8 = b'c';
+const COMMAND_COMPLETE_TAG: u8 = b'C';
+const COPY_DATA_TAG: u8 = b'd';
+const DATA_ROW_TAG: u8 = b'D';
+const ERROR_RESPONSE_TAG: u8 = b'E';
+const COPY_IN_RESPONSE_TAG: u8 = b'G';
+const COPY_OUT_RESPONSE_TAG: u8 = b'H';
+const COPY_BOTH_RESPONSE_TAG: u8 = b'W';
+const EMPTY_QUERY_RESPONSE_TAG: u8 = b'I';
+const BACKEND_KEY_DATA_TAG: u8 = b'K';
+pub const NO_DATA_TAG: u8 = b'n';
+pub const NOTICE_RESPONSE_TAG: u8 = b'N';
+const AUTHENTICATION_TAG: u8 = b'R';
+const PORTAL_SUSPENDED_TAG: u8 = b's';
+pub const PARAMETER_STATUS_TAG: u8 = b'S';
+const PARAMETER_DESCRIPTION_TAG: u8 = b't';
+const ROW_DESCRIPTION_TAG: u8 = b'T';
+pub const READY_FOR_QUERY_TAG: u8 = b'Z';
+
+#[derive(Debug, Copy, Clone)]
+pub struct Header {
+    tag: u8,
+    len: i32,
+}
+
+#[allow(clippy::len_without_is_empty)]
+impl Header {
+    #[inline]
+    pub fn parse(buf: &[u8]) -> io::Result<Option<Header>> {
+        if buf.len() < 5 {
+            return Ok(None);
+        }
+
+        let tag = buf[0];
+        let len = BigEndian::read_i32(&buf[1..]);
+
+        if len < 4 {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidData,
+                "invalid message length: header length < 4",
+            ));
+        }
+
+        Ok(Some(Header { tag, len }))
+    }
+
+    #[inline]
+    pub fn tag(self) -> u8 {
+        self.tag
+    }
+
+    #[inline]
+    pub fn len(self) -> i32 {
+        self.len
+    }
+}
+
+/// An enum representing Postgres backend messages.
+#[non_exhaustive]
+pub enum Message {
+    AuthenticationCleartextPassword,
+    AuthenticationGss,
+    AuthenticationKerberosV5,
+    AuthenticationMd5Password(AuthenticationMd5PasswordBody),
+    AuthenticationOk,
+    AuthenticationScmCredential,
+    AuthenticationSspi,
+    AuthenticationGssContinue,
+    AuthenticationSasl(AuthenticationSaslBody),
+    AuthenticationSaslContinue(AuthenticationSaslContinueBody),
+    AuthenticationSaslFinal(AuthenticationSaslFinalBody),
+    BackendKeyData(BackendKeyDataBody),
+    BindComplete,
+    CloseComplete,
+    CommandComplete(CommandCompleteBody),
+    CopyData,
+    CopyDone,
+    CopyInResponse,
+    CopyOutResponse,
+    CopyBothResponse,
+    DataRow(DataRowBody),
+    EmptyQueryResponse,
+    ErrorResponse(ErrorResponseBody),
+    NoData,
+    NoticeResponse(NoticeResponseBody),
+    NotificationResponse(NotificationResponseBody),
+    ParameterDescription(ParameterDescriptionBody),
+    ParameterStatus(ParameterStatusBody),
+    ParseComplete,
+    PortalSuspended,
+    ReadyForQuery(ReadyForQueryBody),
+    RowDescription(RowDescriptionBody),
+}
+
+impl Message {
+    #[inline]
+    pub fn parse(buf: &mut BytesMut) -> io::Result<Option<Message>> {
+        if buf.len() < 5 {
+            let to_read = 5 - buf.len();
+            buf.reserve(to_read);
+            return Ok(None);
+        }
+
+        let tag = buf[0];
+        let len = (&buf[1..5]).read_u32::<BigEndian>().unwrap();
+
+        if len < 4 {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidInput,
+                "invalid message length: parsing u32",
+            ));
+        }
+
+        let total_len = len as usize + 1;
+        if buf.len() < total_len {
+            let to_read = total_len - buf.len();
+            buf.reserve(to_read);
+            return Ok(None);
+        }
+
+        let mut buf = Buffer {
+            bytes: buf.split_to(total_len).freeze(),
+            idx: 5,
+        };
+
+        let message = match tag {
+            PARSE_COMPLETE_TAG => Message::ParseComplete,
+            BIND_COMPLETE_TAG => Message::BindComplete,
+            CLOSE_COMPLETE_TAG => Message::CloseComplete,
+            NOTIFICATION_RESPONSE_TAG => {
+                let process_id = buf.read_i32::<BigEndian>()?;
+                let channel = buf.read_cstr()?;
+                let message = buf.read_cstr()?;
+                Message::NotificationResponse(NotificationResponseBody {
+                    process_id,
+                    channel,
+                    message,
+                })
+            }
+            COPY_DONE_TAG => Message::CopyDone,
+            COMMAND_COMPLETE_TAG => {
+                let tag = buf.read_cstr()?;
+                Message::CommandComplete(CommandCompleteBody { tag })
+            }
+            COPY_DATA_TAG => Message::CopyData,
+            DATA_ROW_TAG => {
+                let len = buf.read_u16::<BigEndian>()?;
+                let storage = buf.read_all();
+                Message::DataRow(DataRowBody { storage, len })
+            }
+            ERROR_RESPONSE_TAG => {
+                let storage = buf.read_all();
+                Message::ErrorResponse(ErrorResponseBody { storage })
+            }
+            COPY_IN_RESPONSE_TAG => Message::CopyInResponse,
+            COPY_OUT_RESPONSE_TAG => Message::CopyOutResponse,
+            COPY_BOTH_RESPONSE_TAG => Message::CopyBothResponse,
+            EMPTY_QUERY_RESPONSE_TAG => Message::EmptyQueryResponse,
+            BACKEND_KEY_DATA_TAG => {
+                let process_id = buf.read_i32::<BigEndian>()?;
+                let secret_key = buf.read_i32::<BigEndian>()?;
+                Message::BackendKeyData(BackendKeyDataBody {
+                    process_id,
+                    secret_key,
+                })
+            }
+            NO_DATA_TAG => Message::NoData,
+            NOTICE_RESPONSE_TAG => {
+                let storage = buf.read_all();
+                Message::NoticeResponse(NoticeResponseBody { storage })
+            }
+            AUTHENTICATION_TAG => match buf.read_i32::<BigEndian>()? {
+                0 => Message::AuthenticationOk,
+                2 => Message::AuthenticationKerberosV5,
+                3 => Message::AuthenticationCleartextPassword,
+                5 => {
+                    let mut salt = [0; 4];
+                    buf.read_exact(&mut salt)?;
+                    Message::AuthenticationMd5Password(AuthenticationMd5PasswordBody { salt })
+                }
+                6 => Message::AuthenticationScmCredential,
+                7 => Message::AuthenticationGss,
+                8 => Message::AuthenticationGssContinue,
+                9 => Message::AuthenticationSspi,
+                10 => {
+                    let storage = buf.read_all();
+                    Message::AuthenticationSasl(AuthenticationSaslBody(storage))
+                }
+                11 => {
+                    let storage = buf.read_all();
+                    Message::AuthenticationSaslContinue(AuthenticationSaslContinueBody(storage))
+                }
+                12 => {
+                    let storage = buf.read_all();
+                    Message::AuthenticationSaslFinal(AuthenticationSaslFinalBody(storage))
+                }
+                tag => {
+                    return Err(io::Error::new(
+                        io::ErrorKind::InvalidInput,
+                        format!("unknown authentication tag `{}`", tag),
+                    ));
+                }
+            },
+            PORTAL_SUSPENDED_TAG => Message::PortalSuspended,
+            PARAMETER_STATUS_TAG => {
+                let name = buf.read_cstr()?;
+                let value = buf.read_cstr()?;
+                Message::ParameterStatus(ParameterStatusBody { name, value })
+            }
+            PARAMETER_DESCRIPTION_TAG => {
+                let len = buf.read_u16::<BigEndian>()?;
+                let storage = buf.read_all();
+                Message::ParameterDescription(ParameterDescriptionBody { storage, len })
+            }
+            ROW_DESCRIPTION_TAG => {
+                let len = buf.read_u16::<BigEndian>()?;
+                let storage = buf.read_all();
+                Message::RowDescription(RowDescriptionBody { storage, len })
+            }
+            READY_FOR_QUERY_TAG => {
+                let status = buf.read_u8()?;
+                Message::ReadyForQuery(ReadyForQueryBody { status })
+            }
+            tag => {
+                return Err(io::Error::new(
+                    io::ErrorKind::InvalidInput,
+                    format!("unknown message tag `{}`", tag),
+                ));
+            }
+        };
+
+        if !buf.is_empty() {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidInput,
+                "invalid message length: expected buffer to be empty",
+            ));
+        }
+
+        Ok(Some(message))
+    }
+}
+
+struct Buffer {
+    bytes: Bytes,
+    idx: usize,
+}
+
+impl Buffer {
+    #[inline]
+    fn slice(&self) -> &[u8] {
+        &self.bytes[self.idx..]
+    }
+
+    #[inline]
+    fn is_empty(&self) -> bool {
+        self.slice().is_empty()
+    }
+
+    #[inline]
+    fn read_cstr(&mut self) -> io::Result<Bytes> {
+        match memchr(0, self.slice()) {
+            Some(pos) => {
+                let start = self.idx;
+                let end = start + pos;
+                let cstr = self.bytes.slice(start..end);
+                self.idx = end + 1;
+                Ok(cstr)
+            }
+            None => Err(io::Error::new(
+                io::ErrorKind::UnexpectedEof,
+                "unexpected EOF",
+            )),
+        }
+    }
+
+    #[inline]
+    fn read_all(&mut self) -> Bytes {
+        let buf = self.bytes.slice(self.idx..);
+        self.idx = self.bytes.len();
+        buf
+    }
+}
+
+impl Read for Buffer {
+    #[inline]
+    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
+        let len = {
+            let slice = self.slice();
+            let len = cmp::min(slice.len(), buf.len());
+            buf[..len].copy_from_slice(&slice[..len]);
+            len
+        };
+        self.idx += len;
+        Ok(len)
+    }
+}
+
+pub struct AuthenticationMd5PasswordBody {
+    salt: [u8; 4],
+}
+
+impl AuthenticationMd5PasswordBody {
+    #[inline]
+    pub fn salt(&self) -> [u8; 4] {
+        self.salt
+    }
+}
+
+pub struct AuthenticationSaslBody(Bytes);
+
+impl AuthenticationSaslBody {
+    #[inline]
+    pub fn mechanisms(&self) -> SaslMechanisms<'_> {
+        SaslMechanisms(&self.0)
+    }
+}
+
+pub struct SaslMechanisms<'a>(&'a [u8]);
+
+impl<'a> FallibleIterator for SaslMechanisms<'a> {
+    type Item = &'a str;
+    type Error = io::Error;
+
+    #[inline]
+    fn next(&mut self) -> io::Result<Option<&'a str>> {
+        let value_end = find_null(self.0, 0)?;
+        if value_end == 0 {
+            if self.0.len() != 1 {
+                return Err(io::Error::new(
+                    io::ErrorKind::InvalidData,
+                    "invalid message length: expected to be at end of iterator for sasl",
+                ));
+            }
+            Ok(None)
+        } else {
+            let value = get_str(&self.0[..value_end])?;
+            self.0 = &self.0[value_end + 1..];
+            Ok(Some(value))
+        }
+    }
+}
+
+pub struct AuthenticationSaslContinueBody(Bytes);
+
+impl AuthenticationSaslContinueBody {
+    #[inline]
+    pub fn data(&self) -> &[u8] {
+        &self.0
+    }
+}
+
+pub struct AuthenticationSaslFinalBody(Bytes);
+
+impl AuthenticationSaslFinalBody {
+    #[inline]
+    pub fn data(&self) -> &[u8] {
+        &self.0
+    }
+}
+
+pub struct BackendKeyDataBody {
+    process_id: i32,
+    secret_key: i32,
+}
+
+impl BackendKeyDataBody {
+    #[inline]
+    pub fn process_id(&self) -> i32 {
+        self.process_id
+    }
+
+    #[inline]
+    pub fn secret_key(&self) -> i32 {
+        self.secret_key
+    }
+}
+
+pub struct CommandCompleteBody {
+    tag: Bytes,
+}
+
+impl CommandCompleteBody {
+    #[inline]
+    pub fn tag(&self) -> io::Result<&str> {
+        get_str(&self.tag)
+    }
+}
+
+#[derive(Debug)]
+pub struct DataRowBody {
+    storage: Bytes,
+    len: u16,
+}
+
+impl DataRowBody {
+    #[inline]
+    pub fn ranges(&self) -> DataRowRanges<'_> {
+        DataRowRanges {
+            buf: &self.storage,
+            len: self.storage.len(),
+            remaining: self.len,
+        }
+    }
+
+    #[inline]
+    pub fn buffer(&self) -> &[u8] {
+        &self.storage
+    }
+}
+
+pub struct DataRowRanges<'a> {
+    buf: &'a [u8],
+    len: usize,
+    remaining: u16,
+}
+
+impl FallibleIterator for DataRowRanges<'_> {
+    type Item = Option<Range<usize>>;
+    type Error = io::Error;
+
+    #[inline]
+    fn next(&mut self) -> io::Result<Option<Option<Range<usize>>>> {
+        if self.remaining == 0 {
+            if self.buf.is_empty() {
+                return Ok(None);
+            } else {
+                return Err(io::Error::new(
+                    io::ErrorKind::InvalidInput,
+                    "invalid message length: datarowrange is not empty",
+                ));
+            }
+        }
+
+        self.remaining -= 1;
+        let len = self.buf.read_i32::<BigEndian>()?;
+        if len < 0 {
+            Ok(Some(None))
+        } else {
+            let len = len as usize;
+            if self.buf.len() < len {
+                return Err(io::Error::new(
+                    io::ErrorKind::UnexpectedEof,
+                    "unexpected EOF",
+                ));
+            }
+            let base = self.len - self.buf.len();
+            self.buf = &self.buf[len..];
+            Ok(Some(Some(base..base + len)))
+        }
+    }
+
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let len = self.remaining as usize;
+        (len, Some(len))
+    }
+}
+
+pub struct ErrorResponseBody {
+    storage: Bytes,
+}
+
+impl ErrorResponseBody {
+    #[inline]
+    pub fn fields(&self) -> ErrorFields<'_> {
+        ErrorFields { buf: &self.storage }
+    }
+}
+
+pub struct ErrorFields<'a> {
+    buf: &'a [u8],
+}
+
+impl<'a> FallibleIterator for ErrorFields<'a> {
+    type Item = ErrorField<'a>;
+    type Error = io::Error;
+
+    #[inline]
+    fn next(&mut self) -> io::Result<Option<ErrorField<'a>>> {
+        let type_ = self.buf.read_u8()?;
+        if type_ == 0 {
+            if self.buf.is_empty() {
+                return Ok(None);
+            } else {
+                return Err(io::Error::new(
+                    io::ErrorKind::InvalidInput,
+                    "invalid message length: error fields is not drained",
+                ));
+            }
+        }
+
+        let value_end = find_null(self.buf, 0)?;
+        let value = get_str(&self.buf[..value_end])?;
+        self.buf = &self.buf[value_end + 1..];
+
+        Ok(Some(ErrorField { type_, value }))
+    }
+}
+
+pub struct ErrorField<'a> {
+    type_: u8,
+    value: &'a str,
+}
+
+impl ErrorField<'_> {
+    #[inline]
+    pub fn type_(&self) -> u8 {
+        self.type_
+    }
+
+    #[inline]
+    pub fn value(&self) -> &str {
+        self.value
+    }
+}
+
+pub struct NoticeResponseBody {
+    storage: Bytes,
+}
+
+impl NoticeResponseBody {
+    #[inline]
+    pub fn fields(&self) -> ErrorFields<'_> {
+        ErrorFields { buf: &self.storage }
+    }
+}
+
+pub struct NotificationResponseBody {
+    process_id: i32,
+    channel: Bytes,
+    message: Bytes,
+}
+
+impl NotificationResponseBody {
+    #[inline]
+    pub fn process_id(&self) -> i32 {
+        self.process_id
+    }
+
+    #[inline]
+    pub fn channel(&self) -> io::Result<&str> {
+        get_str(&self.channel)
+    }
+
+    #[inline]
+    pub fn message(&self) -> io::Result<&str> {
+        get_str(&self.message)
+    }
+}
+
+pub struct ParameterDescriptionBody {
+    storage: Bytes,
+    len: u16,
+}
+
+impl ParameterDescriptionBody {
+    #[inline]
+    pub fn parameters(&self) -> Parameters<'_> {
+        Parameters {
+            buf: &self.storage,
+            remaining: self.len,
+        }
+    }
+}
+
+pub struct Parameters<'a> {
+    buf: &'a [u8],
+    remaining: u16,
+}
+
+impl FallibleIterator for Parameters<'_> {
+    type Item = Oid;
+    type Error = io::Error;
+
+    #[inline]
+    fn next(&mut self) -> io::Result<Option<Oid>> {
+        if self.remaining == 0 {
+            if self.buf.is_empty() {
+                return Ok(None);
+            } else {
+                return Err(io::Error::new(
+                    io::ErrorKind::InvalidInput,
+                    "invalid message length: parameters is not drained",
+                ));
+            }
+        }
+
+        self.remaining -= 1;
+        self.buf.read_u32::<BigEndian>().map(Some)
+    }
+
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let len = self.remaining as usize;
+        (len, Some(len))
+    }
+}
+
+pub struct ParameterStatusBody {
+    name: Bytes,
+    value: Bytes,
+}
+
+impl ParameterStatusBody {
+    #[inline]
+    pub fn name(&self) -> io::Result<&str> {
+        get_str(&self.name)
+    }
+
+    #[inline]
+    pub fn value(&self) -> io::Result<&str> {
+        get_str(&self.value)
+    }
+}
+
+pub struct ReadyForQueryBody {
+    status: u8,
+}
+
+impl ReadyForQueryBody {
+    #[inline]
+    pub fn status(&self) -> u8 {
+        self.status
+    }
+}
+
+pub struct RowDescriptionBody {
+    storage: Bytes,
+    len: u16,
+}
+
+impl RowDescriptionBody {
+    #[inline]
+    pub fn fields(&self) -> Fields<'_> {
+        Fields {
+            buf: &self.storage,
+            remaining: self.len,
+        }
+    }
+}
+
+pub struct Fields<'a> {
+    buf: &'a [u8],
+    remaining: u16,
+}
+
+impl<'a> FallibleIterator for Fields<'a> {
+    type Item = Field<'a>;
+    type Error = io::Error;
+
+    #[inline]
+    fn next(&mut self) -> io::Result<Option<Field<'a>>> {
+        if self.remaining == 0 {
+            if self.buf.is_empty() {
+                return Ok(None);
+            } else {
+                return Err(io::Error::new(
+                    io::ErrorKind::InvalidInput,
+                    "invalid message length: field is not drained",
+                ));
+            }
+        }
+
+        self.remaining -= 1;
+        let name_end = find_null(self.buf, 0)?;
+        let name = get_str(&self.buf[..name_end])?;
+        self.buf = &self.buf[name_end + 1..];
+        let table_oid = self.buf.read_u32::<BigEndian>()?;
+        let column_id = self.buf.read_i16::<BigEndian>()?;
+        let type_oid = self.buf.read_u32::<BigEndian>()?;
+        let type_size = self.buf.read_i16::<BigEndian>()?;
+        let type_modifier = self.buf.read_i32::<BigEndian>()?;
+        let format = self.buf.read_i16::<BigEndian>()?;
+
+        Ok(Some(Field {
+            name,
+            table_oid,
+            column_id,
+            type_oid,
+            type_size,
+            type_modifier,
+            format,
+        }))
+    }
+}
+
+pub struct Field<'a> {
+    name: &'a str,
+    table_oid: Oid,
+    column_id: i16,
+    type_oid: Oid,
+    type_size: i16,
+    type_modifier: i32,
+    format: i16,
+}
+
+impl<'a> Field<'a> {
+    #[inline]
+    pub fn name(&self) -> &'a str {
+        self.name
+    }
+
+    #[inline]
+    pub fn table_oid(&self) -> Oid {
+        self.table_oid
+    }
+
+    #[inline]
+    pub fn column_id(&self) -> i16 {
+        self.column_id
+    }
+
+    #[inline]
+    pub fn type_oid(&self) -> Oid {
+        self.type_oid
+    }
+
+    #[inline]
+    pub fn type_size(&self) -> i16 {
+        self.type_size
+    }
+
+    #[inline]
+    pub fn type_modifier(&self) -> i32 {
+        self.type_modifier
+    }
+
+    #[inline]
+    pub fn format(&self) -> i16 {
+        self.format
+    }
+}
+
+#[inline]
+fn find_null(buf: &[u8], start: usize) -> io::Result<usize> {
+    match memchr(0, &buf[start..]) {
+        Some(pos) => Ok(pos + start),
+        None => Err(io::Error::new(
+            io::ErrorKind::UnexpectedEof,
+            "unexpected EOF",
+        )),
+    }
+}
+
+#[inline]
+fn get_str(buf: &[u8]) -> io::Result<&str> {
+    str::from_utf8(buf).map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))
+}
diff --git a/libs/proxy/postgres-protocol2/src/message/frontend.rs b/libs/proxy/postgres-protocol2/src/message/frontend.rs
new file mode 100644
index 0000000000..5d0a8ff8c8
--- /dev/null
+++ b/libs/proxy/postgres-protocol2/src/message/frontend.rs
@@ -0,0 +1,297 @@
+//! Frontend message serialization.
+#![allow(missing_docs)]
+
+use byteorder::{BigEndian, ByteOrder};
+use bytes::{Buf, BufMut, BytesMut};
+use std::convert::TryFrom;
+use std::error::Error;
+use std::io;
+use std::marker;
+
+use crate::{write_nullable, FromUsize, IsNull, Oid};
+
+#[inline]
+fn write_body<F, E>(buf: &mut BytesMut, f: F) -> Result<(), E>
+where
+    F: FnOnce(&mut BytesMut) -> Result<(), E>,
+    E: From<io::Error>,
+{
+    let base = buf.len();
+    buf.extend_from_slice(&[0; 4]);
+
+    f(buf)?;
+
+    let size = i32::from_usize(buf.len() - base)?;
+    BigEndian::write_i32(&mut buf[base..], size);
+    Ok(())
+}
+
+pub enum BindError {
+    Conversion(Box<dyn Error + marker::Sync + Send>),
+    Serialization(io::Error),
+}
+
+impl From<Box<dyn Error + marker::Sync + Send>> for BindError {
+    #[inline]
+    fn from(e: Box<dyn Error + marker::Sync + Send>) -> BindError {
+        BindError::Conversion(e)
+    }
+}
+
+impl From<io::Error> for BindError {
+    #[inline]
+    fn from(e: io::Error) -> BindError {
+        BindError::Serialization(e)
+    }
+}
+
+#[inline]
+pub fn bind<I, J, F, T, K>(
+    portal: &str,
+    statement: &str,
+    formats: I,
+    values: J,
+    mut serializer: F,
+    result_formats: K,
+    buf: &mut BytesMut,
+) -> Result<(), BindError>
+where
+    I: IntoIterator<Item = i16>,
+    J: IntoIterator<Item = T>,
+    F: FnMut(T, &mut BytesMut) -> Result<IsNull, Box<dyn Error + marker::Sync + Send>>,
+    K: IntoIterator<Item = i16>,
+{
+    buf.put_u8(b'B');
+
+    write_body(buf, |buf| {
+        write_cstr(portal.as_bytes(), buf)?;
+        write_cstr(statement.as_bytes(), buf)?;
+        write_counted(
+            formats,
+            |f, buf| {
+                buf.put_i16(f);
+                Ok::<_, io::Error>(())
+            },
+            buf,
+        )?;
+        write_counted(
+            values,
+            |v, buf| write_nullable(|buf| serializer(v, buf), buf),
+            buf,
+        )?;
+        write_counted(
+            result_formats,
+            |f, buf| {
+                buf.put_i16(f);
+                Ok::<_, io::Error>(())
+            },
+            buf,
+        )?;
+
+        Ok(())
+    })
+}
+
+#[inline]
+fn write_counted<I, T, F, E>(items: I, mut serializer: F, buf: &mut BytesMut) -> Result<(), E>
+where
+    I: IntoIterator<Item = T>,
+    F: FnMut(T, &mut BytesMut) -> Result<(), E>,
+    E: From<io::Error>,
+{
+    let base = buf.len();
+    buf.extend_from_slice(&[0; 2]);
+    let mut count = 0;
+    for item in items {
+        serializer(item, buf)?;
+        count += 1;
+    }
+    let count = i16::from_usize(count)?;
+    BigEndian::write_i16(&mut buf[base..], count);
+
+    Ok(())
+}
+
+#[inline]
+pub fn cancel_request(process_id: i32, secret_key: i32, buf: &mut BytesMut) {
+    write_body(buf, |buf| {
+        buf.put_i32(80_877_102);
+        buf.put_i32(process_id);
+        buf.put_i32(secret_key);
+        Ok::<_, io::Error>(())
+    })
+    .unwrap();
+}
+
+#[inline]
+pub fn close(variant: u8, name: &str, buf: &mut BytesMut) -> io::Result<()> {
+    buf.put_u8(b'C');
+    write_body(buf, |buf| {
+        buf.put_u8(variant);
+        write_cstr(name.as_bytes(), buf)
+    })
+}
+
+pub struct CopyData<T> {
+    buf: T,
+    len: i32,
+}
+
+impl<T> CopyData<T>
+where
+    T: Buf,
+{
+    pub fn new(buf: T) -> io::Result<CopyData<T>> {
+        let len = buf
+            .remaining()
+            .checked_add(4)
+            .and_then(|l| i32::try_from(l).ok())
+            .ok_or_else(|| {
+                io::Error::new(io::ErrorKind::InvalidInput, "message length overflow")
+            })?;
+
+        Ok(CopyData { buf, len })
+    }
+
+    pub fn write(self, out: &mut BytesMut) {
+        out.put_u8(b'd');
+        out.put_i32(self.len);
+        out.put(self.buf);
+    }
+}
+
+#[inline]
+pub fn copy_done(buf: &mut BytesMut) {
+    buf.put_u8(b'c');
+    write_body(buf, |_| Ok::<(), io::Error>(())).unwrap();
+}
+
+#[inline]
+pub fn copy_fail(message: &str, buf: &mut BytesMut) -> io::Result<()> {
+    buf.put_u8(b'f');
+    write_body(buf, |buf| write_cstr(message.as_bytes(), buf))
+}
+
+#[inline]
+pub fn describe(variant: u8, name: &str, buf: &mut BytesMut) -> io::Result<()> {
+    buf.put_u8(b'D');
+    write_body(buf, |buf| {
+        buf.put_u8(variant);
+        write_cstr(name.as_bytes(), buf)
+    })
+}
+
+#[inline]
+pub fn execute(portal: &str, max_rows: i32, buf: &mut BytesMut) -> io::Result<()> {
+    buf.put_u8(b'E');
+    write_body(buf, |buf| {
+        write_cstr(portal.as_bytes(), buf)?;
+        buf.put_i32(max_rows);
+        Ok(())
+    })
+}
+
+#[inline]
+pub fn parse<I>(name: &str, query: &str, param_types: I, buf: &mut BytesMut) -> io::Result<()>
+where
+    I: IntoIterator<Item = Oid>,
+{
+    buf.put_u8(b'P');
+    write_body(buf, |buf| {
+        write_cstr(name.as_bytes(), buf)?;
+        write_cstr(query.as_bytes(), buf)?;
+        write_counted(
+            param_types,
+            |t, buf| {
+                buf.put_u32(t);
+                Ok::<_, io::Error>(())
+            },
+            buf,
+        )?;
+        Ok(())
+    })
+}
+
+#[inline]
+pub fn password_message(password: &[u8], buf: &mut BytesMut) -> io::Result<()> {
+    buf.put_u8(b'p');
+    write_body(buf, |buf| write_cstr(password, buf))
+}
+
+#[inline]
+pub fn query(query: &str, buf: &mut BytesMut) -> io::Result<()> {
+    buf.put_u8(b'Q');
+    write_body(buf, |buf| write_cstr(query.as_bytes(), buf))
+}
+
+#[inline]
+pub fn sasl_initial_response(mechanism: &str, data: &[u8], buf: &mut BytesMut) -> io::Result<()> {
+    buf.put_u8(b'p');
+    write_body(buf, |buf| {
+        write_cstr(mechanism.as_bytes(), buf)?;
+        let len = i32::from_usize(data.len())?;
+        buf.put_i32(len);
+        buf.put_slice(data);
+        Ok(())
+    })
+}
+
+#[inline]
+pub fn sasl_response(data: &[u8], buf: &mut BytesMut) -> io::Result<()> {
+    buf.put_u8(b'p');
+    write_body(buf, |buf| {
+        buf.put_slice(data);
+        Ok(())
+    })
+}
+
+#[inline]
+pub fn ssl_request(buf: &mut BytesMut) {
+    write_body(buf, |buf| {
+        buf.put_i32(80_877_103);
+        Ok::<_, io::Error>(())
+    })
+    .unwrap();
+}
+
+#[inline]
+pub fn startup_message<'a, I>(parameters: I, buf: &mut BytesMut) -> io::Result<()>
+where
+    I: IntoIterator<Item = (&'a str, &'a str)>,
+{
+    write_body(buf, |buf| {
+        // postgres protocol version 3.0(196608) in bigger-endian
+        buf.put_i32(0x00_03_00_00);
+        for (key, value) in parameters {
+            write_cstr(key.as_bytes(), buf)?;
+            write_cstr(value.as_bytes(), buf)?;
+        }
+        buf.put_u8(0);
+        Ok(())
+    })
+}
+
+#[inline]
+pub fn sync(buf: &mut BytesMut) {
+    buf.put_u8(b'S');
+    write_body(buf, |_| Ok::<(), io::Error>(())).unwrap();
+}
+
+#[inline]
+pub fn terminate(buf: &mut BytesMut) {
+    buf.put_u8(b'X');
+    write_body(buf, |_| Ok::<(), io::Error>(())).unwrap();
+}
+
+#[inline]
+fn write_cstr(s: &[u8], buf: &mut BytesMut) -> Result<(), io::Error> {
+    if s.contains(&0) {
+        return Err(io::Error::new(
+            io::ErrorKind::InvalidInput,
+            "string contains embedded null",
+        ));
+    }
+    buf.put_slice(s);
+    buf.put_u8(0);
+    Ok(())
+}
diff --git a/libs/proxy/postgres-protocol2/src/message/mod.rs b/libs/proxy/postgres-protocol2/src/message/mod.rs
new file mode 100644
index 0000000000..9e5d997548
--- /dev/null
+++ b/libs/proxy/postgres-protocol2/src/message/mod.rs
@@ -0,0 +1,8 @@
+//! Postgres message protocol support.
+//!
+//! See [Postgres's documentation][docs] for more information on message flow.
+//!
+//! [docs]: https://www.postgresql.org/docs/9.5/static/protocol-flow.html
+
+pub mod backend;
+pub mod frontend;
diff --git a/libs/proxy/postgres-protocol2/src/password/mod.rs b/libs/proxy/postgres-protocol2/src/password/mod.rs
new file mode 100644
index 0000000000..e669e80f3f
--- /dev/null
+++ b/libs/proxy/postgres-protocol2/src/password/mod.rs
@@ -0,0 +1,107 @@
+//! Functions to encrypt a password in the client.
+//!
+//! This is intended to be used by client applications that wish to
+//! send commands like `ALTER USER joe PASSWORD 'pwd'`. The password
+//! need not be sent in cleartext if it is encrypted on the client
+//! side. This is good because it ensures the cleartext password won't
+//! end up in logs pg_stat displays, etc.
+
+use crate::authentication::sasl;
+use hmac::{Hmac, Mac};
+use md5::Md5;
+use rand::RngCore;
+use sha2::digest::FixedOutput;
+use sha2::{Digest, Sha256};
+
+#[cfg(test)]
+mod test;
+
+const SCRAM_DEFAULT_ITERATIONS: u32 = 4096;
+const SCRAM_DEFAULT_SALT_LEN: usize = 16;
+
+/// Hash password using SCRAM-SHA-256 with a randomly-generated
+/// salt.
+///
+/// The client may assume the returned string doesn't contain any
+/// special characters that would require escaping in an SQL command.
+pub async fn scram_sha_256(password: &[u8]) -> String {
+    let mut salt: [u8; SCRAM_DEFAULT_SALT_LEN] = [0; SCRAM_DEFAULT_SALT_LEN];
+    let mut rng = rand::thread_rng();
+    rng.fill_bytes(&mut salt);
+    scram_sha_256_salt(password, salt).await
+}
+
+// Internal implementation of scram_sha_256 with a caller-provided
+// salt. This is useful for testing.
+pub(crate) async fn scram_sha_256_salt(
+    password: &[u8],
+    salt: [u8; SCRAM_DEFAULT_SALT_LEN],
+) -> String {
+    // Prepare the password, per [RFC
+    // 4013](https://tools.ietf.org/html/rfc4013), if possible.
+    //
+    // Postgres treats passwords as byte strings (without embedded NUL
+    // bytes), but SASL expects passwords to be valid UTF-8.
+    //
+    // Follow the behavior of libpq's PQencryptPasswordConn(), and
+    // also the backend. If the password is not valid UTF-8, or if it
+    // contains prohibited characters (such as non-ASCII whitespace),
+    // just skip the SASLprep step and use the original byte
+    // sequence.
+    let prepared: Vec<u8> = match std::str::from_utf8(password) {
+        Ok(password_str) => {
+            match stringprep::saslprep(password_str) {
+                Ok(p) => p.into_owned().into_bytes(),
+                // contains invalid characters; skip saslprep
+                Err(_) => Vec::from(password),
+            }
+        }
+        // not valid UTF-8; skip saslprep
+        Err(_) => Vec::from(password),
+    };
+
+    // salt password
+    let salted_password = sasl::hi(&prepared, &salt, SCRAM_DEFAULT_ITERATIONS).await;
+
+    // client key
+    let mut hmac = Hmac::<Sha256>::new_from_slice(&salted_password)
+        .expect("HMAC is able to accept all key sizes");
+    hmac.update(b"Client Key");
+    let client_key = hmac.finalize().into_bytes();
+
+    // stored key
+    let mut hash = Sha256::default();
+    hash.update(client_key.as_slice());
+    let stored_key = hash.finalize_fixed();
+
+    // server key
+    let mut hmac = Hmac::<Sha256>::new_from_slice(&salted_password)
+        .expect("HMAC is able to accept all key sizes");
+    hmac.update(b"Server Key");
+    let server_key = hmac.finalize().into_bytes();
+
+    format!(
+        "SCRAM-SHA-256${}:{}${}:{}",
+        SCRAM_DEFAULT_ITERATIONS,
+        base64::encode(salt),
+        base64::encode(stored_key),
+        base64::encode(server_key)
+    )
+}
+
+/// **Not recommended, as MD5 is not considered to be secure.**
+///
+/// Hash password using MD5 with the username as the salt.
+///
+/// The client may assume the returned string doesn't contain any
+/// special characters that would require escaping.
+pub fn md5(password: &[u8], username: &str) -> String {
+    // salt password with username
+    let mut salted_password = Vec::from(password);
+    salted_password.extend_from_slice(username.as_bytes());
+
+    let mut hash = Md5::new();
+    hash.update(&salted_password);
+    let digest = hash.finalize();
+    format!("md5{:x}", digest)
+}
diff --git a/libs/proxy/postgres-protocol2/src/password/test.rs b/libs/proxy/postgres-protocol2/src/password/test.rs
new file mode 100644
index 0000000000..c9d340f09d
--- /dev/null
+++ b/libs/proxy/postgres-protocol2/src/password/test.rs
@@ -0,0 +1,19 @@
+use crate::password;
+
+#[tokio::test]
+async fn test_encrypt_scram_sha_256() {
+    // Specify the salt to make the test deterministic. Any bytes will do.
+    let salt: [u8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+    assert_eq!(
+        password::scram_sha_256_salt(b"secret", salt).await,
+        "SCRAM-SHA-256$4096:AQIDBAUGBwgJCgsMDQ4PEA==$8rrDg00OqaiWXJ7p+sCgHEIaBSHY89ZJl3mfIsf32oY=:05L1f+yZbiN8O0AnO40Og85NNRhvzTS57naKRWCcsIA="
+    );
+}
+
+#[test]
+fn test_encrypt_md5() {
+    assert_eq!(
+        password::md5(b"secret", "foo"),
+        "md54ab2c5d00339c4b2a4e921d2dc4edec7"
+    );
+}
diff --git a/libs/proxy/postgres-protocol2/src/types/mod.rs b/libs/proxy/postgres-protocol2/src/types/mod.rs
new file mode 100644
index 0000000000..78131c05bf
--- /dev/null
+++ b/libs/proxy/postgres-protocol2/src/types/mod.rs
@@ -0,0 +1,294 @@
+//! Conversions to and from Postgres's binary format for various types.
+use byteorder::{BigEndian, ReadBytesExt};
+use bytes::{BufMut, BytesMut};
+use fallible_iterator::FallibleIterator;
+use std::boxed::Box as StdBox;
+use std::error::Error;
+use std::str;
+
+use crate::Oid;
+
+#[cfg(test)]
+mod test;
+
+/// Serializes a `TEXT`, `VARCHAR`, `CHAR(n)`, `NAME`, or `CITEXT` value.
+#[inline]
+pub fn text_to_sql(v: &str, buf: &mut BytesMut) {
+    buf.put_slice(v.as_bytes());
+}
+
+/// Deserializes a `TEXT`, `VARCHAR`, `CHAR(n)`, `NAME`, or `CITEXT` value.
+#[inline]
+pub fn text_from_sql(buf: &[u8]) -> Result<&str, StdBox<dyn Error + Sync + Send>> {
+    Ok(str::from_utf8(buf)?)
+}
+
+/// Deserializes a `"char"` value.
+#[inline]
+pub fn char_from_sql(mut buf: &[u8]) -> Result<i8, StdBox<dyn Error + Sync + Send>> {
+    let v = buf.read_i8()?;
+    if !buf.is_empty() {
+        return Err("invalid buffer size".into());
+    }
+    Ok(v)
+}
+
+/// Serializes an `OID` value.
+#[inline]
+pub fn oid_to_sql(v: Oid, buf: &mut BytesMut) {
+    buf.put_u32(v);
+}
+
+/// Deserializes an `OID` value.
+#[inline]
+pub fn oid_from_sql(mut buf: &[u8]) -> Result<Oid, StdBox<dyn Error + Sync + Send>> {
+    let v = buf.read_u32::<BigEndian>()?;
+    if !buf.is_empty() {
+        return Err("invalid buffer size".into());
+    }
+    Ok(v)
+}
+
+/// A fallible iterator over `HSTORE` entries.
+pub struct HstoreEntries<'a> {
+    remaining: i32,
+    buf: &'a [u8],
+}
+
+impl<'a> FallibleIterator for HstoreEntries<'a> {
+    type Item = (&'a str, Option<&'a str>);
+    type Error = StdBox<dyn Error + Sync + Send>;
+
+    #[inline]
+    #[allow(clippy::type_complexity)]
+    fn next(
+        &mut self,
+    ) -> Result<Option<(&'a str, Option<&'a str>)>, StdBox<dyn Error + Sync + Send>> {
+        if self.remaining == 0 {
+            if !self.buf.is_empty() {
+                return Err("invalid buffer size".into());
+            }
+            return Ok(None);
+        }
+
+        self.remaining -= 1;
+
+        let key_len = self.buf.read_i32::<BigEndian>()?;
+        if key_len < 0 {
+            return Err("invalid key length".into());
+        }
+        let (key, buf) = self.buf.split_at(key_len as usize);
+        let key = str::from_utf8(key)?;
+        self.buf = buf;
+
+        let value_len = self.buf.read_i32::<BigEndian>()?;
+        let value = if value_len < 0 {
+            None
+        } else {
+            let (value, buf) = self.buf.split_at(value_len as usize);
+            let value = str::from_utf8(value)?;
+            self.buf = buf;
+            Some(value)
+        };
+
+        Ok(Some((key, value)))
+    }
+
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let len = self.remaining as usize;
+        (len, Some(len))
+    }
+}
+
+/// Deserializes an array value.
+#[inline]
+pub fn array_from_sql(mut buf: &[u8]) -> Result<Array<'_>, StdBox<dyn Error + Sync + Send>> {
+    let dimensions = buf.read_i32::<BigEndian>()?;
+    if dimensions < 0 {
+        return Err("invalid dimension count".into());
+    }
+
+    let mut r = buf;
+    let mut elements = 1i32;
+    for _ in 0..dimensions {
+        let len = r.read_i32::<BigEndian>()?;
+        if len < 0 {
+            return Err("invalid dimension size".into());
+        }
+        let _lower_bound = r.read_i32::<BigEndian>()?;
+        elements = match elements.checked_mul(len) {
+            Some(elements) => elements,
+            None => return Err("too many array elements".into()),
+        };
+    }
+
+    if dimensions == 0 {
+        elements = 0;
+    }
+
+    Ok(Array {
+        dimensions,
+        elements,
+        buf,
+    })
+}
+
+/// A Postgres array.
+pub struct Array<'a> {
+    dimensions: i32,
+    elements: i32,
+    buf: &'a [u8],
+}
+
+impl<'a> Array<'a> {
+    /// Returns an iterator over the dimensions of the array.
+    #[inline]
+    pub fn dimensions(&self) -> ArrayDimensions<'a> {
+        ArrayDimensions(&self.buf[..self.dimensions as usize * 8])
+    }
+
+    /// Returns an iterator over the values of the array.
+    #[inline]
+    pub fn values(&self) -> ArrayValues<'a> {
+        ArrayValues {
+            remaining: self.elements,
+            buf: &self.buf[self.dimensions as usize * 8..],
+        }
+    }
+}
+
+/// An iterator over the dimensions of an array.
+pub struct ArrayDimensions<'a>(&'a [u8]);
+
+impl FallibleIterator for ArrayDimensions<'_> {
+    type Item = ArrayDimension;
+    type Error = StdBox<dyn Error + Sync + Send>;
+
+    #[inline]
+    fn next(&mut self) -> Result<Option<ArrayDimension>, StdBox<dyn Error + Sync + Send>> {
+        if self.0.is_empty() {
+            return Ok(None);
+        }
+
+        let len = self.0.read_i32::<BigEndian>()?;
+        let lower_bound = self.0.read_i32::<BigEndian>()?;
+
+        Ok(Some(ArrayDimension { len, lower_bound }))
+    }
+
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let len = self.0.len() / 8;
+        (len, Some(len))
+    }
+}
+
+/// Information about a dimension of an array.
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+pub struct ArrayDimension {
+    /// The length of this dimension.
+    pub len: i32,
+
+    /// The base value used to index into this dimension.
+    pub lower_bound: i32,
+}
+
+/// An iterator over the values of an array, in row-major order.
+pub struct ArrayValues<'a> {
+    remaining: i32,
+    buf: &'a [u8],
+}
+
+impl<'a> FallibleIterator for ArrayValues<'a> {
+    type Item = Option<&'a [u8]>;
+    type Error = StdBox<dyn Error + Sync + Send>;
+
+    #[inline]
+    fn next(&mut self) -> Result<Option<Option<&'a [u8]>>, StdBox<dyn Error + Sync + Send>> {
+        if self.remaining == 0 {
+            if !self.buf.is_empty() {
+                return Err("invalid message length: arrayvalue not drained".into());
+            }
+            return Ok(None);
+        }
+        self.remaining -= 1;
+
+        let len = self.buf.read_i32::<BigEndian>()?;
+        let val = if len < 0 {
+            None
+        } else {
+            if self.buf.len() < len as usize {
+                return Err("invalid value length".into());
+            }
+
+            let (val, buf) = self.buf.split_at(len as usize);
+            self.buf = buf;
+            Some(val)
+        };
+
+        Ok(Some(val))
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let len = self.remaining as usize;
+        (len, Some(len))
+    }
+}
+
+/// Serializes a Postgres ltree string
+#[inline]
+pub fn ltree_to_sql(v: &str, buf: &mut BytesMut) {
+    // A version number is prepended to an ltree string per spec
+    buf.put_u8(1);
+    // Append the rest of the query
+    buf.put_slice(v.as_bytes());
+}
+
+/// Deserialize a Postgres ltree string
+#[inline]
+pub fn ltree_from_sql(buf: &[u8]) -> Result<&str, StdBox<dyn Error + Sync + Send>> {
+    match buf {
+        // Remove the version number from the front of the ltree per spec
+        [1u8, rest @ ..] => Ok(str::from_utf8(rest)?),
+        _ => Err("ltree version 1 only supported".into()),
+    }
+}
+
+/// Serializes a Postgres lquery string
+#[inline]
+pub fn lquery_to_sql(v: &str, buf: &mut BytesMut) {
+    // A version number is prepended to an lquery string per spec
+    buf.put_u8(1);
+    // Append the rest of the query
+    buf.put_slice(v.as_bytes());
+}
+
+/// Deserialize a Postgres lquery string
+#[inline]
+pub fn lquery_from_sql(buf: &[u8]) -> Result<&str, StdBox<dyn Error + Sync + Send>> {
+    match buf {
+        // Remove the version number from the front of the lquery per spec
+        [1u8, rest @ ..] => Ok(str::from_utf8(rest)?),
+        _ => Err("lquery version 1 only supported".into()),
+    }
+}
+
+/// Serializes a Postgres ltxtquery string
+#[inline]
+pub fn ltxtquery_to_sql(v: &str, buf: &mut BytesMut) {
+    // A version number is prepended to an ltxtquery string per spec
+    buf.put_u8(1);
+    // Append the rest of the query
+    buf.put_slice(v.as_bytes());
+}
+
+/// Deserialize a Postgres ltxtquery string
+#[inline]
+pub fn ltxtquery_from_sql(buf: &[u8]) -> Result<&str, StdBox<dyn Error + Sync + Send>> {
+    match buf {
+        // Remove the version number from the front of the ltxtquery per spec
+        [1u8, rest @ ..] => Ok(str::from_utf8(rest)?),
+        _ => Err("ltxtquery version 1 only supported".into()),
+    }
+}
diff --git a/libs/proxy/postgres-protocol2/src/types/test.rs b/libs/proxy/postgres-protocol2/src/types/test.rs
new file mode 100644
index 0000000000..96cc055bc3
--- /dev/null
+++ b/libs/proxy/postgres-protocol2/src/types/test.rs
@@ -0,0 +1,87 @@
+use bytes::{Buf, BytesMut};
+
+use super::*;
+
+#[test]
+fn ltree_sql() {
+    let mut query = vec![1u8];
+    query.extend_from_slice("A.B.C".as_bytes());
+
+    let mut buf = BytesMut::new();
+
+    ltree_to_sql("A.B.C", &mut buf);
+
+    assert_eq!(query.as_slice(), buf.chunk());
+}
+
+#[test]
+fn ltree_str() {
+    let mut query = vec![1u8];
+    query.extend_from_slice("A.B.C".as_bytes());
+
+    assert!(ltree_from_sql(query.as_slice()).is_ok())
+}
+
+#[test]
+fn ltree_wrong_version() {
+    let mut query = vec![2u8];
+    query.extend_from_slice("A.B.C".as_bytes());
+
+    assert!(ltree_from_sql(query.as_slice()).is_err())
+}
+
+#[test]
+fn lquery_sql() {
+    let mut query = vec![1u8];
+    query.extend_from_slice("A.B.C".as_bytes());
+
+    let mut buf = BytesMut::new();
+
+    lquery_to_sql("A.B.C", &mut buf);
+
+    assert_eq!(query.as_slice(), buf.chunk());
+}
+
+#[test]
+fn lquery_str() {
+    let mut query = vec![1u8];
+    query.extend_from_slice("A.B.C".as_bytes());
+
+    assert!(lquery_from_sql(query.as_slice()).is_ok())
+}
+
+#[test]
+fn lquery_wrong_version() {
+    let mut query = vec![2u8];
+    query.extend_from_slice("A.B.C".as_bytes());
+
+    assert!(lquery_from_sql(query.as_slice()).is_err())
+}
+
+#[test]
+fn ltxtquery_sql() {
+    let mut query = vec![1u8];
+    query.extend_from_slice("a & b*".as_bytes());
+
+    let mut buf = BytesMut::new();
+
+    ltree_to_sql("a & b*", &mut buf);
+
+    assert_eq!(query.as_slice(), buf.chunk());
+}
+
+#[test]
+fn ltxtquery_str() {
+    let mut query = vec![1u8];
+    query.extend_from_slice("a & b*".as_bytes());
+
+    assert!(ltree_from_sql(query.as_slice()).is_ok())
+}
+
+#[test]
+fn ltxtquery_wrong_version() {
+    let mut query = vec![2u8];
+    query.extend_from_slice("a & b*".as_bytes());
+
+    assert!(ltree_from_sql(query.as_slice()).is_err())
+}
diff --git a/libs/proxy/postgres-types2/Cargo.toml b/libs/proxy/postgres-types2/Cargo.toml
new file mode 100644
index 0000000000..58cfb5571f
--- /dev/null
+++ b/libs/proxy/postgres-types2/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+name = "postgres-types2"
+version = "0.1.0"
+edition = "2018"
+license = "MIT/Apache-2.0"
+
+[dependencies]
+bytes.workspace = true
+fallible-iterator.workspace = true
+postgres-protocol2 = { path = "../postgres-protocol2" }
diff --git a/libs/proxy/postgres-types2/src/lib.rs b/libs/proxy/postgres-types2/src/lib.rs
new file mode 100644
index 0000000000..18ba032151
--- /dev/null
+++ b/libs/proxy/postgres-types2/src/lib.rs
@@ -0,0 +1,477 @@
+//! Conversions to and from Postgres types.
+//!
+//! This crate is used by the `tokio-postgres` and `postgres` crates. You normally don't need to depend directly on it
+//! unless you want to define your own `ToSql` or `FromSql` definitions.
+#![doc(html_root_url = "https://docs.rs/postgres-types/0.2")]
+#![warn(clippy::all, rust_2018_idioms, missing_docs)]
+
+use fallible_iterator::FallibleIterator;
+use postgres_protocol2::types;
+use std::any::type_name;
+use std::error::Error;
+use std::fmt;
+use std::sync::Arc;
+
+use crate::type_gen::{Inner, Other};
+
+#[doc(inline)]
+pub use postgres_protocol2::Oid;
+
+use bytes::BytesMut;
+
+/// Generates a simple implementation of `ToSql::accepts` which accepts the
+/// types passed to it.
+macro_rules! accepts {
+    ($($expected:ident),+) => (
+        fn accepts(ty: &$crate::Type) -> bool {
+            matches!(*ty, $($crate::Type::$expected)|+)
+        }
+    )
+}
+
+/// Generates an implementation of `ToSql::to_sql_checked`.
+///
+/// All `ToSql` implementations should use this macro.
+macro_rules! to_sql_checked {
+    () => {
+        fn to_sql_checked(
+            &self,
+            ty: &$crate::Type,
+            out: &mut $crate::private::BytesMut,
+        ) -> ::std::result::Result<
+            $crate::IsNull,
+            Box<dyn ::std::error::Error + ::std::marker::Sync + ::std::marker::Send>,
+        > {
+            $crate::__to_sql_checked(self, ty, out)
+        }
+    };
+}
+
+// WARNING: this function is not considered part of this crate's public API.
+// It is subject to change at any time.
+#[doc(hidden)]
+pub fn __to_sql_checked<T>(
+    v: &T,
+    ty: &Type,
+    out: &mut BytesMut,
+) -> Result<IsNull, Box<dyn Error + Sync + Send>>
+where
+    T: ToSql,
+{
+    if !T::accepts(ty) {
+        return Err(Box::new(WrongType::new::<T>(ty.clone())));
+    }
+    v.to_sql(ty, out)
+}
+
+// mod pg_lsn;
+#[doc(hidden)]
+pub mod private;
+// mod special;
+mod type_gen;
+
+/// A Postgres type.
+#[derive(PartialEq, Eq, Clone, Hash)]
+pub struct Type(Inner);
+
+impl fmt::Debug for Type {
+    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
+        fmt::Debug::fmt(&self.0, fmt)
+    }
+}
+
+impl fmt::Display for Type {
+    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self.schema() {
+            "public" | "pg_catalog" => {}
+            schema => write!(fmt, "{}.", schema)?,
+        }
+        fmt.write_str(self.name())
+    }
+}
+
+impl Type {
+    /// Creates a new `Type`.
+    pub fn new(name: String, oid: Oid, kind: Kind, schema: String) -> Type {
+        Type(Inner::Other(Arc::new(Other {
+            name,
+            oid,
+            kind,
+            schema,
+        })))
+    }
+
+    /// Returns the `Type` corresponding to the provided `Oid` if it
+    /// corresponds to a built-in type.
+    pub fn from_oid(oid: Oid) -> Option<Type> {
+        Inner::from_oid(oid).map(Type)
+    }
+
+    /// Returns the OID of the `Type`.
+    pub fn oid(&self) -> Oid {
+        self.0.oid()
+    }
+
+    /// Returns the kind of this type.
+    pub fn kind(&self) -> &Kind {
+        self.0.kind()
+    }
+
+    /// Returns the schema of this type.
+    pub fn schema(&self) -> &str {
+        match self.0 {
+            Inner::Other(ref u) => &u.schema,
+            _ => "pg_catalog",
+        }
+    }
+
+    /// Returns the name of this type.
+    pub fn name(&self) -> &str {
+        self.0.name()
+    }
+}
+
+/// Represents the kind of a Postgres type.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+#[non_exhaustive]
+pub enum Kind {
+    /// A simple type like `VARCHAR` or `INTEGER`.
+    Simple,
+    /// An enumerated type along with its variants.
+    Enum(Vec<String>),
+    /// A pseudo-type.
+    Pseudo,
+    /// An array type along with the type of its elements.
+    Array(Type),
+    /// A range type along with the type of its elements.
+    Range(Type),
+    /// A multirange type along with the type of its elements.
+    Multirange(Type),
+    /// A domain type along with its underlying type.
+    Domain(Type),
+    /// A composite type along with information about its fields.
+    Composite(Vec<Field>),
+}
+
+/// Information about a field of a composite type.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct Field {
+    name: String,
+    type_: Type,
+}
+
+impl Field {
+    /// Creates a new `Field`.
+    pub fn new(name: String, type_: Type) -> Field {
+        Field { name, type_ }
+    }
+
+    /// Returns the name of the field.
+    pub fn name(&self) -> &str {
+        &self.name
+    }
+
+    /// Returns the type of the field.
+    pub fn type_(&self) -> &Type {
+        &self.type_
+    }
+}
+
+/// An error indicating that a `NULL` Postgres value was passed to a `FromSql`
+/// implementation that does not support `NULL` values.
+#[derive(Debug, Clone, Copy)]
+pub struct WasNull;
+
+impl fmt::Display for WasNull {
+    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
+        fmt.write_str("a Postgres value was `NULL`")
+    }
+}
+
+impl Error for WasNull {}
+
+/// An error indicating that a conversion was attempted between incompatible
+/// Rust and Postgres types.
+#[derive(Debug)]
+pub struct WrongType {
+    postgres: Type,
+    rust: &'static str,
+}
+
+impl fmt::Display for WrongType {
+    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(
+            fmt,
+            "cannot convert between the Rust type `{}` and the Postgres type `{}`",
+            self.rust, self.postgres,
+        )
+    }
+}
+
+impl Error for WrongType {}
+
+impl WrongType {
+    /// Creates a new `WrongType` error.
+    pub fn new<T>(ty: Type) -> WrongType {
+        WrongType {
+            postgres: ty,
+            rust: type_name::<T>(),
+        }
+    }
+}
+
+/// An error indicating that a as_text conversion was attempted on a binary
+/// result.
+#[derive(Debug)]
+pub struct WrongFormat {}
+
+impl Error for WrongFormat {}
+
+impl fmt::Display for WrongFormat {
+    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(
+            fmt,
+            "cannot read column as text while it is in binary format"
+        )
+    }
+}
+
+/// A trait for types that can be created from a Postgres value.
+pub trait FromSql<'a>: Sized {
+    /// Creates a new value of this type from a buffer of data of the specified
+    /// Postgres `Type` in its binary format.
+    ///
+    /// The caller of this method is responsible for ensuring that this type
+    /// is compatible with the Postgres `Type`.
+    fn from_sql(ty: &Type, raw: &'a [u8]) -> Result<Self, Box<dyn Error + Sync + Send>>;
+
+    /// Creates a new value of this type from a `NULL` SQL value.
+    ///
+    /// The caller of this method is responsible for ensuring that this type
+    /// is compatible with the Postgres `Type`.
+    ///
+    /// The default implementation returns `Err(Box::new(WasNull))`.
+    #[allow(unused_variables)]
+    fn from_sql_null(ty: &Type) -> Result<Self, Box<dyn Error + Sync + Send>> {
+        Err(Box::new(WasNull))
+    }
+
+    /// A convenience function that delegates to `from_sql` and `from_sql_null` depending on the
+    /// value of `raw`.
+    fn from_sql_nullable(
+        ty: &Type,
+        raw: Option<&'a [u8]>,
+    ) -> Result<Self, Box<dyn Error + Sync + Send>> {
+        match raw {
+            Some(raw) => Self::from_sql(ty, raw),
+            None => Self::from_sql_null(ty),
+        }
+    }
+
+    /// Determines if a value of this type can be created from the specified
+    /// Postgres `Type`.
+    fn accepts(ty: &Type) -> bool;
+}
+
+/// A trait for types which can be created from a Postgres value without borrowing any data.
+///
+/// This is primarily useful for trait bounds on functions.
+pub trait FromSqlOwned: for<'a> FromSql<'a> {}
+
+impl<T> FromSqlOwned for T where T: for<'a> FromSql<'a> {}
+
+impl<'a, T: FromSql<'a>> FromSql<'a> for Option<T> {
+    fn from_sql(ty: &Type, raw: &'a [u8]) -> Result<Option<T>, Box<dyn Error + Sync + Send>> {
+        <T as FromSql>::from_sql(ty, raw).map(Some)
+    }
+
+    fn from_sql_null(_: &Type) -> Result<Option<T>, Box<dyn Error + Sync + Send>> {
+        Ok(None)
+    }
+
+    fn accepts(ty: &Type) -> bool {
+        <T as FromSql>::accepts(ty)
+    }
+}
+
+impl<'a, T: FromSql<'a>> FromSql<'a> for Vec<T> {
+    fn from_sql(ty: &Type, raw: &'a [u8]) -> Result<Vec<T>, Box<dyn Error + Sync + Send>> {
+        let member_type = match *ty.kind() {
+            Kind::Array(ref member) => member,
+            _ => panic!("expected array type"),
+        };
+
+        let array = types::array_from_sql(raw)?;
+        if array.dimensions().count()? > 1 {
+            return Err("array contains too many dimensions".into());
+        }
+
+        array
+            .values()
+            .map(|v| T::from_sql_nullable(member_type, v))
+            .collect()
+    }
+
+    fn accepts(ty: &Type) -> bool {
+        match *ty.kind() {
+            Kind::Array(ref inner) => T::accepts(inner),
+            _ => false,
+        }
+    }
+}
+
+impl<'a> FromSql<'a> for String {
+    fn from_sql(ty: &Type, raw: &'a [u8]) -> Result<String, Box<dyn Error + Sync + Send>> {
+        <&str as FromSql>::from_sql(ty, raw).map(ToString::to_string)
+    }
+
+    fn accepts(ty: &Type) -> bool {
+        <&str as FromSql>::accepts(ty)
+    }
+}
+
+impl<'a> FromSql<'a> for &'a str {
+    fn from_sql(ty: &Type, raw: &'a [u8]) -> Result<&'a str, Box<dyn Error + Sync + Send>> {
+        match *ty {
+            ref ty if ty.name() == "ltree" => types::ltree_from_sql(raw),
+            ref ty if ty.name() == "lquery" => types::lquery_from_sql(raw),
+            ref ty if ty.name() == "ltxtquery" => types::ltxtquery_from_sql(raw),
+            _ => types::text_from_sql(raw),
+        }
+    }
+
+    fn accepts(ty: &Type) -> bool {
+        match *ty {
+            Type::VARCHAR | Type::TEXT | Type::BPCHAR | Type::NAME | Type::UNKNOWN => true,
+            ref ty
+                if (ty.name() == "citext"
+                    || ty.name() == "ltree"
+                    || ty.name() == "lquery"
+                    || ty.name() == "ltxtquery") =>
+            {
+                true
+            }
+            _ => false,
+        }
+    }
+}
+
+macro_rules! simple_from {
+    ($t:ty, $f:ident, $($expected:ident),+) => {
+        impl<'a> FromSql<'a> for $t {
+            fn from_sql(_: &Type, raw: &'a [u8]) -> Result<$t, Box<dyn Error + Sync + Send>> {
+                types::$f(raw)
+            }
+
+            accepts!($($expected),+);
+        }
+    }
+}
+
+simple_from!(i8, char_from_sql, CHAR);
+simple_from!(u32, oid_from_sql, OID);
+
+/// An enum representing the nullability of a Postgres value.
+pub enum IsNull {
+    /// The value is NULL.
+    Yes,
+    /// The value is not NULL.
+    No,
+}
+
+/// A trait for types that can be converted into Postgres values.
+pub trait ToSql: fmt::Debug {
+    /// Converts the value of `self` into the binary format of the specified
+    /// Postgres `Type`, appending it to `out`.
+    ///
+    /// The caller of this method is responsible for ensuring that this type
+    /// is compatible with the Postgres `Type`.
+    ///
+    /// The return value indicates if this value should be represented as
+    /// `NULL`. If this is the case, implementations **must not** write
+    /// anything to `out`.
+    fn to_sql(&self, ty: &Type, out: &mut BytesMut) -> Result<IsNull, Box<dyn Error + Sync + Send>>
+    where
+        Self: Sized;
+
+    /// Determines if a value of this type can be converted to the specified
+    /// Postgres `Type`.
+    fn accepts(ty: &Type) -> bool
+    where
+        Self: Sized;
+
+    /// An adaptor method used internally by Rust-Postgres.
+    ///
+    /// *All* implementations of this method should be generated by the
+    /// `to_sql_checked!()` macro.
+    fn to_sql_checked(
+        &self,
+        ty: &Type,
+        out: &mut BytesMut,
+    ) -> Result<IsNull, Box<dyn Error + Sync + Send>>;
+
+    /// Specify the encode format
+    fn encode_format(&self, _ty: &Type) -> Format {
+        Format::Binary
+    }
+}
+
+/// Supported Postgres message format types
+///
+/// Using Text format in a message assumes a Postgres `SERVER_ENCODING` of `UTF8`
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub enum Format {
+    /// Text format (UTF-8)
+    Text,
+    /// Compact, typed binary format
+    Binary,
+}
+
+impl ToSql for &str {
+    fn to_sql(&self, ty: &Type, w: &mut BytesMut) -> Result<IsNull, Box<dyn Error + Sync + Send>> {
+        match *ty {
+            ref ty if ty.name() == "ltree" => types::ltree_to_sql(self, w),
+            ref ty if ty.name() == "lquery" => types::lquery_to_sql(self, w),
+            ref ty if ty.name() == "ltxtquery" => types::ltxtquery_to_sql(self, w),
+            _ => types::text_to_sql(self, w),
+        }
+        Ok(IsNull::No)
+    }
+
+    fn accepts(ty: &Type) -> bool {
+        match *ty {
+            Type::VARCHAR | Type::TEXT | Type::BPCHAR | Type::NAME | Type::UNKNOWN => true,
+            ref ty
+                if (ty.name() == "citext"
+                    || ty.name() == "ltree"
+                    || ty.name() == "lquery"
+                    || ty.name() == "ltxtquery") =>
+            {
+                true
+            }
+            _ => false,
+        }
+    }
+
+    to_sql_checked!();
+}
+
+macro_rules! simple_to {
+    ($t:ty, $f:ident, $($expected:ident),+) => {
+        impl ToSql for $t {
+            fn to_sql(&self,
+                      _: &Type,
+                      w: &mut BytesMut)
+                      -> Result<IsNull, Box<dyn Error + Sync + Send>> {
+                types::$f(*self, w);
+                Ok(IsNull::No)
+            }
+
+            accepts!($($expected),+);
+
+            to_sql_checked!();
+        }
+    }
+}
+
+simple_to!(u32, oid_to_sql, OID);
diff --git a/libs/proxy/postgres-types2/src/private.rs b/libs/proxy/postgres-types2/src/private.rs
new file mode 100644
index 0000000000..774f9a301c
--- /dev/null
+++ b/libs/proxy/postgres-types2/src/private.rs
@@ -0,0 +1,34 @@
+use crate::{FromSql, Type};
+pub use bytes::BytesMut;
+use std::error::Error;
+
+pub fn read_be_i32(buf: &mut &[u8]) -> Result<i32, Box<dyn Error + Sync + Send>> {
+    if buf.len() < 4 {
+        return Err("invalid buffer size".into());
+    }
+    let mut bytes = [0; 4];
+    bytes.copy_from_slice(&buf[..4]);
+    *buf = &buf[4..];
+    Ok(i32::from_be_bytes(bytes))
+}
+
+pub fn read_value<'a, T>(
+    type_: &Type,
+    buf: &mut &'a [u8],
+) -> Result<T, Box<dyn Error + Sync + Send>>
+where
+    T: FromSql<'a>,
+{
+    let len = read_be_i32(buf)?;
+    let value = if len < 0 {
+        None
+    } else {
+        if len as usize > buf.len() {
+            return Err("invalid buffer size".into());
+        }
+        let (head, tail) = buf.split_at(len as usize);
+        *buf = tail;
+        Some(head)
+    };
+    T::from_sql_nullable(type_, value)
+}
diff --git a/libs/proxy/postgres-types2/src/type_gen.rs b/libs/proxy/postgres-types2/src/type_gen.rs
new file mode 100644
index 0000000000..a1bc3f85c0
--- /dev/null
+++ b/libs/proxy/postgres-types2/src/type_gen.rs
@@ -0,0 +1,1524 @@
+// Autogenerated file - DO NOT EDIT
+use std::sync::Arc;
+
+use crate::{Kind, Oid, Type};
+
+#[derive(PartialEq, Eq, Debug, Hash)]
+pub struct Other {
+    pub name: String,
+    pub oid: Oid,
+    pub kind: Kind,
+    pub schema: String,
+}
+
+#[derive(PartialEq, Eq, Clone, Debug, Hash)]
+pub enum Inner {
+    Bool,
+    Bytea,
+    Char,
+    Name,
+    Int8,
+    Int2,
+    Int2Vector,
+    Int4,
+    Regproc,
+    Text,
+    Oid,
+    Tid,
+    Xid,
+    Cid,
+    OidVector,
+    PgDdlCommand,
+    Json,
+    Xml,
+    XmlArray,
+    PgNodeTree,
+    JsonArray,
+    TableAmHandler,
+    Xid8Array,
+    IndexAmHandler,
+    Point,
+    Lseg,
+    Path,
+    Box,
+    Polygon,
+    Line,
+    LineArray,
+    Cidr,
+    CidrArray,
+    Float4,
+    Float8,
+    Unknown,
+    Circle,
+    CircleArray,
+    Macaddr8,
+    Macaddr8Array,
+    Money,
+    MoneyArray,
+    Macaddr,
+    Inet,
+    BoolArray,
+    ByteaArray,
+    CharArray,
+    NameArray,
+    Int2Array,
+    Int2VectorArray,
+    Int4Array,
+    RegprocArray,
+    TextArray,
+    TidArray,
+    XidArray,
+    CidArray,
+    OidVectorArray,
+    BpcharArray,
+    VarcharArray,
+    Int8Array,
+    PointArray,
+    LsegArray,
+    PathArray,
+    BoxArray,
+    Float4Array,
+    Float8Array,
+    PolygonArray,
+    OidArray,
+    Aclitem,
+    AclitemArray,
+    MacaddrArray,
+    InetArray,
+    Bpchar,
+    Varchar,
+    Date,
+    Time,
+    Timestamp,
+    TimestampArray,
+    DateArray,
+    TimeArray,
+    Timestamptz,
+    TimestamptzArray,
+    Interval,
+    IntervalArray,
+    NumericArray,
+    CstringArray,
+    Timetz,
+    TimetzArray,
+    Bit,
+    BitArray,
+    Varbit,
+    VarbitArray,
+    Numeric,
+    Refcursor,
+    RefcursorArray,
+    Regprocedure,
+    Regoper,
+    Regoperator,
+    Regclass,
+    Regtype,
+    RegprocedureArray,
+    RegoperArray,
+    RegoperatorArray,
+    RegclassArray,
+    RegtypeArray,
+    Record,
+    Cstring,
+    Any,
+    Anyarray,
+    Void,
+    Trigger,
+    LanguageHandler,
+    Internal,
+    Anyelement,
+    RecordArray,
+    Anynonarray,
+    TxidSnapshotArray,
+    Uuid,
+    UuidArray,
+    TxidSnapshot,
+    FdwHandler,
+    PgLsn,
+    PgLsnArray,
+    TsmHandler,
+    PgNdistinct,
+    PgDependencies,
+    Anyenum,
+    TsVector,
+    Tsquery,
+    GtsVector,
+    TsVectorArray,
+    GtsVectorArray,
+    TsqueryArray,
+    Regconfig,
+    RegconfigArray,
+    Regdictionary,
+    RegdictionaryArray,
+    Jsonb,
+    JsonbArray,
+    AnyRange,
+    EventTrigger,
+    Int4Range,
+    Int4RangeArray,
+    NumRange,
+    NumRangeArray,
+    TsRange,
+    TsRangeArray,
+    TstzRange,
+    TstzRangeArray,
+    DateRange,
+    DateRangeArray,
+    Int8Range,
+    Int8RangeArray,
+    Jsonpath,
+    JsonpathArray,
+    Regnamespace,
+    RegnamespaceArray,
+    Regrole,
+    RegroleArray,
+    Regcollation,
+    RegcollationArray,
+    Int4multiRange,
+    NummultiRange,
+    TsmultiRange,
+    TstzmultiRange,
+    DatemultiRange,
+    Int8multiRange,
+    AnymultiRange,
+    AnycompatiblemultiRange,
+    PgBrinBloomSummary,
+    PgBrinMinmaxMultiSummary,
+    PgMcvList,
+    PgSnapshot,
+    PgSnapshotArray,
+    Xid8,
+    Anycompatible,
+    Anycompatiblearray,
+    Anycompatiblenonarray,
+    AnycompatibleRange,
+    Int4multiRangeArray,
+    NummultiRangeArray,
+    TsmultiRangeArray,
+    TstzmultiRangeArray,
+    DatemultiRangeArray,
+    Int8multiRangeArray,
+    Other(Arc<Other>),
+}
+
+impl Inner {
+    pub fn from_oid(oid: Oid) -> Option<Inner> {
+        match oid {
+            16 => Some(Inner::Bool),
+            17 => Some(Inner::Bytea),
+            18 => Some(Inner::Char),
+            19 => Some(Inner::Name),
+            20 => Some(Inner::Int8),
+            21 => Some(Inner::Int2),
+            22 => Some(Inner::Int2Vector),
+            23 => Some(Inner::Int4),
+            24 => Some(Inner::Regproc),
+            25 => Some(Inner::Text),
+            26 => Some(Inner::Oid),
+            27 => Some(Inner::Tid),
+            28 => Some(Inner::Xid),
+            29 => Some(Inner::Cid),
+            30 => Some(Inner::OidVector),
+            32 => Some(Inner::PgDdlCommand),
+            114 => Some(Inner::Json),
+            142 => Some(Inner::Xml),
+            143 => Some(Inner::XmlArray),
+            194 => Some(Inner::PgNodeTree),
+            199 => Some(Inner::JsonArray),
+            269 => Some(Inner::TableAmHandler),
+            271 => Some(Inner::Xid8Array),
+            325 => Some(Inner::IndexAmHandler),
+            600 => Some(Inner::Point),
+            601 => Some(Inner::Lseg),
+            602 => Some(Inner::Path),
+            603 => Some(Inner::Box),
+            604 => Some(Inner::Polygon),
+            628 => Some(Inner::Line),
+            629 => Some(Inner::LineArray),
+            650 => Some(Inner::Cidr),
+            651 => Some(Inner::CidrArray),
+            700 => Some(Inner::Float4),
+            701 => Some(Inner::Float8),
+            705 => Some(Inner::Unknown),
+            718 => Some(Inner::Circle),
+            719 => Some(Inner::CircleArray),
+            774 => Some(Inner::Macaddr8),
+            775 => Some(Inner::Macaddr8Array),
+            790 => Some(Inner::Money),
+            791 => Some(Inner::MoneyArray),
+            829 => Some(Inner::Macaddr),
+            869 => Some(Inner::Inet),
+            1000 => Some(Inner::BoolArray),
+            1001 => Some(Inner::ByteaArray),
+            1002 => Some(Inner::CharArray),
+            1003 => Some(Inner::NameArray),
+            1005 => Some(Inner::Int2Array),
+            1006 => Some(Inner::Int2VectorArray),
+            1007 => Some(Inner::Int4Array),
+            1008 => Some(Inner::RegprocArray),
+            1009 => Some(Inner::TextArray),
+            1010 => Some(Inner::TidArray),
+            1011 => Some(Inner::XidArray),
+            1012 => Some(Inner::CidArray),
+            1013 => Some(Inner::OidVectorArray),
+            1014 => Some(Inner::BpcharArray),
+            1015 => Some(Inner::VarcharArray),
+            1016 => Some(Inner::Int8Array),
+            1017 => Some(Inner::PointArray),
+            1018 => Some(Inner::LsegArray),
+            1019 => Some(Inner::PathArray),
+            1020 => Some(Inner::BoxArray),
+            1021 => Some(Inner::Float4Array),
+            1022 => Some(Inner::Float8Array),
+            1027 => Some(Inner::PolygonArray),
+            1028 => Some(Inner::OidArray),
+            1033 => Some(Inner::Aclitem),
+            1034 => Some(Inner::AclitemArray),
+            1040 => Some(Inner::MacaddrArray),
+            1041 => Some(Inner::InetArray),
+            1042 => Some(Inner::Bpchar),
+            1043 => Some(Inner::Varchar),
+            1082 => Some(Inner::Date),
+            1083 => Some(Inner::Time),
+            1114 => Some(Inner::Timestamp),
+            1115 => Some(Inner::TimestampArray),
+            1182 => Some(Inner::DateArray),
+            1183 => Some(Inner::TimeArray),
+            1184 => Some(Inner::Timestamptz),
+            1185 => Some(Inner::TimestamptzArray),
+            1186 => Some(Inner::Interval),
+            1187 => Some(Inner::IntervalArray),
+            1231 => Some(Inner::NumericArray),
+            1263 => Some(Inner::CstringArray),
+            1266 => Some(Inner::Timetz),
+            1270 => Some(Inner::TimetzArray),
+            1560 => Some(Inner::Bit),
+            1561 => Some(Inner::BitArray),
+            1562 => Some(Inner::Varbit),
+            1563 => Some(Inner::VarbitArray),
+            1700 => Some(Inner::Numeric),
+            1790 => Some(Inner::Refcursor),
+            2201 => Some(Inner::RefcursorArray),
+            2202 => Some(Inner::Regprocedure),
+            2203 => Some(Inner::Regoper),
+            2204 => Some(Inner::Regoperator),
+            2205 => Some(Inner::Regclass),
+            2206 => Some(Inner::Regtype),
+            2207 => Some(Inner::RegprocedureArray),
+            2208 => Some(Inner::RegoperArray),
+            2209 => Some(Inner::RegoperatorArray),
+            2210 => Some(Inner::RegclassArray),
+            2211 => Some(Inner::RegtypeArray),
+            2249 => Some(Inner::Record),
+            2275 => Some(Inner::Cstring),
+            2276 => Some(Inner::Any),
+            2277 => Some(Inner::Anyarray),
+            2278 => Some(Inner::Void),
+            2279 => Some(Inner::Trigger),
+            2280 => Some(Inner::LanguageHandler),
+            2281 => Some(Inner::Internal),
+            2283 => Some(Inner::Anyelement),
+            2287 => Some(Inner::RecordArray),
+            2776 => Some(Inner::Anynonarray),
+            2949 => Some(Inner::TxidSnapshotArray),
+            2950 => Some(Inner::Uuid),
+            2951 => Some(Inner::UuidArray),
+            2970 => Some(Inner::TxidSnapshot),
+            3115 => Some(Inner::FdwHandler),
+            3220 => Some(Inner::PgLsn),
+            3221 => Some(Inner::PgLsnArray),
+            3310 => Some(Inner::TsmHandler),
+            3361 => Some(Inner::PgNdistinct),
+            3402 => Some(Inner::PgDependencies),
+            3500 => Some(Inner::Anyenum),
+            3614 => Some(Inner::TsVector),
+            3615 => Some(Inner::Tsquery),
+            3642 => Some(Inner::GtsVector),
+            3643 => Some(Inner::TsVectorArray),
+            3644 => Some(Inner::GtsVectorArray),
+            3645 => Some(Inner::TsqueryArray),
+            3734 => Some(Inner::Regconfig),
+            3735 => Some(Inner::RegconfigArray),
+            3769 => Some(Inner::Regdictionary),
+            3770 => Some(Inner::RegdictionaryArray),
+            3802 => Some(Inner::Jsonb),
+            3807 => Some(Inner::JsonbArray),
+            3831 => Some(Inner::AnyRange),
+            3838 => Some(Inner::EventTrigger),
+            3904 => Some(Inner::Int4Range),
+            3905 => Some(Inner::Int4RangeArray),
+            3906 => Some(Inner::NumRange),
+            3907 => Some(Inner::NumRangeArray),
+            3908 => Some(Inner::TsRange),
+            3909 => Some(Inner::TsRangeArray),
+            3910 => Some(Inner::TstzRange),
+            3911 => Some(Inner::TstzRangeArray),
+            3912 => Some(Inner::DateRange),
+            3913 => Some(Inner::DateRangeArray),
+            3926 => Some(Inner::Int8Range),
+            3927 => Some(Inner::Int8RangeArray),
+            4072 => Some(Inner::Jsonpath),
+            4073 => Some(Inner::JsonpathArray),
+            4089 => Some(Inner::Regnamespace),
+            4090 => Some(Inner::RegnamespaceArray),
+            4096 => Some(Inner::Regrole),
+            4097 => Some(Inner::RegroleArray),
+            4191 => Some(Inner::Regcollation),
+            4192 => Some(Inner::RegcollationArray),
+            4451 => Some(Inner::Int4multiRange),
+            4532 => Some(Inner::NummultiRange),
+            4533 => Some(Inner::TsmultiRange),
+            4534 => Some(Inner::TstzmultiRange),
+            4535 => Some(Inner::DatemultiRange),
+            4536 => Some(Inner::Int8multiRange),
+            4537 => Some(Inner::AnymultiRange),
+            4538 => Some(Inner::AnycompatiblemultiRange),
+            4600 => Some(Inner::PgBrinBloomSummary),
+            4601 => Some(Inner::PgBrinMinmaxMultiSummary),
+            5017 => Some(Inner::PgMcvList),
+            5038 => Some(Inner::PgSnapshot),
+            5039 => Some(Inner::PgSnapshotArray),
+            5069 => Some(Inner::Xid8),
+            5077 => Some(Inner::Anycompatible),
+            5078 => Some(Inner::Anycompatiblearray),
+            5079 => Some(Inner::Anycompatiblenonarray),
+            5080 => Some(Inner::AnycompatibleRange),
+            6150 => Some(Inner::Int4multiRangeArray),
+            6151 => Some(Inner::NummultiRangeArray),
+            6152 => Some(Inner::TsmultiRangeArray),
+            6153 => Some(Inner::TstzmultiRangeArray),
+            6155 => Some(Inner::DatemultiRangeArray),
+            6157 => Some(Inner::Int8multiRangeArray),
+            _ => None,
+        }
+    }
+
+    pub fn oid(&self) -> Oid {
+        match *self {
+            Inner::Bool => 16,
+            Inner::Bytea => 17,
+            Inner::Char => 18,
+            Inner::Name => 19,
+            Inner::Int8 => 20,
+            Inner::Int2 => 21,
+            Inner::Int2Vector => 22,
+            Inner::Int4 => 23,
+            Inner::Regproc => 24,
+            Inner::Text => 25,
+            Inner::Oid => 26,
+            Inner::Tid => 27,
+            Inner::Xid => 28,
+            Inner::Cid => 29,
+            Inner::OidVector => 30,
+            Inner::PgDdlCommand => 32,
+            Inner::Json => 114,
+            Inner::Xml => 142,
+            Inner::XmlArray => 143,
+            Inner::PgNodeTree => 194,
+            Inner::JsonArray => 199,
+            Inner::TableAmHandler => 269,
+            Inner::Xid8Array => 271,
+            Inner::IndexAmHandler => 325,
+            Inner::Point => 600,
+            Inner::Lseg => 601,
+            Inner::Path => 602,
+            Inner::Box => 603,
+            Inner::Polygon => 604,
+            Inner::Line => 628,
+            Inner::LineArray => 629,
+            Inner::Cidr => 650,
+            Inner::CidrArray => 651,
+            Inner::Float4 => 700,
+            Inner::Float8 => 701,
+            Inner::Unknown => 705,
+            Inner::Circle => 718,
+            Inner::CircleArray => 719,
+            Inner::Macaddr8 => 774,
+            Inner::Macaddr8Array => 775,
+            Inner::Money => 790,
+            Inner::MoneyArray => 791,
+            Inner::Macaddr => 829,
+            Inner::Inet => 869,
+            Inner::BoolArray => 1000,
+            Inner::ByteaArray => 1001,
+            Inner::CharArray => 1002,
+            Inner::NameArray => 1003,
+            Inner::Int2Array => 1005,
+            Inner::Int2VectorArray => 1006,
+            Inner::Int4Array => 1007,
+            Inner::RegprocArray => 1008,
+            Inner::TextArray => 1009,
+            Inner::TidArray => 1010,
+            Inner::XidArray => 1011,
+            Inner::CidArray => 1012,
+            Inner::OidVectorArray => 1013,
+            Inner::BpcharArray => 1014,
+            Inner::VarcharArray => 1015,
+            Inner::Int8Array => 1016,
+            Inner::PointArray => 1017,
+            Inner::LsegArray => 1018,
+            Inner::PathArray => 1019,
+            Inner::BoxArray => 1020,
+            Inner::Float4Array => 1021,
+            Inner::Float8Array => 1022,
+            Inner::PolygonArray => 1027,
+            Inner::OidArray => 1028,
+            Inner::Aclitem => 1033,
+            Inner::AclitemArray => 1034,
+            Inner::MacaddrArray => 1040,
+            Inner::InetArray => 1041,
+            Inner::Bpchar => 1042,
+            Inner::Varchar => 1043,
+            Inner::Date => 1082,
+            Inner::Time => 1083,
+            Inner::Timestamp => 1114,
+            Inner::TimestampArray => 1115,
+            Inner::DateArray => 1182,
+            Inner::TimeArray => 1183,
+            Inner::Timestamptz => 1184,
+            Inner::TimestamptzArray => 1185,
+            Inner::Interval => 1186,
+            Inner::IntervalArray => 1187,
+            Inner::NumericArray => 1231,
+            Inner::CstringArray => 1263,
+            Inner::Timetz => 1266,
+            Inner::TimetzArray => 1270,
+            Inner::Bit => 1560,
+            Inner::BitArray => 1561,
+            Inner::Varbit => 1562,
+            Inner::VarbitArray => 1563,
+            Inner::Numeric => 1700,
+            Inner::Refcursor => 1790,
+            Inner::RefcursorArray => 2201,
+            Inner::Regprocedure => 2202,
+            Inner::Regoper => 2203,
+            Inner::Regoperator => 2204,
+            Inner::Regclass => 2205,
+            Inner::Regtype => 2206,
+            Inner::RegprocedureArray => 2207,
+            Inner::RegoperArray => 2208,
+            Inner::RegoperatorArray => 2209,
+            Inner::RegclassArray => 2210,
+            Inner::RegtypeArray => 2211,
+            Inner::Record => 2249,
+            Inner::Cstring => 2275,
+            Inner::Any => 2276,
+            Inner::Anyarray => 2277,
+            Inner::Void => 2278,
+            Inner::Trigger => 2279,
+            Inner::LanguageHandler => 2280,
+            Inner::Internal => 2281,
+            Inner::Anyelement => 2283,
+            Inner::RecordArray => 2287,
+            Inner::Anynonarray => 2776,
+            Inner::TxidSnapshotArray => 2949,
+            Inner::Uuid => 2950,
+            Inner::UuidArray => 2951,
+            Inner::TxidSnapshot => 2970,
+            Inner::FdwHandler => 3115,
+            Inner::PgLsn => 3220,
+            Inner::PgLsnArray => 3221,
+            Inner::TsmHandler => 3310,
+            Inner::PgNdistinct => 3361,
+            Inner::PgDependencies => 3402,
+            Inner::Anyenum => 3500,
+            Inner::TsVector => 3614,
+            Inner::Tsquery => 3615,
+            Inner::GtsVector => 3642,
+            Inner::TsVectorArray => 3643,
+            Inner::GtsVectorArray => 3644,
+            Inner::TsqueryArray => 3645,
+            Inner::Regconfig => 3734,
+            Inner::RegconfigArray => 3735,
+            Inner::Regdictionary => 3769,
+            Inner::RegdictionaryArray => 3770,
+            Inner::Jsonb => 3802,
+            Inner::JsonbArray => 3807,
+            Inner::AnyRange => 3831,
+            Inner::EventTrigger => 3838,
+            Inner::Int4Range => 3904,
+            Inner::Int4RangeArray => 3905,
+            Inner::NumRange => 3906,
+            Inner::NumRangeArray => 3907,
+            Inner::TsRange => 3908,
+            Inner::TsRangeArray => 3909,
+            Inner::TstzRange => 3910,
+            Inner::TstzRangeArray => 3911,
+            Inner::DateRange => 3912,
+            Inner::DateRangeArray => 3913,
+            Inner::Int8Range => 3926,
+            Inner::Int8RangeArray => 3927,
+            Inner::Jsonpath => 4072,
+            Inner::JsonpathArray => 4073,
+            Inner::Regnamespace => 4089,
+            Inner::RegnamespaceArray => 4090,
+            Inner::Regrole => 4096,
+            Inner::RegroleArray => 4097,
+            Inner::Regcollation => 4191,
+            Inner::RegcollationArray => 4192,
+            Inner::Int4multiRange => 4451,
+            Inner::NummultiRange => 4532,
+            Inner::TsmultiRange => 4533,
+            Inner::TstzmultiRange => 4534,
+            Inner::DatemultiRange => 4535,
+            Inner::Int8multiRange => 4536,
+            Inner::AnymultiRange => 4537,
+            Inner::AnycompatiblemultiRange => 4538,
+            Inner::PgBrinBloomSummary => 4600,
+            Inner::PgBrinMinmaxMultiSummary => 4601,
+            Inner::PgMcvList => 5017,
+            Inner::PgSnapshot => 5038,
+            Inner::PgSnapshotArray => 5039,
+            Inner::Xid8 => 5069,
+            Inner::Anycompatible => 5077,
+            Inner::Anycompatiblearray => 5078,
+            Inner::Anycompatiblenonarray => 5079,
+            Inner::AnycompatibleRange => 5080,
+            Inner::Int4multiRangeArray => 6150,
+            Inner::NummultiRangeArray => 6151,
+            Inner::TsmultiRangeArray => 6152,
+            Inner::TstzmultiRangeArray => 6153,
+            Inner::DatemultiRangeArray => 6155,
+            Inner::Int8multiRangeArray => 6157,
+            Inner::Other(ref u) => u.oid,
+        }
+    }
+
+    pub fn kind(&self) -> &Kind {
+        match *self {
+            Inner::Bool => &Kind::Simple,
+            Inner::Bytea => &Kind::Simple,
+            Inner::Char => &Kind::Simple,
+            Inner::Name => &Kind::Simple,
+            Inner::Int8 => &Kind::Simple,
+            Inner::Int2 => &Kind::Simple,
+            Inner::Int2Vector => &Kind::Array(Type(Inner::Int2)),
+            Inner::Int4 => &Kind::Simple,
+            Inner::Regproc => &Kind::Simple,
+            Inner::Text => &Kind::Simple,
+            Inner::Oid => &Kind::Simple,
+            Inner::Tid => &Kind::Simple,
+            Inner::Xid => &Kind::Simple,
+            Inner::Cid => &Kind::Simple,
+            Inner::OidVector => &Kind::Array(Type(Inner::Oid)),
+            Inner::PgDdlCommand => &Kind::Pseudo,
+            Inner::Json => &Kind::Simple,
+            Inner::Xml => &Kind::Simple,
+            Inner::XmlArray => &Kind::Array(Type(Inner::Xml)),
+            Inner::PgNodeTree => &Kind::Simple,
+            Inner::JsonArray => &Kind::Array(Type(Inner::Json)),
+            Inner::TableAmHandler => &Kind::Pseudo,
+            Inner::Xid8Array => &Kind::Array(Type(Inner::Xid8)),
+            Inner::IndexAmHandler => &Kind::Pseudo,
+            Inner::Point => &Kind::Simple,
+            Inner::Lseg => &Kind::Simple,
+            Inner::Path => &Kind::Simple,
+            Inner::Box => &Kind::Simple,
+            Inner::Polygon => &Kind::Simple,
+            Inner::Line => &Kind::Simple,
+            Inner::LineArray => &Kind::Array(Type(Inner::Line)),
+            Inner::Cidr => &Kind::Simple,
+            Inner::CidrArray => &Kind::Array(Type(Inner::Cidr)),
+            Inner::Float4 => &Kind::Simple,
+            Inner::Float8 => &Kind::Simple,
+            Inner::Unknown => &Kind::Simple,
+            Inner::Circle => &Kind::Simple,
+            Inner::CircleArray => &Kind::Array(Type(Inner::Circle)),
+            Inner::Macaddr8 => &Kind::Simple,
+            Inner::Macaddr8Array => &Kind::Array(Type(Inner::Macaddr8)),
+            Inner::Money => &Kind::Simple,
+            Inner::MoneyArray => &Kind::Array(Type(Inner::Money)),
+            Inner::Macaddr => &Kind::Simple,
+            Inner::Inet => &Kind::Simple,
+            Inner::BoolArray => &Kind::Array(Type(Inner::Bool)),
+            Inner::ByteaArray => &Kind::Array(Type(Inner::Bytea)),
+            Inner::CharArray => &Kind::Array(Type(Inner::Char)),
+            Inner::NameArray => &Kind::Array(Type(Inner::Name)),
+            Inner::Int2Array => &Kind::Array(Type(Inner::Int2)),
+            Inner::Int2VectorArray => &Kind::Array(Type(Inner::Int2Vector)),
+            Inner::Int4Array => &Kind::Array(Type(Inner::Int4)),
+            Inner::RegprocArray => &Kind::Array(Type(Inner::Regproc)),
+            Inner::TextArray => &Kind::Array(Type(Inner::Text)),
+            Inner::TidArray => &Kind::Array(Type(Inner::Tid)),
+            Inner::XidArray => &Kind::Array(Type(Inner::Xid)),
+            Inner::CidArray => &Kind::Array(Type(Inner::Cid)),
+            Inner::OidVectorArray => &Kind::Array(Type(Inner::OidVector)),
+            Inner::BpcharArray => &Kind::Array(Type(Inner::Bpchar)),
+            Inner::VarcharArray => &Kind::Array(Type(Inner::Varchar)),
+            Inner::Int8Array => &Kind::Array(Type(Inner::Int8)),
+            Inner::PointArray => &Kind::Array(Type(Inner::Point)),
+            Inner::LsegArray => &Kind::Array(Type(Inner::Lseg)),
+            Inner::PathArray => &Kind::Array(Type(Inner::Path)),
+            Inner::BoxArray => &Kind::Array(Type(Inner::Box)),
+            Inner::Float4Array => &Kind::Array(Type(Inner::Float4)),
+            Inner::Float8Array => &Kind::Array(Type(Inner::Float8)),
+            Inner::PolygonArray => &Kind::Array(Type(Inner::Polygon)),
+            Inner::OidArray => &Kind::Array(Type(Inner::Oid)),
+            Inner::Aclitem => &Kind::Simple,
+            Inner::AclitemArray => &Kind::Array(Type(Inner::Aclitem)),
+            Inner::MacaddrArray => &Kind::Array(Type(Inner::Macaddr)),
+            Inner::InetArray => &Kind::Array(Type(Inner::Inet)),
+            Inner::Bpchar => &Kind::Simple,
+            Inner::Varchar => &Kind::Simple,
+            Inner::Date => &Kind::Simple,
+            Inner::Time => &Kind::Simple,
+            Inner::Timestamp => &Kind::Simple,
+            Inner::TimestampArray => &Kind::Array(Type(Inner::Timestamp)),
+            Inner::DateArray => &Kind::Array(Type(Inner::Date)),
+            Inner::TimeArray => &Kind::Array(Type(Inner::Time)),
+            Inner::Timestamptz => &Kind::Simple,
+            Inner::TimestamptzArray => &Kind::Array(Type(Inner::Timestamptz)),
+            Inner::Interval => &Kind::Simple,
+            Inner::IntervalArray => &Kind::Array(Type(Inner::Interval)),
+            Inner::NumericArray => &Kind::Array(Type(Inner::Numeric)),
+            Inner::CstringArray => &Kind::Array(Type(Inner::Cstring)),
+            Inner::Timetz => &Kind::Simple,
+            Inner::TimetzArray => &Kind::Array(Type(Inner::Timetz)),
+            Inner::Bit => &Kind::Simple,
+            Inner::BitArray => &Kind::Array(Type(Inner::Bit)),
+            Inner::Varbit => &Kind::Simple,
+            Inner::VarbitArray => &Kind::Array(Type(Inner::Varbit)),
+            Inner::Numeric => &Kind::Simple,
+            Inner::Refcursor => &Kind::Simple,
+            Inner::RefcursorArray => &Kind::Array(Type(Inner::Refcursor)),
+            Inner::Regprocedure => &Kind::Simple,
+            Inner::Regoper => &Kind::Simple,
+            Inner::Regoperator => &Kind::Simple,
+            Inner::Regclass => &Kind::Simple,
+            Inner::Regtype => &Kind::Simple,
+            Inner::RegprocedureArray => &Kind::Array(Type(Inner::Regprocedure)),
+            Inner::RegoperArray => &Kind::Array(Type(Inner::Regoper)),
+            Inner::RegoperatorArray => &Kind::Array(Type(Inner::Regoperator)),
+            Inner::RegclassArray => &Kind::Array(Type(Inner::Regclass)),
+            Inner::RegtypeArray => &Kind::Array(Type(Inner::Regtype)),
+            Inner::Record => &Kind::Pseudo,
+            Inner::Cstring => &Kind::Pseudo,
+            Inner::Any => &Kind::Pseudo,
+            Inner::Anyarray => &Kind::Pseudo,
+            Inner::Void => &Kind::Pseudo,
+            Inner::Trigger => &Kind::Pseudo,
+            Inner::LanguageHandler => &Kind::Pseudo,
+            Inner::Internal => &Kind::Pseudo,
+            Inner::Anyelement => &Kind::Pseudo,
+            Inner::RecordArray => &Kind::Pseudo,
+            Inner::Anynonarray => &Kind::Pseudo,
+            Inner::TxidSnapshotArray => &Kind::Array(Type(Inner::TxidSnapshot)),
+            Inner::Uuid => &Kind::Simple,
+            Inner::UuidArray => &Kind::Array(Type(Inner::Uuid)),
+            Inner::TxidSnapshot => &Kind::Simple,
+            Inner::FdwHandler => &Kind::Pseudo,
+            Inner::PgLsn => &Kind::Simple,
+            Inner::PgLsnArray => &Kind::Array(Type(Inner::PgLsn)),
+            Inner::TsmHandler => &Kind::Pseudo,
+            Inner::PgNdistinct => &Kind::Simple,
+            Inner::PgDependencies => &Kind::Simple,
+            Inner::Anyenum => &Kind::Pseudo,
+            Inner::TsVector => &Kind::Simple,
+            Inner::Tsquery => &Kind::Simple,
+            Inner::GtsVector => &Kind::Simple,
+            Inner::TsVectorArray => &Kind::Array(Type(Inner::TsVector)),
+            Inner::GtsVectorArray => &Kind::Array(Type(Inner::GtsVector)),
+            Inner::TsqueryArray => &Kind::Array(Type(Inner::Tsquery)),
+            Inner::Regconfig => &Kind::Simple,
+            Inner::RegconfigArray => &Kind::Array(Type(Inner::Regconfig)),
+            Inner::Regdictionary => &Kind::Simple,
+            Inner::RegdictionaryArray => &Kind::Array(Type(Inner::Regdictionary)),
+            Inner::Jsonb => &Kind::Simple,
+            Inner::JsonbArray => &Kind::Array(Type(Inner::Jsonb)),
+            Inner::AnyRange => &Kind::Pseudo,
+            Inner::EventTrigger => &Kind::Pseudo,
+            Inner::Int4Range => &Kind::Range(Type(Inner::Int4)),
+            Inner::Int4RangeArray => &Kind::Array(Type(Inner::Int4Range)),
+            Inner::NumRange => &Kind::Range(Type(Inner::Numeric)),
+            Inner::NumRangeArray => &Kind::Array(Type(Inner::NumRange)),
+            Inner::TsRange => &Kind::Range(Type(Inner::Timestamp)),
+            Inner::TsRangeArray => &Kind::Array(Type(Inner::TsRange)),
+            Inner::TstzRange => &Kind::Range(Type(Inner::Timestamptz)),
+            Inner::TstzRangeArray => &Kind::Array(Type(Inner::TstzRange)),
+            Inner::DateRange => &Kind::Range(Type(Inner::Date)),
+            Inner::DateRangeArray => &Kind::Array(Type(Inner::DateRange)),
+            Inner::Int8Range => &Kind::Range(Type(Inner::Int8)),
+            Inner::Int8RangeArray => &Kind::Array(Type(Inner::Int8Range)),
+            Inner::Jsonpath => &Kind::Simple,
+            Inner::JsonpathArray => &Kind::Array(Type(Inner::Jsonpath)),
+            Inner::Regnamespace => &Kind::Simple,
+            Inner::RegnamespaceArray => &Kind::Array(Type(Inner::Regnamespace)),
+            Inner::Regrole => &Kind::Simple,
+            Inner::RegroleArray => &Kind::Array(Type(Inner::Regrole)),
+            Inner::Regcollation => &Kind::Simple,
+            Inner::RegcollationArray => &Kind::Array(Type(Inner::Regcollation)),
+            Inner::Int4multiRange => &Kind::Multirange(Type(Inner::Int4)),
+            Inner::NummultiRange => &Kind::Multirange(Type(Inner::Numeric)),
+            Inner::TsmultiRange => &Kind::Multirange(Type(Inner::Timestamp)),
+            Inner::TstzmultiRange => &Kind::Multirange(Type(Inner::Timestamptz)),
+            Inner::DatemultiRange => &Kind::Multirange(Type(Inner::Date)),
+            Inner::Int8multiRange => &Kind::Multirange(Type(Inner::Int8)),
+            Inner::AnymultiRange => &Kind::Pseudo,
+            Inner::AnycompatiblemultiRange => &Kind::Pseudo,
+            Inner::PgBrinBloomSummary => &Kind::Simple,
+            Inner::PgBrinMinmaxMultiSummary => &Kind::Simple,
+            Inner::PgMcvList => &Kind::Simple,
+            Inner::PgSnapshot => &Kind::Simple,
+            Inner::PgSnapshotArray => &Kind::Array(Type(Inner::PgSnapshot)),
+            Inner::Xid8 => &Kind::Simple,
+            Inner::Anycompatible => &Kind::Pseudo,
+            Inner::Anycompatiblearray => &Kind::Pseudo,
+            Inner::Anycompatiblenonarray => &Kind::Pseudo,
+            Inner::AnycompatibleRange => &Kind::Pseudo,
+            Inner::Int4multiRangeArray => &Kind::Array(Type(Inner::Int4multiRange)),
+            Inner::NummultiRangeArray => &Kind::Array(Type(Inner::NummultiRange)),
+            Inner::TsmultiRangeArray => &Kind::Array(Type(Inner::TsmultiRange)),
+            Inner::TstzmultiRangeArray => &Kind::Array(Type(Inner::TstzmultiRange)),
+            Inner::DatemultiRangeArray => &Kind::Array(Type(Inner::DatemultiRange)),
+            Inner::Int8multiRangeArray => &Kind::Array(Type(Inner::Int8multiRange)),
+            Inner::Other(ref u) => &u.kind,
+        }
+    }
+
+    pub fn name(&self) -> &str {
+        match *self {
+            Inner::Bool => "bool",
+            Inner::Bytea => "bytea",
+            Inner::Char => "char",
+            Inner::Name => "name",
+            Inner::Int8 => "int8",
+            Inner::Int2 => "int2",
+            Inner::Int2Vector => "int2vector",
+            Inner::Int4 => "int4",
+            Inner::Regproc => "regproc",
+            Inner::Text => "text",
+            Inner::Oid => "oid",
+            Inner::Tid => "tid",
+            Inner::Xid => "xid",
+            Inner::Cid => "cid",
+            Inner::OidVector => "oidvector",
+            Inner::PgDdlCommand => "pg_ddl_command",
+            Inner::Json => "json",
+            Inner::Xml => "xml",
+            Inner::XmlArray => "_xml",
+            Inner::PgNodeTree => "pg_node_tree",
+            Inner::JsonArray => "_json",
+            Inner::TableAmHandler => "table_am_handler",
+            Inner::Xid8Array => "_xid8",
+            Inner::IndexAmHandler => "index_am_handler",
+            Inner::Point => "point",
+            Inner::Lseg => "lseg",
+            Inner::Path => "path",
+            Inner::Box => "box",
+            Inner::Polygon => "polygon",
+            Inner::Line => "line",
+            Inner::LineArray => "_line",
+            Inner::Cidr => "cidr",
+            Inner::CidrArray => "_cidr",
+            Inner::Float4 => "float4",
+            Inner::Float8 => "float8",
+            Inner::Unknown => "unknown",
+            Inner::Circle => "circle",
+            Inner::CircleArray => "_circle",
+            Inner::Macaddr8 => "macaddr8",
+            Inner::Macaddr8Array => "_macaddr8",
+            Inner::Money => "money",
+            Inner::MoneyArray => "_money",
+            Inner::Macaddr => "macaddr",
+            Inner::Inet => "inet",
+            Inner::BoolArray => "_bool",
+            Inner::ByteaArray => "_bytea",
+            Inner::CharArray => "_char",
+            Inner::NameArray => "_name",
+            Inner::Int2Array => "_int2",
+            Inner::Int2VectorArray => "_int2vector",
+            Inner::Int4Array => "_int4",
+            Inner::RegprocArray => "_regproc",
+            Inner::TextArray => "_text",
+            Inner::TidArray => "_tid",
+            Inner::XidArray => "_xid",
+            Inner::CidArray => "_cid",
+            Inner::OidVectorArray => "_oidvector",
+            Inner::BpcharArray => "_bpchar",
+            Inner::VarcharArray => "_varchar",
+            Inner::Int8Array => "_int8",
+            Inner::PointArray => "_point",
+            Inner::LsegArray => "_lseg",
+            Inner::PathArray => "_path",
+            Inner::BoxArray => "_box",
+            Inner::Float4Array => "_float4",
+            Inner::Float8Array => "_float8",
+            Inner::PolygonArray => "_polygon",
+            Inner::OidArray => "_oid",
+            Inner::Aclitem => "aclitem",
+            Inner::AclitemArray => "_aclitem",
+            Inner::MacaddrArray => "_macaddr",
+            Inner::InetArray => "_inet",
+            Inner::Bpchar => "bpchar",
+            Inner::Varchar => "varchar",
+            Inner::Date => "date",
+            Inner::Time => "time",
+            Inner::Timestamp => "timestamp",
+            Inner::TimestampArray => "_timestamp",
+            Inner::DateArray => "_date",
+            Inner::TimeArray => "_time",
+            Inner::Timestamptz => "timestamptz",
+            Inner::TimestamptzArray => "_timestamptz",
+            Inner::Interval => "interval",
+            Inner::IntervalArray => "_interval",
+            Inner::NumericArray => "_numeric",
+            Inner::CstringArray => "_cstring",
+            Inner::Timetz => "timetz",
+            Inner::TimetzArray => "_timetz",
+            Inner::Bit => "bit",
+            Inner::BitArray => "_bit",
+            Inner::Varbit => "varbit",
+            Inner::VarbitArray => "_varbit",
+            Inner::Numeric => "numeric",
+            Inner::Refcursor => "refcursor",
+            Inner::RefcursorArray => "_refcursor",
+            Inner::Regprocedure => "regprocedure",
+            Inner::Regoper => "regoper",
+            Inner::Regoperator => "regoperator",
+            Inner::Regclass => "regclass",
+            Inner::Regtype => "regtype",
+            Inner::RegprocedureArray => "_regprocedure",
+            Inner::RegoperArray => "_regoper",
+            Inner::RegoperatorArray => "_regoperator",
+            Inner::RegclassArray => "_regclass",
+            Inner::RegtypeArray => "_regtype",
+            Inner::Record => "record",
+            Inner::Cstring => "cstring",
+            Inner::Any => "any",
+            Inner::Anyarray => "anyarray",
+            Inner::Void => "void",
+            Inner::Trigger => "trigger",
+            Inner::LanguageHandler => "language_handler",
+            Inner::Internal => "internal",
+            Inner::Anyelement => "anyelement",
+            Inner::RecordArray => "_record",
+            Inner::Anynonarray => "anynonarray",
+            Inner::TxidSnapshotArray => "_txid_snapshot",
+            Inner::Uuid => "uuid",
+            Inner::UuidArray => "_uuid",
+            Inner::TxidSnapshot => "txid_snapshot",
+            Inner::FdwHandler => "fdw_handler",
+            Inner::PgLsn => "pg_lsn",
+            Inner::PgLsnArray => "_pg_lsn",
+            Inner::TsmHandler => "tsm_handler",
+            Inner::PgNdistinct => "pg_ndistinct",
+            Inner::PgDependencies => "pg_dependencies",
+            Inner::Anyenum => "anyenum",
+            Inner::TsVector => "tsvector",
+            Inner::Tsquery => "tsquery",
+            Inner::GtsVector => "gtsvector",
+            Inner::TsVectorArray => "_tsvector",
+            Inner::GtsVectorArray => "_gtsvector",
+            Inner::TsqueryArray => "_tsquery",
+            Inner::Regconfig => "regconfig",
+            Inner::RegconfigArray => "_regconfig",
+            Inner::Regdictionary => "regdictionary",
+            Inner::RegdictionaryArray => "_regdictionary",
+            Inner::Jsonb => "jsonb",
+            Inner::JsonbArray => "_jsonb",
+            Inner::AnyRange => "anyrange",
+            Inner::EventTrigger => "event_trigger",
+            Inner::Int4Range => "int4range",
+            Inner::Int4RangeArray => "_int4range",
+            Inner::NumRange => "numrange",
+            Inner::NumRangeArray => "_numrange",
+            Inner::TsRange => "tsrange",
+            Inner::TsRangeArray => "_tsrange",
+            Inner::TstzRange => "tstzrange",
+            Inner::TstzRangeArray => "_tstzrange",
+            Inner::DateRange => "daterange",
+            Inner::DateRangeArray => "_daterange",
+            Inner::Int8Range => "int8range",
+            Inner::Int8RangeArray => "_int8range",
+            Inner::Jsonpath => "jsonpath",
+            Inner::JsonpathArray => "_jsonpath",
+            Inner::Regnamespace => "regnamespace",
+            Inner::RegnamespaceArray => "_regnamespace",
+            Inner::Regrole => "regrole",
+            Inner::RegroleArray => "_regrole",
+            Inner::Regcollation => "regcollation",
+            Inner::RegcollationArray => "_regcollation",
+            Inner::Int4multiRange => "int4multirange",
+            Inner::NummultiRange => "nummultirange",
+            Inner::TsmultiRange => "tsmultirange",
+            Inner::TstzmultiRange => "tstzmultirange",
+            Inner::DatemultiRange => "datemultirange",
+            Inner::Int8multiRange => "int8multirange",
+            Inner::AnymultiRange => "anymultirange",
+            Inner::AnycompatiblemultiRange => "anycompatiblemultirange",
+            Inner::PgBrinBloomSummary => "pg_brin_bloom_summary",
+            Inner::PgBrinMinmaxMultiSummary => "pg_brin_minmax_multi_summary",
+            Inner::PgMcvList => "pg_mcv_list",
+            Inner::PgSnapshot => "pg_snapshot",
+            Inner::PgSnapshotArray => "_pg_snapshot",
+            Inner::Xid8 => "xid8",
+            Inner::Anycompatible => "anycompatible",
+            Inner::Anycompatiblearray => "anycompatiblearray",
+            Inner::Anycompatiblenonarray => "anycompatiblenonarray",
+            Inner::AnycompatibleRange => "anycompatiblerange",
+            Inner::Int4multiRangeArray => "_int4multirange",
+            Inner::NummultiRangeArray => "_nummultirange",
+            Inner::TsmultiRangeArray => "_tsmultirange",
+            Inner::TstzmultiRangeArray => "_tstzmultirange",
+            Inner::DatemultiRangeArray => "_datemultirange",
+            Inner::Int8multiRangeArray => "_int8multirange",
+            Inner::Other(ref u) => &u.name,
+        }
+    }
+}
+impl Type {
+    /// BOOL - boolean, &#39;true&#39;/&#39;false&#39;
+    pub const BOOL: Type = Type(Inner::Bool);
+
+    /// BYTEA - variable-length string, binary values escaped
+    pub const BYTEA: Type = Type(Inner::Bytea);
+
+    /// CHAR - single character
+    pub const CHAR: Type = Type(Inner::Char);
+
+    /// NAME - 63-byte type for storing system identifiers
+    pub const NAME: Type = Type(Inner::Name);
+
+    /// INT8 - ~18 digit integer, 8-byte storage
+    pub const INT8: Type = Type(Inner::Int8);
+
+    /// INT2 - -32 thousand to 32 thousand, 2-byte storage
+    pub const INT2: Type = Type(Inner::Int2);
+
+    /// INT2VECTOR - array of int2, used in system tables
+    pub const INT2_VECTOR: Type = Type(Inner::Int2Vector);
+
+    /// INT4 - -2 billion to 2 billion integer, 4-byte storage
+    pub const INT4: Type = Type(Inner::Int4);
+
+    /// REGPROC - registered procedure
+    pub const REGPROC: Type = Type(Inner::Regproc);
+
+    /// TEXT - variable-length string, no limit specified
+    pub const TEXT: Type = Type(Inner::Text);
+
+    /// OID - object identifier&#40;oid&#41;, maximum 4 billion
+    pub const OID: Type = Type(Inner::Oid);
+
+    /// TID - &#40;block, offset&#41;, physical location of tuple
+    pub const TID: Type = Type(Inner::Tid);
+
+    /// XID - transaction id
+    pub const XID: Type = Type(Inner::Xid);
+
+    /// CID - command identifier type, sequence in transaction id
+    pub const CID: Type = Type(Inner::Cid);
+
+    /// OIDVECTOR - array of oids, used in system tables
+    pub const OID_VECTOR: Type = Type(Inner::OidVector);
+
+    /// PG_DDL_COMMAND - internal type for passing CollectedCommand
+    pub const PG_DDL_COMMAND: Type = Type(Inner::PgDdlCommand);
+
+    /// JSON - JSON stored as text
+    pub const JSON: Type = Type(Inner::Json);
+
+    /// XML - XML content
+    pub const XML: Type = Type(Inner::Xml);
+
+    /// XML&#91;&#93;
+    pub const XML_ARRAY: Type = Type(Inner::XmlArray);
+
+    /// PG_NODE_TREE - string representing an internal node tree
+    pub const PG_NODE_TREE: Type = Type(Inner::PgNodeTree);
+
+    /// JSON&#91;&#93;
+    pub const JSON_ARRAY: Type = Type(Inner::JsonArray);
+
+    /// TABLE_AM_HANDLER
+    pub const TABLE_AM_HANDLER: Type = Type(Inner::TableAmHandler);
+
+    /// XID8&#91;&#93;
+    pub const XID8_ARRAY: Type = Type(Inner::Xid8Array);
+
+    /// INDEX_AM_HANDLER - pseudo-type for the result of an index AM handler function
+    pub const INDEX_AM_HANDLER: Type = Type(Inner::IndexAmHandler);
+
+    /// POINT - geometric point &#39;&#40;x, y&#41;&#39;
+    pub const POINT: Type = Type(Inner::Point);
+
+    /// LSEG - geometric line segment &#39;&#40;pt1,pt2&#41;&#39;
+    pub const LSEG: Type = Type(Inner::Lseg);
+
+    /// PATH - geometric path &#39;&#40;pt1,...&#41;&#39;
+    pub const PATH: Type = Type(Inner::Path);
+
+    /// BOX - geometric box &#39;&#40;lower left,upper right&#41;&#39;
+    pub const BOX: Type = Type(Inner::Box);
+
+    /// POLYGON - geometric polygon &#39;&#40;pt1,...&#41;&#39;
+    pub const POLYGON: Type = Type(Inner::Polygon);
+
+    /// LINE - geometric line
+    pub const LINE: Type = Type(Inner::Line);
+
+    /// LINE&#91;&#93;
+    pub const LINE_ARRAY: Type = Type(Inner::LineArray);
+
+    /// CIDR - network IP address/netmask, network address
+    pub const CIDR: Type = Type(Inner::Cidr);
+
+    /// CIDR&#91;&#93;
+    pub const CIDR_ARRAY: Type = Type(Inner::CidrArray);
+
+    /// FLOAT4 - single-precision floating point number, 4-byte storage
+    pub const FLOAT4: Type = Type(Inner::Float4);
+
+    /// FLOAT8 - double-precision floating point number, 8-byte storage
+    pub const FLOAT8: Type = Type(Inner::Float8);
+
+    /// UNKNOWN - pseudo-type representing an undetermined type
+    pub const UNKNOWN: Type = Type(Inner::Unknown);
+
+    /// CIRCLE - geometric circle &#39;&#40;center,radius&#41;&#39;
+    pub const CIRCLE: Type = Type(Inner::Circle);
+
+    /// CIRCLE&#91;&#93;
+    pub const CIRCLE_ARRAY: Type = Type(Inner::CircleArray);
+
+    /// MACADDR8 - XX:XX:XX:XX:XX:XX:XX:XX, MAC address
+    pub const MACADDR8: Type = Type(Inner::Macaddr8);
+
+    /// MACADDR8&#91;&#93;
+    pub const MACADDR8_ARRAY: Type = Type(Inner::Macaddr8Array);
+
+    /// MONEY - monetary amounts, &#36;d,ddd.cc
+    pub const MONEY: Type = Type(Inner::Money);
+
+    /// MONEY&#91;&#93;
+    pub const MONEY_ARRAY: Type = Type(Inner::MoneyArray);
+
+    /// MACADDR - XX:XX:XX:XX:XX:XX, MAC address
+    pub const MACADDR: Type = Type(Inner::Macaddr);
+
+    /// INET - IP address/netmask, host address, netmask optional
+    pub const INET: Type = Type(Inner::Inet);
+
+    /// BOOL&#91;&#93;
+    pub const BOOL_ARRAY: Type = Type(Inner::BoolArray);
+
+    /// BYTEA&#91;&#93;
+    pub const BYTEA_ARRAY: Type = Type(Inner::ByteaArray);
+
+    /// CHAR&#91;&#93;
+    pub const CHAR_ARRAY: Type = Type(Inner::CharArray);
+
+    /// NAME&#91;&#93;
+    pub const NAME_ARRAY: Type = Type(Inner::NameArray);
+
+    /// INT2&#91;&#93;
+    pub const INT2_ARRAY: Type = Type(Inner::Int2Array);
+
+    /// INT2VECTOR&#91;&#93;
+    pub const INT2_VECTOR_ARRAY: Type = Type(Inner::Int2VectorArray);
+
+    /// INT4&#91;&#93;
+    pub const INT4_ARRAY: Type = Type(Inner::Int4Array);
+
+    /// REGPROC&#91;&#93;
+    pub const REGPROC_ARRAY: Type = Type(Inner::RegprocArray);
+
+    /// TEXT&#91;&#93;
+    pub const TEXT_ARRAY: Type = Type(Inner::TextArray);
+
+    /// TID&#91;&#93;
+    pub const TID_ARRAY: Type = Type(Inner::TidArray);
+
+    /// XID&#91;&#93;
+    pub const XID_ARRAY: Type = Type(Inner::XidArray);
+
+    /// CID&#91;&#93;
+    pub const CID_ARRAY: Type = Type(Inner::CidArray);
+
+    /// OIDVECTOR&#91;&#93;
+    pub const OID_VECTOR_ARRAY: Type = Type(Inner::OidVectorArray);
+
+    /// BPCHAR&#91;&#93;
+    pub const BPCHAR_ARRAY: Type = Type(Inner::BpcharArray);
+
+    /// VARCHAR&#91;&#93;
+    pub const VARCHAR_ARRAY: Type = Type(Inner::VarcharArray);
+
+    /// INT8&#91;&#93;
+    pub const INT8_ARRAY: Type = Type(Inner::Int8Array);
+
+    /// POINT&#91;&#93;
+    pub const POINT_ARRAY: Type = Type(Inner::PointArray);
+
+    /// LSEG&#91;&#93;
+    pub const LSEG_ARRAY: Type = Type(Inner::LsegArray);
+
+    /// PATH&#91;&#93;
+    pub const PATH_ARRAY: Type = Type(Inner::PathArray);
+
+    /// BOX&#91;&#93;
+    pub const BOX_ARRAY: Type = Type(Inner::BoxArray);
+
+    /// FLOAT4&#91;&#93;
+    pub const FLOAT4_ARRAY: Type = Type(Inner::Float4Array);
+
+    /// FLOAT8&#91;&#93;
+    pub const FLOAT8_ARRAY: Type = Type(Inner::Float8Array);
+
+    /// POLYGON&#91;&#93;
+    pub const POLYGON_ARRAY: Type = Type(Inner::PolygonArray);
+
+    /// OID&#91;&#93;
+    pub const OID_ARRAY: Type = Type(Inner::OidArray);
+
+    /// ACLITEM - access control list
+    pub const ACLITEM: Type = Type(Inner::Aclitem);
+
+    /// ACLITEM&#91;&#93;
+    pub const ACLITEM_ARRAY: Type = Type(Inner::AclitemArray);
+
+    /// MACADDR&#91;&#93;
+    pub const MACADDR_ARRAY: Type = Type(Inner::MacaddrArray);
+
+    /// INET&#91;&#93;
+    pub const INET_ARRAY: Type = Type(Inner::InetArray);
+
+    /// BPCHAR - char&#40;length&#41;, blank-padded string, fixed storage length
+    pub const BPCHAR: Type = Type(Inner::Bpchar);
+
+    /// VARCHAR - varchar&#40;length&#41;, non-blank-padded string, variable storage length
+    pub const VARCHAR: Type = Type(Inner::Varchar);
+
+    /// DATE - date
+    pub const DATE: Type = Type(Inner::Date);
+
+    /// TIME - time of day
+    pub const TIME: Type = Type(Inner::Time);
+
+    /// TIMESTAMP - date and time
+    pub const TIMESTAMP: Type = Type(Inner::Timestamp);
+
+    /// TIMESTAMP&#91;&#93;
+    pub const TIMESTAMP_ARRAY: Type = Type(Inner::TimestampArray);
+
+    /// DATE&#91;&#93;
+    pub const DATE_ARRAY: Type = Type(Inner::DateArray);
+
+    /// TIME&#91;&#93;
+    pub const TIME_ARRAY: Type = Type(Inner::TimeArray);
+
+    /// TIMESTAMPTZ - date and time with time zone
+    pub const TIMESTAMPTZ: Type = Type(Inner::Timestamptz);
+
+    /// TIMESTAMPTZ&#91;&#93;
+    pub const TIMESTAMPTZ_ARRAY: Type = Type(Inner::TimestamptzArray);
+
+    /// INTERVAL - &#64; &lt;number&gt; &lt;units&gt;, time interval
+    pub const INTERVAL: Type = Type(Inner::Interval);
+
+    /// INTERVAL&#91;&#93;
+    pub const INTERVAL_ARRAY: Type = Type(Inner::IntervalArray);
+
+    /// NUMERIC&#91;&#93;
+    pub const NUMERIC_ARRAY: Type = Type(Inner::NumericArray);
+
+    /// CSTRING&#91;&#93;
+    pub const CSTRING_ARRAY: Type = Type(Inner::CstringArray);
+
+    /// TIMETZ - time of day with time zone
+    pub const TIMETZ: Type = Type(Inner::Timetz);
+
+    /// TIMETZ&#91;&#93;
+    pub const TIMETZ_ARRAY: Type = Type(Inner::TimetzArray);
+
+    /// BIT - fixed-length bit string
+    pub const BIT: Type = Type(Inner::Bit);
+
+    /// BIT&#91;&#93;
+    pub const BIT_ARRAY: Type = Type(Inner::BitArray);
+
+    /// VARBIT - variable-length bit string
+    pub const VARBIT: Type = Type(Inner::Varbit);
+
+    /// VARBIT&#91;&#93;
+    pub const VARBIT_ARRAY: Type = Type(Inner::VarbitArray);
+
+    /// NUMERIC - numeric&#40;precision, decimal&#41;, arbitrary precision number
+    pub const NUMERIC: Type = Type(Inner::Numeric);
+
+    /// REFCURSOR - reference to cursor &#40;portal name&#41;
+    pub const REFCURSOR: Type = Type(Inner::Refcursor);
+
+    /// REFCURSOR&#91;&#93;
+    pub const REFCURSOR_ARRAY: Type = Type(Inner::RefcursorArray);
+
+    /// REGPROCEDURE - registered procedure &#40;with args&#41;
+    pub const REGPROCEDURE: Type = Type(Inner::Regprocedure);
+
+    /// REGOPER - registered operator
+    pub const REGOPER: Type = Type(Inner::Regoper);
+
+    /// REGOPERATOR - registered operator &#40;with args&#41;
+    pub const REGOPERATOR: Type = Type(Inner::Regoperator);
+
+    /// REGCLASS - registered class
+    pub const REGCLASS: Type = Type(Inner::Regclass);
+
+    /// REGTYPE - registered type
+    pub const REGTYPE: Type = Type(Inner::Regtype);
+
+    /// REGPROCEDURE&#91;&#93;
+    pub const REGPROCEDURE_ARRAY: Type = Type(Inner::RegprocedureArray);
+
+    /// REGOPER&#91;&#93;
+    pub const REGOPER_ARRAY: Type = Type(Inner::RegoperArray);
+
+    /// REGOPERATOR&#91;&#93;
+    pub const REGOPERATOR_ARRAY: Type = Type(Inner::RegoperatorArray);
+
+    /// REGCLASS&#91;&#93;
+    pub const REGCLASS_ARRAY: Type = Type(Inner::RegclassArray);
+
+    /// REGTYPE&#91;&#93;
+    pub const REGTYPE_ARRAY: Type = Type(Inner::RegtypeArray);
+
+    /// RECORD - pseudo-type representing any composite type
+    pub const RECORD: Type = Type(Inner::Record);
+
+    /// CSTRING - C-style string
+    pub const CSTRING: Type = Type(Inner::Cstring);
+
+    /// ANY - pseudo-type representing any type
+    pub const ANY: Type = Type(Inner::Any);
+
+    /// ANYARRAY - pseudo-type representing a polymorphic array type
+    pub const ANYARRAY: Type = Type(Inner::Anyarray);
+
+    /// VOID - pseudo-type for the result of a function with no real result
+    pub const VOID: Type = Type(Inner::Void);
+
+    /// TRIGGER - pseudo-type for the result of a trigger function
+    pub const TRIGGER: Type = Type(Inner::Trigger);
+
+    /// LANGUAGE_HANDLER - pseudo-type for the result of a language handler function
+    pub const LANGUAGE_HANDLER: Type = Type(Inner::LanguageHandler);
+
+    /// INTERNAL - pseudo-type representing an internal data structure
+    pub const INTERNAL: Type = Type(Inner::Internal);
+
+    /// ANYELEMENT - pseudo-type representing a polymorphic base type
+    pub const ANYELEMENT: Type = Type(Inner::Anyelement);
+
+    /// RECORD&#91;&#93;
+    pub const RECORD_ARRAY: Type = Type(Inner::RecordArray);
+
+    /// ANYNONARRAY - pseudo-type representing a polymorphic base type that is not an array
+    pub const ANYNONARRAY: Type = Type(Inner::Anynonarray);
+
+    /// TXID_SNAPSHOT&#91;&#93;
+    pub const TXID_SNAPSHOT_ARRAY: Type = Type(Inner::TxidSnapshotArray);
+
+    /// UUID - UUID datatype
+    pub const UUID: Type = Type(Inner::Uuid);
+
+    /// UUID&#91;&#93;
+    pub const UUID_ARRAY: Type = Type(Inner::UuidArray);
+
+    /// TXID_SNAPSHOT - txid snapshot
+    pub const TXID_SNAPSHOT: Type = Type(Inner::TxidSnapshot);
+
+    /// FDW_HANDLER - pseudo-type for the result of an FDW handler function
+    pub const FDW_HANDLER: Type = Type(Inner::FdwHandler);
+
+    /// PG_LSN - PostgreSQL LSN datatype
+    pub const PG_LSN: Type = Type(Inner::PgLsn);
+
+    /// PG_LSN&#91;&#93;
+    pub const PG_LSN_ARRAY: Type = Type(Inner::PgLsnArray);
+
+    /// TSM_HANDLER - pseudo-type for the result of a tablesample method function
+    pub const TSM_HANDLER: Type = Type(Inner::TsmHandler);
+
+    /// PG_NDISTINCT - multivariate ndistinct coefficients
+    pub const PG_NDISTINCT: Type = Type(Inner::PgNdistinct);
+
+    /// PG_DEPENDENCIES - multivariate dependencies
+    pub const PG_DEPENDENCIES: Type = Type(Inner::PgDependencies);
+
+    /// ANYENUM - pseudo-type representing a polymorphic base type that is an enum
+    pub const ANYENUM: Type = Type(Inner::Anyenum);
+
+    /// TSVECTOR - text representation for text search
+    pub const TS_VECTOR: Type = Type(Inner::TsVector);
+
+    /// TSQUERY - query representation for text search
+    pub const TSQUERY: Type = Type(Inner::Tsquery);
+
+    /// GTSVECTOR - GiST index internal text representation for text search
+    pub const GTS_VECTOR: Type = Type(Inner::GtsVector);
+
+    /// TSVECTOR&#91;&#93;
+    pub const TS_VECTOR_ARRAY: Type = Type(Inner::TsVectorArray);
+
+    /// GTSVECTOR&#91;&#93;
+    pub const GTS_VECTOR_ARRAY: Type = Type(Inner::GtsVectorArray);
+
+    /// TSQUERY&#91;&#93;
+    pub const TSQUERY_ARRAY: Type = Type(Inner::TsqueryArray);
+
+    /// REGCONFIG - registered text search configuration
+    pub const REGCONFIG: Type = Type(Inner::Regconfig);
+
+    /// REGCONFIG&#91;&#93;
+    pub const REGCONFIG_ARRAY: Type = Type(Inner::RegconfigArray);
+
+    /// REGDICTIONARY - registered text search dictionary
+    pub const REGDICTIONARY: Type = Type(Inner::Regdictionary);
+
+    /// REGDICTIONARY&#91;&#93;
+    pub const REGDICTIONARY_ARRAY: Type = Type(Inner::RegdictionaryArray);
+
+    /// JSONB - Binary JSON
+    pub const JSONB: Type = Type(Inner::Jsonb);
+
+    /// JSONB&#91;&#93;
+    pub const JSONB_ARRAY: Type = Type(Inner::JsonbArray);
+
+    /// ANYRANGE - pseudo-type representing a range over a polymorphic base type
+    pub const ANY_RANGE: Type = Type(Inner::AnyRange);
+
+    /// EVENT_TRIGGER - pseudo-type for the result of an event trigger function
+    pub const EVENT_TRIGGER: Type = Type(Inner::EventTrigger);
+
+    /// INT4RANGE - range of integers
+    pub const INT4_RANGE: Type = Type(Inner::Int4Range);
+
+    /// INT4RANGE&#91;&#93;
+    pub const INT4_RANGE_ARRAY: Type = Type(Inner::Int4RangeArray);
+
+    /// NUMRANGE - range of numerics
+    pub const NUM_RANGE: Type = Type(Inner::NumRange);
+
+    /// NUMRANGE&#91;&#93;
+    pub const NUM_RANGE_ARRAY: Type = Type(Inner::NumRangeArray);
+
+    /// TSRANGE - range of timestamps without time zone
+    pub const TS_RANGE: Type = Type(Inner::TsRange);
+
+    /// TSRANGE&#91;&#93;
+    pub const TS_RANGE_ARRAY: Type = Type(Inner::TsRangeArray);
+
+    /// TSTZRANGE - range of timestamps with time zone
+    pub const TSTZ_RANGE: Type = Type(Inner::TstzRange);
+
+    /// TSTZRANGE&#91;&#93;
+    pub const TSTZ_RANGE_ARRAY: Type = Type(Inner::TstzRangeArray);
+
+    /// DATERANGE - range of dates
+    pub const DATE_RANGE: Type = Type(Inner::DateRange);
+
+    /// DATERANGE&#91;&#93;
+    pub const DATE_RANGE_ARRAY: Type = Type(Inner::DateRangeArray);
+
+    /// INT8RANGE - range of bigints
+    pub const INT8_RANGE: Type = Type(Inner::Int8Range);
+
+    /// INT8RANGE&#91;&#93;
+    pub const INT8_RANGE_ARRAY: Type = Type(Inner::Int8RangeArray);
+
+    /// JSONPATH - JSON path
+    pub const JSONPATH: Type = Type(Inner::Jsonpath);
+
+    /// JSONPATH&#91;&#93;
+    pub const JSONPATH_ARRAY: Type = Type(Inner::JsonpathArray);
+
+    /// REGNAMESPACE - registered namespace
+    pub const REGNAMESPACE: Type = Type(Inner::Regnamespace);
+
+    /// REGNAMESPACE&#91;&#93;
+    pub const REGNAMESPACE_ARRAY: Type = Type(Inner::RegnamespaceArray);
+
+    /// REGROLE - registered role
+    pub const REGROLE: Type = Type(Inner::Regrole);
+
+    /// REGROLE&#91;&#93;
+    pub const REGROLE_ARRAY: Type = Type(Inner::RegroleArray);
+
+    /// REGCOLLATION - registered collation
+    pub const REGCOLLATION: Type = Type(Inner::Regcollation);
+
+    /// REGCOLLATION&#91;&#93;
+    pub const REGCOLLATION_ARRAY: Type = Type(Inner::RegcollationArray);
+
+    /// INT4MULTIRANGE - multirange of integers
+    pub const INT4MULTI_RANGE: Type = Type(Inner::Int4multiRange);
+
+    /// NUMMULTIRANGE - multirange of numerics
+    pub const NUMMULTI_RANGE: Type = Type(Inner::NummultiRange);
+
+    /// TSMULTIRANGE - multirange of timestamps without time zone
+    pub const TSMULTI_RANGE: Type = Type(Inner::TsmultiRange);
+
+    /// TSTZMULTIRANGE - multirange of timestamps with time zone
+    pub const TSTZMULTI_RANGE: Type = Type(Inner::TstzmultiRange);
+
+    /// DATEMULTIRANGE - multirange of dates
+    pub const DATEMULTI_RANGE: Type = Type(Inner::DatemultiRange);
+
+    /// INT8MULTIRANGE - multirange of bigints
+    pub const INT8MULTI_RANGE: Type = Type(Inner::Int8multiRange);
+
+    /// ANYMULTIRANGE - pseudo-type representing a polymorphic base type that is a multirange
+    pub const ANYMULTI_RANGE: Type = Type(Inner::AnymultiRange);
+
+    /// ANYCOMPATIBLEMULTIRANGE - pseudo-type representing a multirange over a polymorphic common type
+    pub const ANYCOMPATIBLEMULTI_RANGE: Type = Type(Inner::AnycompatiblemultiRange);
+
+    /// PG_BRIN_BLOOM_SUMMARY - BRIN bloom summary
+    pub const PG_BRIN_BLOOM_SUMMARY: Type = Type(Inner::PgBrinBloomSummary);
+
+    /// PG_BRIN_MINMAX_MULTI_SUMMARY - BRIN minmax-multi summary
+    pub const PG_BRIN_MINMAX_MULTI_SUMMARY: Type = Type(Inner::PgBrinMinmaxMultiSummary);
+
+    /// PG_MCV_LIST - multivariate MCV list
+    pub const PG_MCV_LIST: Type = Type(Inner::PgMcvList);
+
+    /// PG_SNAPSHOT - snapshot
+    pub const PG_SNAPSHOT: Type = Type(Inner::PgSnapshot);
+
+    /// PG_SNAPSHOT&#91;&#93;
+    pub const PG_SNAPSHOT_ARRAY: Type = Type(Inner::PgSnapshotArray);
+
+    /// XID8 - full transaction id
+    pub const XID8: Type = Type(Inner::Xid8);
+
+    /// ANYCOMPATIBLE - pseudo-type representing a polymorphic common type
+    pub const ANYCOMPATIBLE: Type = Type(Inner::Anycompatible);
+
+    /// ANYCOMPATIBLEARRAY - pseudo-type representing an array of polymorphic common type elements
+    pub const ANYCOMPATIBLEARRAY: Type = Type(Inner::Anycompatiblearray);
+
+    /// ANYCOMPATIBLENONARRAY - pseudo-type representing a polymorphic common type that is not an array
+    pub const ANYCOMPATIBLENONARRAY: Type = Type(Inner::Anycompatiblenonarray);
+
+    /// ANYCOMPATIBLERANGE - pseudo-type representing a range over a polymorphic common type
+    pub const ANYCOMPATIBLE_RANGE: Type = Type(Inner::AnycompatibleRange);
+
+    /// INT4MULTIRANGE&#91;&#93;
+    pub const INT4MULTI_RANGE_ARRAY: Type = Type(Inner::Int4multiRangeArray);
+
+    /// NUMMULTIRANGE&#91;&#93;
+    pub const NUMMULTI_RANGE_ARRAY: Type = Type(Inner::NummultiRangeArray);
+
+    /// TSMULTIRANGE&#91;&#93;
+    pub const TSMULTI_RANGE_ARRAY: Type = Type(Inner::TsmultiRangeArray);
+
+    /// TSTZMULTIRANGE&#91;&#93;
+    pub const TSTZMULTI_RANGE_ARRAY: Type = Type(Inner::TstzmultiRangeArray);
+
+    /// DATEMULTIRANGE&#91;&#93;
+    pub const DATEMULTI_RANGE_ARRAY: Type = Type(Inner::DatemultiRangeArray);
+
+    /// INT8MULTIRANGE&#91;&#93;
+    pub const INT8MULTI_RANGE_ARRAY: Type = Type(Inner::Int8multiRangeArray);
+}
diff --git a/libs/proxy/tokio-postgres2/Cargo.toml b/libs/proxy/tokio-postgres2/Cargo.toml
new file mode 100644
index 0000000000..7130c1b726
--- /dev/null
+++ b/libs/proxy/tokio-postgres2/Cargo.toml
@@ -0,0 +1,21 @@
+[package]
+name = "tokio-postgres2"
+version = "0.1.0"
+edition = "2018"
+license = "MIT/Apache-2.0"
+
+[dependencies]
+async-trait.workspace = true
+bytes.workspace = true
+byteorder.workspace = true
+fallible-iterator.workspace = true
+futures-util = { workspace = true, features = ["sink"] }
+log = "0.4"
+parking_lot.workspace = true
+percent-encoding = "2.0"
+pin-project-lite.workspace = true
+phf = "0.11"
+postgres-protocol2 = { path = "../postgres-protocol2" }
+postgres-types2 = { path = "../postgres-types2" }
+tokio = { workspace = true, features = ["io-util", "time", "net"] }
+tokio-util = { workspace = true, features = ["codec"] }
diff --git a/libs/proxy/tokio-postgres2/src/cancel_query.rs b/libs/proxy/tokio-postgres2/src/cancel_query.rs
new file mode 100644
index 0000000000..cddbf16336
--- /dev/null
+++ b/libs/proxy/tokio-postgres2/src/cancel_query.rs
@@ -0,0 +1,40 @@
+use tokio::net::TcpStream;
+
+use crate::client::SocketConfig;
+use crate::config::{Host, SslMode};
+use crate::tls::MakeTlsConnect;
+use crate::{cancel_query_raw, connect_socket, Error};
+use std::io;
+
+pub(crate) async fn cancel_query<T>(
+    config: Option<SocketConfig>,
+    ssl_mode: SslMode,
+    mut tls: T,
+    process_id: i32,
+    secret_key: i32,
+) -> Result<(), Error>
+where
+    T: MakeTlsConnect<TcpStream>,
+{
+    let config = match config {
+        Some(config) => config,
+        None => {
+            return Err(Error::connect(io::Error::new(
+                io::ErrorKind::InvalidInput,
+                "unknown host",
+            )))
+        }
+    };
+
+    let hostname = match &config.host {
+        Host::Tcp(host) => &**host,
+    };
+    let tls = tls
+        .make_tls_connect(hostname)
+        .map_err(|e| Error::tls(e.into()))?;
+
+    let socket =
+        connect_socket::connect_socket(&config.host, config.port, config.connect_timeout).await?;
+
+    cancel_query_raw::cancel_query_raw(socket, ssl_mode, tls, process_id, secret_key).await
+}
diff --git a/libs/proxy/tokio-postgres2/src/cancel_query_raw.rs b/libs/proxy/tokio-postgres2/src/cancel_query_raw.rs
new file mode 100644
index 0000000000..8c08296435
--- /dev/null
+++ b/libs/proxy/tokio-postgres2/src/cancel_query_raw.rs
@@ -0,0 +1,29 @@
+use crate::config::SslMode;
+use crate::tls::TlsConnect;
+use crate::{connect_tls, Error};
+use bytes::BytesMut;
+use postgres_protocol2::message::frontend;
+use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
+
+pub async fn cancel_query_raw<S, T>(
+    stream: S,
+    mode: SslMode,
+    tls: T,
+    process_id: i32,
+    secret_key: i32,
+) -> Result<(), Error>
+where
+    S: AsyncRead + AsyncWrite + Unpin,
+    T: TlsConnect<S>,
+{
+    let mut stream = connect_tls::connect_tls(stream, mode, tls).await?;
+
+    let mut buf = BytesMut::new();
+    frontend::cancel_request(process_id, secret_key, &mut buf);
+
+    stream.write_all(&buf).await.map_err(Error::io)?;
+    stream.flush().await.map_err(Error::io)?;
+    stream.shutdown().await.map_err(Error::io)?;
+
+    Ok(())
+}
diff --git a/libs/proxy/tokio-postgres2/src/cancel_token.rs b/libs/proxy/tokio-postgres2/src/cancel_token.rs
new file mode 100644
index 0000000000..b949bf358f
--- /dev/null
+++ b/libs/proxy/tokio-postgres2/src/cancel_token.rs
@@ -0,0 +1,62 @@
+use crate::config::SslMode;
+use crate::tls::TlsConnect;
+
+use crate::{cancel_query, client::SocketConfig, tls::MakeTlsConnect};
+use crate::{cancel_query_raw, Error};
+use tokio::io::{AsyncRead, AsyncWrite};
+use tokio::net::TcpStream;
+
+/// The capability to request cancellation of in-progress queries on a
+/// connection.
+#[derive(Clone)]
+pub struct CancelToken {
+    pub(crate) socket_config: Option<SocketConfig>,
+    pub(crate) ssl_mode: SslMode,
+    pub(crate) process_id: i32,
+    pub(crate) secret_key: i32,
+}
+
+impl CancelToken {
+    /// Attempts to cancel the in-progress query on the connection associated
+    /// with this `CancelToken`.
+    ///
+    /// The server provides no information about whether a cancellation attempt was successful or not. An error will
+    /// only be returned if the client was unable to connect to the database.
+    ///
+    /// Cancellation is inherently racy. There is no guarantee that the
+    /// cancellation request will reach the server before the query terminates
+    /// normally, or that the connection associated with this token is still
+    /// active.
+    ///
+    /// Requires the `runtime` Cargo feature (enabled by default).
+    pub async fn cancel_query<T>(&self, tls: T) -> Result<(), Error>
+    where
+        T: MakeTlsConnect<TcpStream>,
+    {
+        cancel_query::cancel_query(
+            self.socket_config.clone(),
+            self.ssl_mode,
+            tls,
+            self.process_id,
+            self.secret_key,
+        )
+        .await
+    }
+
+    /// Like `cancel_query`, but uses a stream which is already connected to the server rather than opening a new
+    /// connection itself.
+    pub async fn cancel_query_raw<S, T>(&self, stream: S, tls: T) -> Result<(), Error>
+    where
+        S: AsyncRead + AsyncWrite + Unpin,
+        T: TlsConnect<S>,
+    {
+        cancel_query_raw::cancel_query_raw(
+            stream,
+            self.ssl_mode,
+            tls,
+            self.process_id,
+            self.secret_key,
+        )
+        .await
+    }
+}
diff --git a/libs/proxy/tokio-postgres2/src/client.rs b/libs/proxy/tokio-postgres2/src/client.rs
new file mode 100644
index 0000000000..96200b71e7
--- /dev/null
+++ b/libs/proxy/tokio-postgres2/src/client.rs
@@ -0,0 +1,439 @@
+use crate::codec::{BackendMessages, FrontendMessage};
+
+use crate::config::Host;
+use crate::config::SslMode;
+use crate::connection::{Request, RequestMessages};
+
+use crate::query::RowStream;
+use crate::simple_query::SimpleQueryStream;
+
+use crate::types::{Oid, ToSql, Type};
+
+use crate::{
+    prepare, query, simple_query, slice_iter, CancelToken, Error, ReadyForQueryStatus, Row,
+    SimpleQueryMessage, Statement, ToStatement, Transaction, TransactionBuilder,
+};
+use bytes::BytesMut;
+use fallible_iterator::FallibleIterator;
+use futures_util::{future, ready, TryStreamExt};
+use parking_lot::Mutex;
+use postgres_protocol2::message::{backend::Message, frontend};
+use std::collections::HashMap;
+use std::fmt;
+use std::sync::Arc;
+use std::task::{Context, Poll};
+use tokio::sync::mpsc;
+
+use std::time::Duration;
+
+pub struct Responses {
+    receiver: mpsc::Receiver<BackendMessages>,
+    cur: BackendMessages,
+}
+
+impl Responses {
+    pub fn poll_next(&mut self, cx: &mut Context<'_>) -> Poll<Result<Message, Error>> {
+        loop {
+            match self.cur.next().map_err(Error::parse)? {
+                Some(Message::ErrorResponse(body)) => return Poll::Ready(Err(Error::db(body))),
+                Some(message) => return Poll::Ready(Ok(message)),
+                None => {}
+            }
+
+            match ready!(self.receiver.poll_recv(cx)) {
+                Some(messages) => self.cur = messages,
+                None => return Poll::Ready(Err(Error::closed())),
+            }
+        }
+    }
+
+    pub async fn next(&mut self) -> Result<Message, Error> {
+        future::poll_fn(|cx| self.poll_next(cx)).await
+    }
+}
+
+/// A cache of type info and prepared statements for fetching type info
+/// (corresponding to the queries in the [prepare] module).
+#[derive(Default)]
+struct CachedTypeInfo {
+    /// A statement for basic information for a type from its
+    /// OID. Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_QUERY) (or its
+    /// fallback).
+    typeinfo: Option<Statement>,
+    /// A statement for getting information for a composite type from its OID.
+    /// Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_COMPOSITE_QUERY).
+    typeinfo_composite: Option<Statement>,
+    /// A statement for getting information for a composite type from its OID.
+    /// Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_COMPOSITE_QUERY) (or
+    /// its fallback).
+    typeinfo_enum: Option<Statement>,
+
+    /// Cache of types already looked up.
+    types: HashMap<Oid, Type>,
+}
+
+pub struct InnerClient {
+    sender: mpsc::UnboundedSender<Request>,
+    cached_typeinfo: Mutex<CachedTypeInfo>,
+
+    /// A buffer to use when writing out postgres commands.
+    buffer: Mutex<BytesMut>,
+}
+
+impl InnerClient {
+    pub fn send(&self, messages: RequestMessages) -> Result<Responses, Error> {
+        let (sender, receiver) = mpsc::channel(1);
+        let request = Request { messages, sender };
+        self.sender.send(request).map_err(|_| Error::closed())?;
+
+        Ok(Responses {
+            receiver,
+            cur: BackendMessages::empty(),
+        })
+    }
+
+    pub fn typeinfo(&self) -> Option<Statement> {
+        self.cached_typeinfo.lock().typeinfo.clone()
+    }
+
+    pub fn set_typeinfo(&self, statement: &Statement) {
+        self.cached_typeinfo.lock().typeinfo = Some(statement.clone());
+    }
+
+    pub fn typeinfo_composite(&self) -> Option<Statement> {
+        self.cached_typeinfo.lock().typeinfo_composite.clone()
+    }
+
+    pub fn set_typeinfo_composite(&self, statement: &Statement) {
+        self.cached_typeinfo.lock().typeinfo_composite = Some(statement.clone());
+    }
+
+    pub fn typeinfo_enum(&self) -> Option<Statement> {
+        self.cached_typeinfo.lock().typeinfo_enum.clone()
+    }
+
+    pub fn set_typeinfo_enum(&self, statement: &Statement) {
+        self.cached_typeinfo.lock().typeinfo_enum = Some(statement.clone());
+    }
+
+    pub fn type_(&self, oid: Oid) -> Option<Type> {
+        self.cached_typeinfo.lock().types.get(&oid).cloned()
+    }
+
+    pub fn set_type(&self, oid: Oid, type_: &Type) {
+        self.cached_typeinfo.lock().types.insert(oid, type_.clone());
+    }
+
+    /// Call the given function with a buffer to be used when writing out
+    /// postgres commands.
+    pub fn with_buf<F, R>(&self, f: F) -> R
+    where
+        F: FnOnce(&mut BytesMut) -> R,
+    {
+        let mut buffer = self.buffer.lock();
+        let r = f(&mut buffer);
+        buffer.clear();
+        r
+    }
+}
+
+#[derive(Clone)]
+pub(crate) struct SocketConfig {
+    pub host: Host,
+    pub port: u16,
+    pub connect_timeout: Option<Duration>,
+    // pub keepalive: Option<KeepaliveConfig>,
+}
+
+/// An asynchronous PostgreSQL client.
+///
+/// The client is one half of what is returned when a connection is established. Users interact with the database
+/// through this client object.
+pub struct Client {
+    inner: Arc<InnerClient>,
+
+    socket_config: Option<SocketConfig>,
+    ssl_mode: SslMode,
+    process_id: i32,
+    secret_key: i32,
+}
+
+impl Client {
+    pub(crate) fn new(
+        sender: mpsc::UnboundedSender<Request>,
+        ssl_mode: SslMode,
+        process_id: i32,
+        secret_key: i32,
+    ) -> Client {
+        Client {
+            inner: Arc::new(InnerClient {
+                sender,
+                cached_typeinfo: Default::default(),
+                buffer: Default::default(),
+            }),
+
+            socket_config: None,
+            ssl_mode,
+            process_id,
+            secret_key,
+        }
+    }
+
+    /// Returns process_id.
+    pub fn get_process_id(&self) -> i32 {
+        self.process_id
+    }
+
+    pub(crate) fn inner(&self) -> &Arc<InnerClient> {
+        &self.inner
+    }
+
+    pub(crate) fn set_socket_config(&mut self, socket_config: SocketConfig) {
+        self.socket_config = Some(socket_config);
+    }
+
+    /// Creates a new prepared statement.
+    ///
+    /// Prepared statements can be executed repeatedly, and may contain query parameters (indicated by `$1`, `$2`, etc),
+    /// which are set when executed. Prepared statements can only be used with the connection that created them.
+    pub async fn prepare(&self, query: &str) -> Result<Statement, Error> {
+        self.prepare_typed(query, &[]).await
+    }
+
+    /// Like `prepare`, but allows the types of query parameters to be explicitly specified.
+    ///
+    /// The list of types may be smaller than the number of parameters - the types of the remaining parameters will be
+    /// inferred. For example, `client.prepare_typed(query, &[])` is equivalent to `client.prepare(query)`.
+    pub async fn prepare_typed(
+        &self,
+        query: &str,
+        parameter_types: &[Type],
+    ) -> Result<Statement, Error> {
+        prepare::prepare(&self.inner, query, parameter_types).await
+    }
+
+    /// Executes a statement, returning a vector of the resulting rows.
+    ///
+    /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list
+    /// provided, 1-indexed.
+    ///
+    /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be
+    /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front
+    /// with the `prepare` method.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the number of parameters provided does not match the number expected.
+    pub async fn query<T>(
+        &self,
+        statement: &T,
+        params: &[&(dyn ToSql + Sync)],
+    ) -> Result<Vec<Row>, Error>
+    where
+        T: ?Sized + ToStatement,
+    {
+        self.query_raw(statement, slice_iter(params))
+            .await?
+            .try_collect()
+            .await
+    }
+
+    /// The maximally flexible version of [`query`].
+    ///
+    /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list
+    /// provided, 1-indexed.
+    ///
+    /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be
+    /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front
+    /// with the `prepare` method.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the number of parameters provided does not match the number expected.
+    ///
+    /// [`query`]: #method.query
+    pub async fn query_raw<'a, T, I>(&self, statement: &T, params: I) -> Result<RowStream, Error>
+    where
+        T: ?Sized + ToStatement,
+        I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
+        I::IntoIter: ExactSizeIterator,
+    {
+        let statement = statement.__convert().into_statement(self).await?;
+        query::query(&self.inner, statement, params).await
+    }
+
+    /// Pass text directly to the Postgres backend to allow it to sort out typing itself and
+    /// to save a roundtrip
+    pub async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
+    where
+        S: AsRef<str>,
+        I: IntoIterator<Item = Option<S>>,
+        I::IntoIter: ExactSizeIterator,
+    {
+        query::query_txt(&self.inner, statement, params).await
+    }
+
+    /// Executes a statement, returning the number of rows modified.
+    ///
+    /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list
+    /// provided, 1-indexed.
+    ///
+    /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be
+    /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front
+    /// with the `prepare` method.
+    ///
+    /// If the statement does not modify any rows (e.g. `SELECT`), 0 is returned.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the number of parameters provided does not match the number expected.
+    pub async fn execute<T>(
+        &self,
+        statement: &T,
+        params: &[&(dyn ToSql + Sync)],
+    ) -> Result<u64, Error>
+    where
+        T: ?Sized + ToStatement,
+    {
+        self.execute_raw(statement, slice_iter(params)).await
+    }
+
+    /// The maximally flexible version of [`execute`].
+    ///
+    /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list
+    /// provided, 1-indexed.
+    ///
+    /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be
+    /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front
+    /// with the `prepare` method.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the number of parameters provided does not match the number expected.
+    ///
+    /// [`execute`]: #method.execute
+    pub async fn execute_raw<'a, T, I>(&self, statement: &T, params: I) -> Result<u64, Error>
+    where
+        T: ?Sized + ToStatement,
+        I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
+        I::IntoIter: ExactSizeIterator,
+    {
+        let statement = statement.__convert().into_statement(self).await?;
+        query::execute(self.inner(), statement, params).await
+    }
+
+    /// Executes a sequence of SQL statements using the simple query protocol, returning the resulting rows.
+    ///
+    /// Statements should be separated by semicolons. If an error occurs, execution of the sequence will stop at that
+    /// point. The simple query protocol returns the values in rows as strings rather than in their binary encodings,
+    /// so the associated row type doesn't work with the `FromSql` trait. Rather than simply returning a list of the
+    /// rows, this method returns a list of an enum which indicates either the completion of one of the commands,
+    /// or a row of data. This preserves the framing between the separate statements in the request.
+    ///
+    /// # Warning
+    ///
+    /// Prepared statements should be use for any query which contains user-specified data, as they provided the
+    /// functionality to safely embed that data in the request. Do not form statements via string concatenation and pass
+    /// them to this method!
+    pub async fn simple_query(&self, query: &str) -> Result<Vec<SimpleQueryMessage>, Error> {
+        self.simple_query_raw(query).await?.try_collect().await
+    }
+
+    pub(crate) async fn simple_query_raw(&self, query: &str) -> Result<SimpleQueryStream, Error> {
+        simple_query::simple_query(self.inner(), query).await
+    }
+
+    /// Executes a sequence of SQL statements using the simple query protocol.
+    ///
+    /// Statements should be separated by semicolons. If an error occurs, execution of the sequence will stop at that
+    /// point. This is intended for use when, for example, initializing a database schema.
+    ///
+    /// # Warning
+    ///
+    /// Prepared statements should be use for any query which contains user-specified data, as they provided the
+    /// functionality to safely embed that data in the request. Do not form statements via string concatenation and pass
+    /// them to this method!
+    pub async fn batch_execute(&self, query: &str) -> Result<ReadyForQueryStatus, Error> {
+        simple_query::batch_execute(self.inner(), query).await
+    }
+
+    /// Begins a new database transaction.
+    ///
+    /// The transaction will roll back by default - use the `commit` method to commit it.
+    pub async fn transaction(&mut self) -> Result<Transaction<'_>, Error> {
+        struct RollbackIfNotDone<'me> {
+            client: &'me Client,
+            done: bool,
+        }
+
+        impl Drop for RollbackIfNotDone<'_> {
+            fn drop(&mut self) {
+                if self.done {
+                    return;
+                }
+
+                let buf = self.client.inner().with_buf(|buf| {
+                    frontend::query("ROLLBACK", buf).unwrap();
+                    buf.split().freeze()
+                });
+                let _ = self
+                    .client
+                    .inner()
+                    .send(RequestMessages::Single(FrontendMessage::Raw(buf)));
+            }
+        }
+
+        // This is done, as `Future` created by this method can be dropped after
+        // `RequestMessages` is synchronously send to the `Connection` by
+        // `batch_execute()`, but before `Responses` is asynchronously polled to
+        // completion. In that case `Transaction` won't be created and thus
+        // won't be rolled back.
+        {
+            let mut cleaner = RollbackIfNotDone {
+                client: self,
+                done: false,
+            };
+            self.batch_execute("BEGIN").await?;
+            cleaner.done = true;
+        }
+
+        Ok(Transaction::new(self))
+    }
+
+    /// Returns a builder for a transaction with custom settings.
+    ///
+    /// Unlike the `transaction` method, the builder can be used to control the transaction's isolation level and other
+    /// attributes.
+    pub fn build_transaction(&mut self) -> TransactionBuilder<'_> {
+        TransactionBuilder::new(self)
+    }
+
+    /// Constructs a cancellation token that can later be used to request cancellation of a query running on the
+    /// connection associated with this client.
+    pub fn cancel_token(&self) -> CancelToken {
+        CancelToken {
+            socket_config: self.socket_config.clone(),
+            ssl_mode: self.ssl_mode,
+            process_id: self.process_id,
+            secret_key: self.secret_key,
+        }
+    }
+
+    /// Query for type information
+    pub async fn get_type(&self, oid: Oid) -> Result<Type, Error> {
+        crate::prepare::get_type(&self.inner, oid).await
+    }
+
+    /// Determines if the connection to the server has already closed.
+    ///
+    /// In that case, all future queries will fail.
+    pub fn is_closed(&self) -> bool {
+        self.inner.sender.is_closed()
+    }
+}
+
+impl fmt::Debug for Client {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("Client").finish()
+    }
+}
diff --git a/libs/proxy/tokio-postgres2/src/codec.rs b/libs/proxy/tokio-postgres2/src/codec.rs
new file mode 100644
index 0000000000..7412db785b
--- /dev/null
+++ b/libs/proxy/tokio-postgres2/src/codec.rs
@@ -0,0 +1,109 @@
+use bytes::{Buf, Bytes, BytesMut};
+use fallible_iterator::FallibleIterator;
+use postgres_protocol2::message::backend;
+use postgres_protocol2::message::frontend::CopyData;
+use std::io;
+use tokio_util::codec::{Decoder, Encoder};
+
+pub enum FrontendMessage {
+    Raw(Bytes),
+    CopyData(CopyData<Box<dyn Buf + Send>>),
+}
+
+pub enum BackendMessage {
+    Normal {
+        messages: BackendMessages,
+        request_complete: bool,
+    },
+    Async(backend::Message),
+}
+
+pub struct BackendMessages(BytesMut);
+
+impl BackendMessages {
+    pub fn empty() -> BackendMessages {
+        BackendMessages(BytesMut::new())
+    }
+}
+
+impl FallibleIterator for BackendMessages {
+    type Item = backend::Message;
+    type Error = io::Error;
+
+    fn next(&mut self) -> io::Result<Option<backend::Message>> {
+        backend::Message::parse(&mut self.0)
+    }
+}
+
+pub struct PostgresCodec {
+    pub max_message_size: Option<usize>,
+}
+
+impl Encoder<FrontendMessage> for PostgresCodec {
+    type Error = io::Error;
+
+    fn encode(&mut self, item: FrontendMessage, dst: &mut BytesMut) -> io::Result<()> {
+        match item {
+            FrontendMessage::Raw(buf) => dst.extend_from_slice(&buf),
+            FrontendMessage::CopyData(data) => data.write(dst),
+        }
+
+        Ok(())
+    }
+}
+
+impl Decoder for PostgresCodec {
+    type Item = BackendMessage;
+    type Error = io::Error;
+
+    fn decode(&mut self, src: &mut BytesMut) -> Result<Option<BackendMessage>, io::Error> {
+        let mut idx = 0;
+        let mut request_complete = false;
+
+        while let Some(header) = backend::Header::parse(&src[idx..])? {
+            let len = header.len() as usize + 1;
+            if src[idx..].len() < len {
+                break;
+            }
+
+            if let Some(max) = self.max_message_size {
+                if len > max {
+                    return Err(io::Error::new(
+                        io::ErrorKind::InvalidInput,
+                        "message too large",
+                    ));
+                }
+            }
+
+            match header.tag() {
+                backend::NOTICE_RESPONSE_TAG
+                | backend::NOTIFICATION_RESPONSE_TAG
+                | backend::PARAMETER_STATUS_TAG => {
+                    if idx == 0 {
+                        let message = backend::Message::parse(src)?.unwrap();
+                        return Ok(Some(BackendMessage::Async(message)));
+                    } else {
+                        break;
+                    }
+                }
+                _ => {}
+            }
+
+            idx += len;
+
+            if header.tag() == backend::READY_FOR_QUERY_TAG {
+                request_complete = true;
+                break;
+            }
+        }
+
+        if idx == 0 {
+            Ok(None)
+        } else {
+            Ok(Some(BackendMessage::Normal {
+                messages: BackendMessages(src.split_to(idx)),
+                request_complete,
+            }))
+        }
+    }
+}
diff --git a/libs/proxy/tokio-postgres2/src/config.rs b/libs/proxy/tokio-postgres2/src/config.rs
new file mode 100644
index 0000000000..969c20ba47
--- /dev/null
+++ b/libs/proxy/tokio-postgres2/src/config.rs
@@ -0,0 +1,897 @@
+//! Connection configuration.
+
+use crate::connect::connect;
+use crate::connect_raw::connect_raw;
+use crate::tls::MakeTlsConnect;
+use crate::tls::TlsConnect;
+use crate::{Client, Connection, Error};
+use std::borrow::Cow;
+use std::str;
+use std::str::FromStr;
+use std::time::Duration;
+use std::{error, fmt, iter, mem};
+use tokio::io::{AsyncRead, AsyncWrite};
+
+pub use postgres_protocol2::authentication::sasl::ScramKeys;
+use tokio::net::TcpStream;
+
+/// Properties required of a session.
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+#[non_exhaustive]
+pub enum TargetSessionAttrs {
+    /// No special properties are required.
+    Any,
+    /// The session must allow writes.
+    ReadWrite,
+}
+
+/// TLS configuration.
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+#[non_exhaustive]
+pub enum SslMode {
+    /// Do not use TLS.
+    Disable,
+    /// Attempt to connect with TLS but allow sessions without.
+    Prefer,
+    /// Require the use of TLS.
+    Require,
+}
+
+/// Channel binding configuration.
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+#[non_exhaustive]
+pub enum ChannelBinding {
+    /// Do not use channel binding.
+    Disable,
+    /// Attempt to use channel binding but allow sessions without.
+    Prefer,
+    /// Require the use of channel binding.
+    Require,
+}
+
+/// Replication mode configuration.
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+#[non_exhaustive]
+pub enum ReplicationMode {
+    /// Physical replication.
+    Physical,
+    /// Logical replication.
+    Logical,
+}
+
+/// A host specification.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum Host {
+    /// A TCP hostname.
+    Tcp(String),
+}
+
+/// Precomputed keys which may override password during auth.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum AuthKeys {
+    /// A `ClientKey` & `ServerKey` pair for `SCRAM-SHA-256`.
+    ScramSha256(ScramKeys<32>),
+}
+
+/// Connection configuration.
+///
+/// Configuration can be parsed from libpq-style connection strings. These strings come in two formats:
+///
+/// # Key-Value
+///
+/// This format consists of space-separated key-value pairs. Values which are either the empty string or contain
+/// whitespace should be wrapped in `'`. `'` and `\` characters should be backslash-escaped.
+///
+/// ## Keys
+///
+/// * `user` - The username to authenticate with. Required.
+/// * `password` - The password to authenticate with.
+/// * `dbname` - The name of the database to connect to. Defaults to the username.
+/// * `options` - Command line options used to configure the server.
+/// * `application_name` - Sets the `application_name` parameter on the server.
+/// * `sslmode` - Controls usage of TLS. If set to `disable`, TLS will not be used. If set to `prefer`, TLS will be used
+///     if available, but not used otherwise. If set to `require`, TLS will be forced to be used. Defaults to `prefer`.
+/// * `host` - The host to connect to. On Unix platforms, if the host starts with a `/` character it is treated as the
+///     path to the directory containing Unix domain sockets. Otherwise, it is treated as a hostname. Multiple hosts
+///     can be specified, separated by commas. Each host will be tried in turn when connecting. Required if connecting
+///     with the `connect` method.
+/// * `port` - The port to connect to. Multiple ports can be specified, separated by commas. The number of ports must be
+///     either 1, in which case it will be used for all hosts, or the same as the number of hosts. Defaults to 5432 if
+///     omitted or the empty string.
+/// * `connect_timeout` - The time limit in seconds applied to each socket-level connection attempt. Note that hostnames
+///     can resolve to multiple IP addresses, and this limit is applied to each address. Defaults to no timeout.
+/// * `target_session_attrs` - Specifies requirements of the session. If set to `read-write`, the client will check that
+///     the `transaction_read_write` session parameter is set to `on`. This can be used to connect to the primary server
+///     in a database cluster as opposed to the secondary read-only mirrors. Defaults to `all`.
+/// * `channel_binding` - Controls usage of channel binding in the authentication process. If set to `disable`, channel
+///     binding will not be used. If set to `prefer`, channel binding will be used if available, but not used otherwise.
+///     If set to `require`, the authentication process will fail if channel binding is not used. Defaults to `prefer`.
+///
+/// ## Examples
+///
+/// ```not_rust
+/// host=localhost user=postgres connect_timeout=10 keepalives=0
+/// ```
+///
+/// ```not_rust
+/// host=/var/lib/postgresql,localhost port=1234 user=postgres password='password with spaces'
+/// ```
+///
+/// ```not_rust
+/// host=host1,host2,host3 port=1234,,5678 user=postgres target_session_attrs=read-write
+/// ```
+///
+/// # Url
+///
+/// This format resembles a URL with a scheme of either `postgres://` or `postgresql://`. All components are optional,
+/// and the format accepts query parameters for all of the key-value pairs described in the section above. Multiple
+/// host/port pairs can be comma-separated. Unix socket paths in the host section of the URL should be percent-encoded,
+/// as the path component of the URL specifies the database name.
+///
+/// ## Examples
+///
+/// ```not_rust
+/// postgresql://user@localhost
+/// ```
+///
+/// ```not_rust
+/// postgresql://user:password@%2Fvar%2Flib%2Fpostgresql/mydb?connect_timeout=10
+/// ```
+///
+/// ```not_rust
+/// postgresql://user@host1:1234,host2,host3:5678?target_session_attrs=read-write
+/// ```
+///
+/// ```not_rust
+/// postgresql:///mydb?user=user&host=/var/lib/postgresql
+/// ```
+#[derive(Clone, PartialEq, Eq)]
+pub struct Config {
+    pub(crate) user: Option<String>,
+    pub(crate) password: Option<Vec<u8>>,
+    pub(crate) auth_keys: Option<Box<AuthKeys>>,
+    pub(crate) dbname: Option<String>,
+    pub(crate) options: Option<String>,
+    pub(crate) application_name: Option<String>,
+    pub(crate) ssl_mode: SslMode,
+    pub(crate) host: Vec<Host>,
+    pub(crate) port: Vec<u16>,
+    pub(crate) connect_timeout: Option<Duration>,
+    pub(crate) target_session_attrs: TargetSessionAttrs,
+    pub(crate) channel_binding: ChannelBinding,
+    pub(crate) replication_mode: Option<ReplicationMode>,
+    pub(crate) max_backend_message_size: Option<usize>,
+}
+
+impl Default for Config {
+    fn default() -> Config {
+        Config::new()
+    }
+}
+
+impl Config {
+    /// Creates a new configuration.
+    pub fn new() -> Config {
+        Config {
+            user: None,
+            password: None,
+            auth_keys: None,
+            dbname: None,
+            options: None,
+            application_name: None,
+            ssl_mode: SslMode::Prefer,
+            host: vec![],
+            port: vec![],
+            connect_timeout: None,
+            target_session_attrs: TargetSessionAttrs::Any,
+            channel_binding: ChannelBinding::Prefer,
+            replication_mode: None,
+            max_backend_message_size: None,
+        }
+    }
+
+    /// Sets the user to authenticate with.
+    ///
+    /// Required.
+    pub fn user(&mut self, user: &str) -> &mut Config {
+        self.user = Some(user.to_string());
+        self
+    }
+
+    /// Gets the user to authenticate with, if one has been configured with
+    /// the `user` method.
+    pub fn get_user(&self) -> Option<&str> {
+        self.user.as_deref()
+    }
+
+    /// Sets the password to authenticate with.
+    pub fn password<T>(&mut self, password: T) -> &mut Config
+    where
+        T: AsRef<[u8]>,
+    {
+        self.password = Some(password.as_ref().to_vec());
+        self
+    }
+
+    /// Gets the password to authenticate with, if one has been configured with
+    /// the `password` method.
+    pub fn get_password(&self) -> Option<&[u8]> {
+        self.password.as_deref()
+    }
+
+    /// Sets precomputed protocol-specific keys to authenticate with.
+    /// When set, this option will override `password`.
+    /// See [`AuthKeys`] for more information.
+    pub fn auth_keys(&mut self, keys: AuthKeys) -> &mut Config {
+        self.auth_keys = Some(Box::new(keys));
+        self
+    }
+
+    /// Gets precomputed protocol-specific keys to authenticate with.
+    /// if one has been configured with the `auth_keys` method.
+    pub fn get_auth_keys(&self) -> Option<AuthKeys> {
+        self.auth_keys.as_deref().copied()
+    }
+
+    /// Sets the name of the database to connect to.
+    ///
+    /// Defaults to the user.
+    pub fn dbname(&mut self, dbname: &str) -> &mut Config {
+        self.dbname = Some(dbname.to_string());
+        self
+    }
+
+    /// Gets the name of the database to connect to, if one has been configured
+    /// with the `dbname` method.
+    pub fn get_dbname(&self) -> Option<&str> {
+        self.dbname.as_deref()
+    }
+
+    /// Sets command line options used to configure the server.
+    pub fn options(&mut self, options: &str) -> &mut Config {
+        self.options = Some(options.to_string());
+        self
+    }
+
+    /// Gets the command line options used to configure the server, if the
+    /// options have been set with the `options` method.
+    pub fn get_options(&self) -> Option<&str> {
+        self.options.as_deref()
+    }
+
+    /// Sets the value of the `application_name` runtime parameter.
+    pub fn application_name(&mut self, application_name: &str) -> &mut Config {
+        self.application_name = Some(application_name.to_string());
+        self
+    }
+
+    /// Gets the value of the `application_name` runtime parameter, if it has
+    /// been set with the `application_name` method.
+    pub fn get_application_name(&self) -> Option<&str> {
+        self.application_name.as_deref()
+    }
+
+    /// Sets the SSL configuration.
+    ///
+    /// Defaults to `prefer`.
+    pub fn ssl_mode(&mut self, ssl_mode: SslMode) -> &mut Config {
+        self.ssl_mode = ssl_mode;
+        self
+    }
+
+    /// Gets the SSL configuration.
+    pub fn get_ssl_mode(&self) -> SslMode {
+        self.ssl_mode
+    }
+
+    /// Adds a host to the configuration.
+    ///
+    /// Multiple hosts can be specified by calling this method multiple times, and each will be tried in order.
+    pub fn host(&mut self, host: &str) -> &mut Config {
+        self.host.push(Host::Tcp(host.to_string()));
+        self
+    }
+
+    /// Gets the hosts that have been added to the configuration with `host`.
+    pub fn get_hosts(&self) -> &[Host] {
+        &self.host
+    }
+
+    /// Adds a port to the configuration.
+    ///
+    /// Multiple ports can be specified by calling this method multiple times. There must either be no ports, in which
+    /// case the default of 5432 is used, a single port, in which it is used for all hosts, or the same number of ports
+    /// as hosts.
+    pub fn port(&mut self, port: u16) -> &mut Config {
+        self.port.push(port);
+        self
+    }
+
+    /// Gets the ports that have been added to the configuration with `port`.
+    pub fn get_ports(&self) -> &[u16] {
+        &self.port
+    }
+
+    /// Sets the timeout applied to socket-level connection attempts.
+    ///
+    /// Note that hostnames can resolve to multiple IP addresses, and this timeout will apply to each address of each
+    /// host separately. Defaults to no limit.
+    pub fn connect_timeout(&mut self, connect_timeout: Duration) -> &mut Config {
+        self.connect_timeout = Some(connect_timeout);
+        self
+    }
+
+    /// Gets the connection timeout, if one has been set with the
+    /// `connect_timeout` method.
+    pub fn get_connect_timeout(&self) -> Option<&Duration> {
+        self.connect_timeout.as_ref()
+    }
+
+    /// Sets the requirements of the session.
+    ///
+    /// This can be used to connect to the primary server in a clustered database rather than one of the read-only
+    /// secondary servers. Defaults to `Any`.
+    pub fn target_session_attrs(
+        &mut self,
+        target_session_attrs: TargetSessionAttrs,
+    ) -> &mut Config {
+        self.target_session_attrs = target_session_attrs;
+        self
+    }
+
+    /// Gets the requirements of the session.
+    pub fn get_target_session_attrs(&self) -> TargetSessionAttrs {
+        self.target_session_attrs
+    }
+
+    /// Sets the channel binding behavior.
+    ///
+    /// Defaults to `prefer`.
+    pub fn channel_binding(&mut self, channel_binding: ChannelBinding) -> &mut Config {
+        self.channel_binding = channel_binding;
+        self
+    }
+
+    /// Gets the channel binding behavior.
+    pub fn get_channel_binding(&self) -> ChannelBinding {
+        self.channel_binding
+    }
+
+    /// Set replication mode.
+    pub fn replication_mode(&mut self, replication_mode: ReplicationMode) -> &mut Config {
+        self.replication_mode = Some(replication_mode);
+        self
+    }
+
+    /// Get replication mode.
+    pub fn get_replication_mode(&self) -> Option<ReplicationMode> {
+        self.replication_mode
+    }
+
+    /// Set limit for backend messages size.
+    pub fn max_backend_message_size(&mut self, max_backend_message_size: usize) -> &mut Config {
+        self.max_backend_message_size = Some(max_backend_message_size);
+        self
+    }
+
+    /// Get limit for backend messages size.
+    pub fn get_max_backend_message_size(&self) -> Option<usize> {
+        self.max_backend_message_size
+    }
+
+    fn param(&mut self, key: &str, value: &str) -> Result<(), Error> {
+        match key {
+            "user" => {
+                self.user(value);
+            }
+            "password" => {
+                self.password(value);
+            }
+            "dbname" => {
+                self.dbname(value);
+            }
+            "options" => {
+                self.options(value);
+            }
+            "application_name" => {
+                self.application_name(value);
+            }
+            "sslmode" => {
+                let mode = match value {
+                    "disable" => SslMode::Disable,
+                    "prefer" => SslMode::Prefer,
+                    "require" => SslMode::Require,
+                    _ => return Err(Error::config_parse(Box::new(InvalidValue("sslmode")))),
+                };
+                self.ssl_mode(mode);
+            }
+            "host" => {
+                for host in value.split(',') {
+                    self.host(host);
+                }
+            }
+            "port" => {
+                for port in value.split(',') {
+                    let port = if port.is_empty() {
+                        5432
+                    } else {
+                        port.parse()
+                            .map_err(|_| Error::config_parse(Box::new(InvalidValue("port"))))?
+                    };
+                    self.port(port);
+                }
+            }
+            "connect_timeout" => {
+                let timeout = value
+                    .parse::<i64>()
+                    .map_err(|_| Error::config_parse(Box::new(InvalidValue("connect_timeout"))))?;
+                if timeout > 0 {
+                    self.connect_timeout(Duration::from_secs(timeout as u64));
+                }
+            }
+            "target_session_attrs" => {
+                let target_session_attrs = match value {
+                    "any" => TargetSessionAttrs::Any,
+                    "read-write" => TargetSessionAttrs::ReadWrite,
+                    _ => {
+                        return Err(Error::config_parse(Box::new(InvalidValue(
+                            "target_session_attrs",
+                        ))));
+                    }
+                };
+                self.target_session_attrs(target_session_attrs);
+            }
+            "channel_binding" => {
+                let channel_binding = match value {
+                    "disable" => ChannelBinding::Disable,
+                    "prefer" => ChannelBinding::Prefer,
+                    "require" => ChannelBinding::Require,
+                    _ => {
+                        return Err(Error::config_parse(Box::new(InvalidValue(
+                            "channel_binding",
+                        ))))
+                    }
+                };
+                self.channel_binding(channel_binding);
+            }
+            "max_backend_message_size" => {
+                let limit = value.parse::<usize>().map_err(|_| {
+                    Error::config_parse(Box::new(InvalidValue("max_backend_message_size")))
+                })?;
+                if limit > 0 {
+                    self.max_backend_message_size(limit);
+                }
+            }
+            key => {
+                return Err(Error::config_parse(Box::new(UnknownOption(
+                    key.to_string(),
+                ))));
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Opens a connection to a PostgreSQL database.
+    ///
+    /// Requires the `runtime` Cargo feature (enabled by default).
+    pub async fn connect<T>(
+        &self,
+        tls: T,
+    ) -> Result<(Client, Connection<TcpStream, T::Stream>), Error>
+    where
+        T: MakeTlsConnect<TcpStream>,
+    {
+        connect(tls, self).await
+    }
+
+    /// Connects to a PostgreSQL database over an arbitrary stream.
+    ///
+    /// All of the settings other than `user`, `password`, `dbname`, `options`, and `application_name` name are ignored.
+    pub async fn connect_raw<S, T>(
+        &self,
+        stream: S,
+        tls: T,
+    ) -> Result<(Client, Connection<S, T::Stream>), Error>
+    where
+        S: AsyncRead + AsyncWrite + Unpin,
+        T: TlsConnect<S>,
+    {
+        connect_raw(stream, tls, self).await
+    }
+}
+
+impl FromStr for Config {
+    type Err = Error;
+
+    fn from_str(s: &str) -> Result<Config, Error> {
+        match UrlParser::parse(s)? {
+            Some(config) => Ok(config),
+            None => Parser::parse(s),
+        }
+    }
+}
+
+// Omit password from debug output
+impl fmt::Debug for Config {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        struct Redaction {}
+        impl fmt::Debug for Redaction {
+            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+                write!(f, "_")
+            }
+        }
+
+        f.debug_struct("Config")
+            .field("user", &self.user)
+            .field("password", &self.password.as_ref().map(|_| Redaction {}))
+            .field("dbname", &self.dbname)
+            .field("options", &self.options)
+            .field("application_name", &self.application_name)
+            .field("ssl_mode", &self.ssl_mode)
+            .field("host", &self.host)
+            .field("port", &self.port)
+            .field("connect_timeout", &self.connect_timeout)
+            .field("target_session_attrs", &self.target_session_attrs)
+            .field("channel_binding", &self.channel_binding)
+            .field("replication", &self.replication_mode)
+            .finish()
+    }
+}
+
+#[derive(Debug)]
+struct UnknownOption(String);
+
+impl fmt::Display for UnknownOption {
+    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(fmt, "unknown option `{}`", self.0)
+    }
+}
+
+impl error::Error for UnknownOption {}
+
+#[derive(Debug)]
+struct InvalidValue(&'static str);
+
+impl fmt::Display for InvalidValue {
+    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(fmt, "invalid value for option `{}`", self.0)
+    }
+}
+
+impl error::Error for InvalidValue {}
+
+struct Parser<'a> {
+    s: &'a str,
+    it: iter::Peekable<str::CharIndices<'a>>,
+}
+
+impl<'a> Parser<'a> {
+    fn parse(s: &'a str) -> Result<Config, Error> {
+        let mut parser = Parser {
+            s,
+            it: s.char_indices().peekable(),
+        };
+
+        let mut config = Config::new();
+
+        while let Some((key, value)) = parser.parameter()? {
+            config.param(key, &value)?;
+        }
+
+        Ok(config)
+    }
+
+    fn skip_ws(&mut self) {
+        self.take_while(char::is_whitespace);
+    }
+
+    fn take_while<F>(&mut self, f: F) -> &'a str
+    where
+        F: Fn(char) -> bool,
+    {
+        let start = match self.it.peek() {
+            Some(&(i, _)) => i,
+            None => return "",
+        };
+
+        loop {
+            match self.it.peek() {
+                Some(&(_, c)) if f(c) => {
+                    self.it.next();
+                }
+                Some(&(i, _)) => return &self.s[start..i],
+                None => return &self.s[start..],
+            }
+        }
+    }
+
+    fn eat(&mut self, target: char) -> Result<(), Error> {
+        match self.it.next() {
+            Some((_, c)) if c == target => Ok(()),
+            Some((i, c)) => {
+                let m = format!(
+                    "unexpected character at byte {}: expected `{}` but got `{}`",
+                    i, target, c
+                );
+                Err(Error::config_parse(m.into()))
+            }
+            None => Err(Error::config_parse("unexpected EOF".into())),
+        }
+    }
+
+    fn eat_if(&mut self, target: char) -> bool {
+        match self.it.peek() {
+            Some(&(_, c)) if c == target => {
+                self.it.next();
+                true
+            }
+            _ => false,
+        }
+    }
+
+    fn keyword(&mut self) -> Option<&'a str> {
+        let s = self.take_while(|c| match c {
+            c if c.is_whitespace() => false,
+            '=' => false,
+            _ => true,
+        });
+
+        if s.is_empty() {
+            None
+        } else {
+            Some(s)
+        }
+    }
+
+    fn value(&mut self) -> Result<String, Error> {
+        let value = if self.eat_if('\'') {
+            let value = self.quoted_value()?;
+            self.eat('\'')?;
+            value
+        } else {
+            self.simple_value()?
+        };
+
+        Ok(value)
+    }
+
+    fn simple_value(&mut self) -> Result<String, Error> {
+        let mut value = String::new();
+
+        while let Some(&(_, c)) = self.it.peek() {
+            if c.is_whitespace() {
+                break;
+            }
+
+            self.it.next();
+            if c == '\\' {
+                if let Some((_, c2)) = self.it.next() {
+                    value.push(c2);
+                }
+            } else {
+                value.push(c);
+            }
+        }
+
+        if value.is_empty() {
+            return Err(Error::config_parse("unexpected EOF".into()));
+        }
+
+        Ok(value)
+    }
+
+    fn quoted_value(&mut self) -> Result<String, Error> {
+        let mut value = String::new();
+
+        while let Some(&(_, c)) = self.it.peek() {
+            if c == '\'' {
+                return Ok(value);
+            }
+
+            self.it.next();
+            if c == '\\' {
+                if let Some((_, c2)) = self.it.next() {
+                    value.push(c2);
+                }
+            } else {
+                value.push(c);
+            }
+        }
+
+        Err(Error::config_parse(
+            "unterminated quoted connection parameter value".into(),
+        ))
+    }
+
+    fn parameter(&mut self) -> Result<Option<(&'a str, String)>, Error> {
+        self.skip_ws();
+        let keyword = match self.keyword() {
+            Some(keyword) => keyword,
+            None => return Ok(None),
+        };
+        self.skip_ws();
+        self.eat('=')?;
+        self.skip_ws();
+        let value = self.value()?;
+
+        Ok(Some((keyword, value)))
+    }
+}
+
+// This is a pretty sloppy "URL" parser, but it matches the behavior of libpq, where things really aren't very strict
+struct UrlParser<'a> {
+    s: &'a str,
+    config: Config,
+}
+
+impl<'a> UrlParser<'a> {
+    fn parse(s: &'a str) -> Result<Option<Config>, Error> {
+        let s = match Self::remove_url_prefix(s) {
+            Some(s) => s,
+            None => return Ok(None),
+        };
+
+        let mut parser = UrlParser {
+            s,
+            config: Config::new(),
+        };
+
+        parser.parse_credentials()?;
+        parser.parse_host()?;
+        parser.parse_path()?;
+        parser.parse_params()?;
+
+        Ok(Some(parser.config))
+    }
+
+    fn remove_url_prefix(s: &str) -> Option<&str> {
+        for prefix in &["postgres://", "postgresql://"] {
+            if let Some(stripped) = s.strip_prefix(prefix) {
+                return Some(stripped);
+            }
+        }
+
+        None
+    }
+
+    fn take_until(&mut self, end: &[char]) -> Option<&'a str> {
+        match self.s.find(end) {
+            Some(pos) => {
+                let (head, tail) = self.s.split_at(pos);
+                self.s = tail;
+                Some(head)
+            }
+            None => None,
+        }
+    }
+
+    fn take_all(&mut self) -> &'a str {
+        mem::take(&mut self.s)
+    }
+
+    fn eat_byte(&mut self) {
+        self.s = &self.s[1..];
+    }
+
+    fn parse_credentials(&mut self) -> Result<(), Error> {
+        let creds = match self.take_until(&['@']) {
+            Some(creds) => creds,
+            None => return Ok(()),
+        };
+        self.eat_byte();
+
+        let mut it = creds.splitn(2, ':');
+        let user = self.decode(it.next().unwrap())?;
+        self.config.user(&user);
+
+        if let Some(password) = it.next() {
+            let password = Cow::from(percent_encoding::percent_decode(password.as_bytes()));
+            self.config.password(password);
+        }
+
+        Ok(())
+    }
+
+    fn parse_host(&mut self) -> Result<(), Error> {
+        let host = match self.take_until(&['/', '?']) {
+            Some(host) => host,
+            None => self.take_all(),
+        };
+
+        if host.is_empty() {
+            return Ok(());
+        }
+
+        for chunk in host.split(',') {
+            let (host, port) = if chunk.starts_with('[') {
+                let idx = match chunk.find(']') {
+                    Some(idx) => idx,
+                    None => return Err(Error::config_parse(InvalidValue("host").into())),
+                };
+
+                let host = &chunk[1..idx];
+                let remaining = &chunk[idx + 1..];
+                let port = if let Some(port) = remaining.strip_prefix(':') {
+                    Some(port)
+                } else if remaining.is_empty() {
+                    None
+                } else {
+                    return Err(Error::config_parse(InvalidValue("host").into()));
+                };
+
+                (host, port)
+            } else {
+                let mut it = chunk.splitn(2, ':');
+                (it.next().unwrap(), it.next())
+            };
+
+            self.host_param(host)?;
+            let port = self.decode(port.unwrap_or("5432"))?;
+            self.config.param("port", &port)?;
+        }
+
+        Ok(())
+    }
+
+    fn parse_path(&mut self) -> Result<(), Error> {
+        if !self.s.starts_with('/') {
+            return Ok(());
+        }
+        self.eat_byte();
+
+        let dbname = match self.take_until(&['?']) {
+            Some(dbname) => dbname,
+            None => self.take_all(),
+        };
+
+        if !dbname.is_empty() {
+            self.config.dbname(&self.decode(dbname)?);
+        }
+
+        Ok(())
+    }
+
+    fn parse_params(&mut self) -> Result<(), Error> {
+        if !self.s.starts_with('?') {
+            return Ok(());
+        }
+        self.eat_byte();
+
+        while !self.s.is_empty() {
+            let key = match self.take_until(&['=']) {
+                Some(key) => self.decode(key)?,
+                None => return Err(Error::config_parse("unterminated parameter".into())),
+            };
+            self.eat_byte();
+
+            let value = match self.take_until(&['&']) {
+                Some(value) => {
+                    self.eat_byte();
+                    value
+                }
+                None => self.take_all(),
+            };
+
+            if key == "host" {
+                self.host_param(value)?;
+            } else {
+                let value = self.decode(value)?;
+                self.config.param(&key, &value)?;
+            }
+        }
+
+        Ok(())
+    }
+
+    fn host_param(&mut self, s: &str) -> Result<(), Error> {
+        let s = self.decode(s)?;
+        self.config.param("host", &s)
+    }
+
+    fn decode(&self, s: &'a str) -> Result<Cow<'a, str>, Error> {
+        percent_encoding::percent_decode(s.as_bytes())
+            .decode_utf8()
+            .map_err(|e| Error::config_parse(e.into()))
+    }
+}
diff --git a/libs/proxy/tokio-postgres2/src/connect.rs b/libs/proxy/tokio-postgres2/src/connect.rs
new file mode 100644
index 0000000000..7517fe0cde
--- /dev/null
+++ b/libs/proxy/tokio-postgres2/src/connect.rs
@@ -0,0 +1,112 @@
+use crate::client::SocketConfig;
+use crate::config::{Host, TargetSessionAttrs};
+use crate::connect_raw::connect_raw;
+use crate::connect_socket::connect_socket;
+use crate::tls::{MakeTlsConnect, TlsConnect};
+use crate::{Client, Config, Connection, Error, SimpleQueryMessage};
+use futures_util::{future, pin_mut, Future, FutureExt, Stream};
+use std::io;
+use std::task::Poll;
+use tokio::net::TcpStream;
+
+pub async fn connect<T>(
+    mut tls: T,
+    config: &Config,
+) -> Result<(Client, Connection<TcpStream, T::Stream>), Error>
+where
+    T: MakeTlsConnect<TcpStream>,
+{
+    if config.host.is_empty() {
+        return Err(Error::config("host missing".into()));
+    }
+
+    if config.port.len() > 1 && config.port.len() != config.host.len() {
+        return Err(Error::config("invalid number of ports".into()));
+    }
+
+    let mut error = None;
+    for (i, host) in config.host.iter().enumerate() {
+        let port = config
+            .port
+            .get(i)
+            .or_else(|| config.port.first())
+            .copied()
+            .unwrap_or(5432);
+
+        let hostname = match host {
+            Host::Tcp(host) => host.as_str(),
+        };
+
+        let tls = tls
+            .make_tls_connect(hostname)
+            .map_err(|e| Error::tls(e.into()))?;
+
+        match connect_once(host, port, tls, config).await {
+            Ok((client, connection)) => return Ok((client, connection)),
+            Err(e) => error = Some(e),
+        }
+    }
+
+    Err(error.unwrap())
+}
+
+async fn connect_once<T>(
+    host: &Host,
+    port: u16,
+    tls: T,
+    config: &Config,
+) -> Result<(Client, Connection<TcpStream, T::Stream>), Error>
+where
+    T: TlsConnect<TcpStream>,
+{
+    let socket = connect_socket(host, port, config.connect_timeout).await?;
+    let (mut client, mut connection) = connect_raw(socket, tls, config).await?;
+
+    if let TargetSessionAttrs::ReadWrite = config.target_session_attrs {
+        let rows = client.simple_query_raw("SHOW transaction_read_only");
+        pin_mut!(rows);
+
+        let rows = future::poll_fn(|cx| {
+            if connection.poll_unpin(cx)?.is_ready() {
+                return Poll::Ready(Err(Error::closed()));
+            }
+
+            rows.as_mut().poll(cx)
+        })
+        .await?;
+        pin_mut!(rows);
+
+        loop {
+            let next = future::poll_fn(|cx| {
+                if connection.poll_unpin(cx)?.is_ready() {
+                    return Poll::Ready(Some(Err(Error::closed())));
+                }
+
+                rows.as_mut().poll_next(cx)
+            });
+
+            match next.await.transpose()? {
+                Some(SimpleQueryMessage::Row(row)) => {
+                    if row.try_get(0)? == Some("on") {
+                        return Err(Error::connect(io::Error::new(
+                            io::ErrorKind::PermissionDenied,
+                            "database does not allow writes",
+                        )));
+                    } else {
+                        break;
+                    }
+                }
+                Some(_) => {}
+                None => return Err(Error::unexpected_message()),
+            }
+        }
+    }
+
+    client.set_socket_config(SocketConfig {
+        host: host.clone(),
+        port,
+        connect_timeout: config.connect_timeout,
+    });
+
+    Ok((client, connection))
+}
diff --git a/libs/proxy/tokio-postgres2/src/connect_raw.rs b/libs/proxy/tokio-postgres2/src/connect_raw.rs
new file mode 100644
index 0000000000..80677af969
--- /dev/null
+++ b/libs/proxy/tokio-postgres2/src/connect_raw.rs
@@ -0,0 +1,359 @@
+use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec};
+use crate::config::{self, AuthKeys, Config, ReplicationMode};
+use crate::connect_tls::connect_tls;
+use crate::maybe_tls_stream::MaybeTlsStream;
+use crate::tls::{TlsConnect, TlsStream};
+use crate::{Client, Connection, Error};
+use bytes::BytesMut;
+use fallible_iterator::FallibleIterator;
+use futures_util::{ready, Sink, SinkExt, Stream, TryStreamExt};
+use postgres_protocol2::authentication;
+use postgres_protocol2::authentication::sasl;
+use postgres_protocol2::authentication::sasl::ScramSha256;
+use postgres_protocol2::message::backend::{AuthenticationSaslBody, Message};
+use postgres_protocol2::message::frontend;
+use std::collections::{HashMap, VecDeque};
+use std::io;
+use std::pin::Pin;
+use std::task::{Context, Poll};
+use tokio::io::{AsyncRead, AsyncWrite};
+use tokio::sync::mpsc;
+use tokio_util::codec::Framed;
+
+pub struct StartupStream<S, T> {
+    inner: Framed<MaybeTlsStream<S, T>, PostgresCodec>,
+    buf: BackendMessages,
+    delayed: VecDeque<BackendMessage>,
+}
+
+impl<S, T> Sink<FrontendMessage> for StartupStream<S, T>
+where
+    S: AsyncRead + AsyncWrite + Unpin,
+    T: AsyncRead + AsyncWrite + Unpin,
+{
+    type Error = io::Error;
+
+    fn poll_ready(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
+        Pin::new(&mut self.inner).poll_ready(cx)
+    }
+
+    fn start_send(mut self: Pin<&mut Self>, item: FrontendMessage) -> io::Result<()> {
+        Pin::new(&mut self.inner).start_send(item)
+    }
+
+    fn poll_flush(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
+        Pin::new(&mut self.inner).poll_flush(cx)
+    }
+
+    fn poll_close(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
+        Pin::new(&mut self.inner).poll_close(cx)
+    }
+}
+
+impl<S, T> Stream for StartupStream<S, T>
+where
+    S: AsyncRead + AsyncWrite + Unpin,
+    T: AsyncRead + AsyncWrite + Unpin,
+{
+    type Item = io::Result<Message>;
+
+    fn poll_next(
+        mut self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+    ) -> Poll<Option<io::Result<Message>>> {
+        loop {
+            match self.buf.next() {
+                Ok(Some(message)) => return Poll::Ready(Some(Ok(message))),
+                Ok(None) => {}
+                Err(e) => return Poll::Ready(Some(Err(e))),
+            }
+
+            match ready!(Pin::new(&mut self.inner).poll_next(cx)) {
+                Some(Ok(BackendMessage::Normal { messages, .. })) => self.buf = messages,
+                Some(Ok(BackendMessage::Async(message))) => return Poll::Ready(Some(Ok(message))),
+                Some(Err(e)) => return Poll::Ready(Some(Err(e))),
+                None => return Poll::Ready(None),
+            }
+        }
+    }
+}
+
+pub async fn connect_raw<S, T>(
+    stream: S,
+    tls: T,
+    config: &Config,
+) -> Result<(Client, Connection<S, T::Stream>), Error>
+where
+    S: AsyncRead + AsyncWrite + Unpin,
+    T: TlsConnect<S>,
+{
+    let stream = connect_tls(stream, config.ssl_mode, tls).await?;
+
+    let mut stream = StartupStream {
+        inner: Framed::new(
+            stream,
+            PostgresCodec {
+                max_message_size: config.max_backend_message_size,
+            },
+        ),
+        buf: BackendMessages::empty(),
+        delayed: VecDeque::new(),
+    };
+
+    startup(&mut stream, config).await?;
+    authenticate(&mut stream, config).await?;
+    let (process_id, secret_key, parameters) = read_info(&mut stream).await?;
+
+    let (sender, receiver) = mpsc::unbounded_channel();
+    let client = Client::new(sender, config.ssl_mode, process_id, secret_key);
+    let connection = Connection::new(stream.inner, stream.delayed, parameters, receiver);
+
+    Ok((client, connection))
+}
+
+async fn startup<S, T>(stream: &mut StartupStream<S, T>, config: &Config) -> Result<(), Error>
+where
+    S: AsyncRead + AsyncWrite + Unpin,
+    T: AsyncRead + AsyncWrite + Unpin,
+{
+    let mut params = vec![("client_encoding", "UTF8")];
+    if let Some(user) = &config.user {
+        params.push(("user", &**user));
+    }
+    if let Some(dbname) = &config.dbname {
+        params.push(("database", &**dbname));
+    }
+    if let Some(options) = &config.options {
+        params.push(("options", &**options));
+    }
+    if let Some(application_name) = &config.application_name {
+        params.push(("application_name", &**application_name));
+    }
+    if let Some(replication_mode) = &config.replication_mode {
+        match replication_mode {
+            ReplicationMode::Physical => params.push(("replication", "true")),
+            ReplicationMode::Logical => params.push(("replication", "database")),
+        }
+    }
+
+    let mut buf = BytesMut::new();
+    frontend::startup_message(params, &mut buf).map_err(Error::encode)?;
+
+    stream
+        .send(FrontendMessage::Raw(buf.freeze()))
+        .await
+        .map_err(Error::io)
+}
+
+async fn authenticate<S, T>(stream: &mut StartupStream<S, T>, config: &Config) -> Result<(), Error>
+where
+    S: AsyncRead + AsyncWrite + Unpin,
+    T: TlsStream + Unpin,
+{
+    match stream.try_next().await.map_err(Error::io)? {
+        Some(Message::AuthenticationOk) => {
+            can_skip_channel_binding(config)?;
+            return Ok(());
+        }
+        Some(Message::AuthenticationCleartextPassword) => {
+            can_skip_channel_binding(config)?;
+
+            let pass = config
+                .password
+                .as_ref()
+                .ok_or_else(|| Error::config("password missing".into()))?;
+
+            authenticate_password(stream, pass).await?;
+        }
+        Some(Message::AuthenticationMd5Password(body)) => {
+            can_skip_channel_binding(config)?;
+
+            let user = config
+                .user
+                .as_ref()
+                .ok_or_else(|| Error::config("user missing".into()))?;
+            let pass = config
+                .password
+                .as_ref()
+                .ok_or_else(|| Error::config("password missing".into()))?;
+
+            let output = authentication::md5_hash(user.as_bytes(), pass, body.salt());
+            authenticate_password(stream, output.as_bytes()).await?;
+        }
+        Some(Message::AuthenticationSasl(body)) => {
+            authenticate_sasl(stream, body, config).await?;
+        }
+        Some(Message::AuthenticationKerberosV5)
+        | Some(Message::AuthenticationScmCredential)
+        | Some(Message::AuthenticationGss)
+        | Some(Message::AuthenticationSspi) => {
+            return Err(Error::authentication(
+                "unsupported authentication method".into(),
+            ))
+        }
+        Some(Message::ErrorResponse(body)) => return Err(Error::db(body)),
+        Some(_) => return Err(Error::unexpected_message()),
+        None => return Err(Error::closed()),
+    }
+
+    match stream.try_next().await.map_err(Error::io)? {
+        Some(Message::AuthenticationOk) => Ok(()),
+        Some(Message::ErrorResponse(body)) => Err(Error::db(body)),
+        Some(_) => Err(Error::unexpected_message()),
+        None => Err(Error::closed()),
+    }
+}
+
+fn can_skip_channel_binding(config: &Config) -> Result<(), Error> {
+    match config.channel_binding {
+        config::ChannelBinding::Disable | config::ChannelBinding::Prefer => Ok(()),
+        config::ChannelBinding::Require => Err(Error::authentication(
+            "server did not use channel binding".into(),
+        )),
+    }
+}
+
+async fn authenticate_password<S, T>(
+    stream: &mut StartupStream<S, T>,
+    password: &[u8],
+) -> Result<(), Error>
+where
+    S: AsyncRead + AsyncWrite + Unpin,
+    T: AsyncRead + AsyncWrite + Unpin,
+{
+    let mut buf = BytesMut::new();
+    frontend::password_message(password, &mut buf).map_err(Error::encode)?;
+
+    stream
+        .send(FrontendMessage::Raw(buf.freeze()))
+        .await
+        .map_err(Error::io)
+}
+
+async fn authenticate_sasl<S, T>(
+    stream: &mut StartupStream<S, T>,
+    body: AuthenticationSaslBody,
+    config: &Config,
+) -> Result<(), Error>
+where
+    S: AsyncRead + AsyncWrite + Unpin,
+    T: TlsStream + Unpin,
+{
+    let mut has_scram = false;
+    let mut has_scram_plus = false;
+    let mut mechanisms = body.mechanisms();
+    while let Some(mechanism) = mechanisms.next().map_err(Error::parse)? {
+        match mechanism {
+            sasl::SCRAM_SHA_256 => has_scram = true,
+            sasl::SCRAM_SHA_256_PLUS => has_scram_plus = true,
+            _ => {}
+        }
+    }
+
+    let channel_binding = stream
+        .inner
+        .get_ref()
+        .channel_binding()
+        .tls_server_end_point
+        .filter(|_| config.channel_binding != config::ChannelBinding::Disable)
+        .map(sasl::ChannelBinding::tls_server_end_point);
+
+    let (channel_binding, mechanism) = if has_scram_plus {
+        match channel_binding {
+            Some(channel_binding) => (channel_binding, sasl::SCRAM_SHA_256_PLUS),
+            None => (sasl::ChannelBinding::unsupported(), sasl::SCRAM_SHA_256),
+        }
+    } else if has_scram {
+        match channel_binding {
+            Some(_) => (sasl::ChannelBinding::unrequested(), sasl::SCRAM_SHA_256),
+            None => (sasl::ChannelBinding::unsupported(), sasl::SCRAM_SHA_256),
+        }
+    } else {
+        return Err(Error::authentication("unsupported SASL mechanism".into()));
+    };
+
+    if mechanism != sasl::SCRAM_SHA_256_PLUS {
+        can_skip_channel_binding(config)?;
+    }
+
+    let mut scram = if let Some(AuthKeys::ScramSha256(keys)) = config.get_auth_keys() {
+        ScramSha256::new_with_keys(keys, channel_binding)
+    } else if let Some(password) = config.get_password() {
+        ScramSha256::new(password, channel_binding)
+    } else {
+        return Err(Error::config("password or auth keys missing".into()));
+    };
+
+    let mut buf = BytesMut::new();
+    frontend::sasl_initial_response(mechanism, scram.message(), &mut buf).map_err(Error::encode)?;
+    stream
+        .send(FrontendMessage::Raw(buf.freeze()))
+        .await
+        .map_err(Error::io)?;
+
+    let body = match stream.try_next().await.map_err(Error::io)? {
+        Some(Message::AuthenticationSaslContinue(body)) => body,
+        Some(Message::ErrorResponse(body)) => return Err(Error::db(body)),
+        Some(_) => return Err(Error::unexpected_message()),
+        None => return Err(Error::closed()),
+    };
+
+    scram
+        .update(body.data())
+        .await
+        .map_err(|e| Error::authentication(e.into()))?;
+
+    let mut buf = BytesMut::new();
+    frontend::sasl_response(scram.message(), &mut buf).map_err(Error::encode)?;
+    stream
+        .send(FrontendMessage::Raw(buf.freeze()))
+        .await
+        .map_err(Error::io)?;
+
+    let body = match stream.try_next().await.map_err(Error::io)? {
+        Some(Message::AuthenticationSaslFinal(body)) => body,
+        Some(Message::ErrorResponse(body)) => return Err(Error::db(body)),
+        Some(_) => return Err(Error::unexpected_message()),
+        None => return Err(Error::closed()),
+    };
+
+    scram
+        .finish(body.data())
+        .map_err(|e| Error::authentication(e.into()))?;
+
+    Ok(())
+}
+
+async fn read_info<S, T>(
+    stream: &mut StartupStream<S, T>,
+) -> Result<(i32, i32, HashMap<String, String>), Error>
+where
+    S: AsyncRead + AsyncWrite + Unpin,
+    T: AsyncRead + AsyncWrite + Unpin,
+{
+    let mut process_id = 0;
+    let mut secret_key = 0;
+    let mut parameters = HashMap::new();
+
+    loop {
+        match stream.try_next().await.map_err(Error::io)? {
+            Some(Message::BackendKeyData(body)) => {
+                process_id = body.process_id();
+                secret_key = body.secret_key();
+            }
+            Some(Message::ParameterStatus(body)) => {
+                parameters.insert(
+                    body.name().map_err(Error::parse)?.to_string(),
+                    body.value().map_err(Error::parse)?.to_string(),
+                );
+            }
+            Some(msg @ Message::NoticeResponse(_)) => {
+                stream.delayed.push_back(BackendMessage::Async(msg))
+            }
+            Some(Message::ReadyForQuery(_)) => return Ok((process_id, secret_key, parameters)),
+            Some(Message::ErrorResponse(body)) => return Err(Error::db(body)),
+            Some(_) => return Err(Error::unexpected_message()),
+            None => return Err(Error::closed()),
+        }
+    }
+}
diff --git a/libs/proxy/tokio-postgres2/src/connect_socket.rs b/libs/proxy/tokio-postgres2/src/connect_socket.rs
new file mode 100644
index 0000000000..336a13317f
--- /dev/null
+++ b/libs/proxy/tokio-postgres2/src/connect_socket.rs
@@ -0,0 +1,65 @@
+use crate::config::Host;
+use crate::Error;
+use std::future::Future;
+use std::io;
+use std::time::Duration;
+use tokio::net::{self, TcpStream};
+use tokio::time;
+
+pub(crate) async fn connect_socket(
+    host: &Host,
+    port: u16,
+    connect_timeout: Option<Duration>,
+) -> Result<TcpStream, Error> {
+    match host {
+        Host::Tcp(host) => {
+            let addrs = net::lookup_host((&**host, port))
+                .await
+                .map_err(Error::connect)?;
+
+            let mut last_err = None;
+
+            for addr in addrs {
+                let stream =
+                    match connect_with_timeout(TcpStream::connect(addr), connect_timeout).await {
+                        Ok(stream) => stream,
+                        Err(e) => {
+                            last_err = Some(e);
+                            continue;
+                        }
+                    };
+
+                stream.set_nodelay(true).map_err(Error::connect)?;
+
+                return Ok(stream);
+            }
+
+            Err(last_err.unwrap_or_else(|| {
+                Error::connect(io::Error::new(
+                    io::ErrorKind::InvalidInput,
+                    "could not resolve any addresses",
+                ))
+            }))
+        }
+    }
+}
+
+async fn connect_with_timeout<F, T>(connect: F, timeout: Option<Duration>) -> Result<T, Error>
+where
+    F: Future<Output = io::Result<T>>,
+{
+    match timeout {
+        Some(timeout) => match time::timeout(timeout, connect).await {
+            Ok(Ok(socket)) => Ok(socket),
+            Ok(Err(e)) => Err(Error::connect(e)),
+            Err(_) => Err(Error::connect(io::Error::new(
+                io::ErrorKind::TimedOut,
+                "connection timed out",
+            ))),
+        },
+        None => match connect.await {
+            Ok(socket) => Ok(socket),
+            Err(e) => Err(Error::connect(e)),
+        },
+    }
+}
diff --git a/libs/proxy/tokio-postgres2/src/connect_tls.rs b/libs/proxy/tokio-postgres2/src/connect_tls.rs
new file mode 100644
index 0000000000..64b0b68abc
--- /dev/null
+++ b/libs/proxy/tokio-postgres2/src/connect_tls.rs
@@ -0,0 +1,48 @@
+use crate::config::SslMode;
+use crate::maybe_tls_stream::MaybeTlsStream;
+use crate::tls::private::ForcePrivateApi;
+use crate::tls::TlsConnect;
+use crate::Error;
+use bytes::BytesMut;
+use postgres_protocol2::message::frontend;
+use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt};
+
+pub async fn connect_tls<S, T>(
+    mut stream: S,
+    mode: SslMode,
+    tls: T,
+) -> Result<MaybeTlsStream<S, T::Stream>, Error>
+where
+    S: AsyncRead + AsyncWrite + Unpin,
+    T: TlsConnect<S>,
+{
+    match mode {
+        SslMode::Disable => return Ok(MaybeTlsStream::Raw(stream)),
+        SslMode::Prefer if !tls.can_connect(ForcePrivateApi) => {
+            return Ok(MaybeTlsStream::Raw(stream))
+        }
+        SslMode::Prefer | SslMode::Require => {}
+    }
+
+    let mut buf = BytesMut::new();
+    frontend::ssl_request(&mut buf);
+    stream.write_all(&buf).await.map_err(Error::io)?;
+
+    let mut buf = [0];
+    stream.read_exact(&mut buf).await.map_err(Error::io)?;
+
+    if buf[0] != b'S' {
+        if SslMode::Require == mode {
+            return Err(Error::tls("server does not support TLS".into()));
+        } else {
+            return Ok(MaybeTlsStream::Raw(stream));
+        }
+    }
+
+    let stream = tls
+        .connect(stream)
+        .await
+        .map_err(|e| Error::tls(e.into()))?;
+
+    Ok(MaybeTlsStream::Tls(stream))
+}
diff --git a/libs/proxy/tokio-postgres2/src/connection.rs b/libs/proxy/tokio-postgres2/src/connection.rs
new file mode 100644
index 0000000000..0aa5c77e22
--- /dev/null
+++ b/libs/proxy/tokio-postgres2/src/connection.rs
@@ -0,0 +1,323 @@
+use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec};
+use crate::error::DbError;
+use crate::maybe_tls_stream::MaybeTlsStream;
+use crate::{AsyncMessage, Error, Notification};
+use bytes::BytesMut;
+use fallible_iterator::FallibleIterator;
+use futures_util::{ready, Sink, Stream};
+use log::{info, trace};
+use postgres_protocol2::message::backend::Message;
+use postgres_protocol2::message::frontend;
+use std::collections::{HashMap, VecDeque};
+use std::future::Future;
+use std::pin::Pin;
+use std::task::{Context, Poll};
+use tokio::io::{AsyncRead, AsyncWrite};
+use tokio::sync::mpsc;
+use tokio_util::codec::Framed;
+use tokio_util::sync::PollSender;
+
+pub enum RequestMessages {
+    Single(FrontendMessage),
+}
+
+pub struct Request {
+    pub messages: RequestMessages,
+    pub sender: mpsc::Sender<BackendMessages>,
+}
+
+pub struct Response {
+    sender: PollSender<BackendMessages>,
+}
+
+#[derive(PartialEq, Debug)]
+enum State {
+    Active,
+    Terminating,
+    Closing,
+}
+
+/// A connection to a PostgreSQL database.
+///
+/// This is one half of what is returned when a new connection is established. It performs the actual IO with the
+/// server, and should generally be spawned off onto an executor to run in the background.
+///
+/// `Connection` implements `Future`, and only resolves when the connection is closed, either because a fatal error has
+/// occurred, or because its associated `Client` has dropped and all outstanding work has completed.
+#[must_use = "futures do nothing unless polled"]
+pub struct Connection<S, T> {
+    /// HACK: we need this in the Neon Proxy.
+    pub stream: Framed<MaybeTlsStream<S, T>, PostgresCodec>,
+    /// HACK: we need this in the Neon Proxy to forward params.
+    pub parameters: HashMap<String, String>,
+    receiver: mpsc::UnboundedReceiver<Request>,
+    pending_request: Option<RequestMessages>,
+    pending_responses: VecDeque<BackendMessage>,
+    responses: VecDeque<Response>,
+    state: State,
+}
+
+impl<S, T> Connection<S, T>
+where
+    S: AsyncRead + AsyncWrite + Unpin,
+    T: AsyncRead + AsyncWrite + Unpin,
+{
+    pub(crate) fn new(
+        stream: Framed<MaybeTlsStream<S, T>, PostgresCodec>,
+        pending_responses: VecDeque<BackendMessage>,
+        parameters: HashMap<String, String>,
+        receiver: mpsc::UnboundedReceiver<Request>,
+    ) -> Connection<S, T> {
+        Connection {
+            stream,
+            parameters,
+            receiver,
+            pending_request: None,
+            pending_responses,
+            responses: VecDeque::new(),
+            state: State::Active,
+        }
+    }
+
+    fn poll_response(
+        &mut self,
+        cx: &mut Context<'_>,
+    ) -> Poll<Option<Result<BackendMessage, Error>>> {
+        if let Some(message) = self.pending_responses.pop_front() {
+            trace!("retrying pending response");
+            return Poll::Ready(Some(Ok(message)));
+        }
+
+        Pin::new(&mut self.stream)
+            .poll_next(cx)
+            .map(|o| o.map(|r| r.map_err(Error::io)))
+    }
+
+    fn poll_read(&mut self, cx: &mut Context<'_>) -> Result<Option<AsyncMessage>, Error> {
+        if self.state != State::Active {
+            trace!("poll_read: done");
+            return Ok(None);
+        }
+
+        loop {
+            let message = match self.poll_response(cx)? {
+                Poll::Ready(Some(message)) => message,
+                Poll::Ready(None) => return Err(Error::closed()),
+                Poll::Pending => {
+                    trace!("poll_read: waiting on response");
+                    return Ok(None);
+                }
+            };
+
+            let (mut messages, request_complete) = match message {
+                BackendMessage::Async(Message::NoticeResponse(body)) => {
+                    let error = DbError::parse(&mut body.fields()).map_err(Error::parse)?;
+                    return Ok(Some(AsyncMessage::Notice(error)));
+                }
+                BackendMessage::Async(Message::NotificationResponse(body)) => {
+                    let notification = Notification {
+                        process_id: body.process_id(),
+                        channel: body.channel().map_err(Error::parse)?.to_string(),
+                        payload: body.message().map_err(Error::parse)?.to_string(),
+                    };
+                    return Ok(Some(AsyncMessage::Notification(notification)));
+                }
+                BackendMessage::Async(Message::ParameterStatus(body)) => {
+                    self.parameters.insert(
+                        body.name().map_err(Error::parse)?.to_string(),
+                        body.value().map_err(Error::parse)?.to_string(),
+                    );
+                    continue;
+                }
+                BackendMessage::Async(_) => unreachable!(),
+                BackendMessage::Normal {
+                    messages,
+                    request_complete,
+                } => (messages, request_complete),
+            };
+
+            let mut response = match self.responses.pop_front() {
+                Some(response) => response,
+                None => match messages.next().map_err(Error::parse)? {
+                    Some(Message::ErrorResponse(error)) => return Err(Error::db(error)),
+                    _ => return Err(Error::unexpected_message()),
+                },
+            };
+
+            match response.sender.poll_reserve(cx) {
+                Poll::Ready(Ok(())) => {
+                    let _ = response.sender.send_item(messages);
+                    if !request_complete {
+                        self.responses.push_front(response);
+                    }
+                }
+                Poll::Ready(Err(_)) => {
+                    // we need to keep paging through the rest of the messages even if the receiver's hung up
+                    if !request_complete {
+                        self.responses.push_front(response);
+                    }
+                }
+                Poll::Pending => {
+                    self.responses.push_front(response);
+                    self.pending_responses.push_back(BackendMessage::Normal {
+                        messages,
+                        request_complete,
+                    });
+                    trace!("poll_read: waiting on sender");
+                    return Ok(None);
+                }
+            }
+        }
+    }
+
+    fn poll_request(&mut self, cx: &mut Context<'_>) -> Poll<Option<RequestMessages>> {
+        if let Some(messages) = self.pending_request.take() {
+            trace!("retrying pending request");
+            return Poll::Ready(Some(messages));
+        }
+
+        if self.receiver.is_closed() {
+            return Poll::Ready(None);
+        }
+
+        match self.receiver.poll_recv(cx) {
+            Poll::Ready(Some(request)) => {
+                trace!("polled new request");
+                self.responses.push_back(Response {
+                    sender: PollSender::new(request.sender),
+                });
+                Poll::Ready(Some(request.messages))
+            }
+            Poll::Ready(None) => Poll::Ready(None),
+            Poll::Pending => Poll::Pending,
+        }
+    }
+
+    fn poll_write(&mut self, cx: &mut Context<'_>) -> Result<bool, Error> {
+        loop {
+            if self.state == State::Closing {
+                trace!("poll_write: done");
+                return Ok(false);
+            }
+
+            if Pin::new(&mut self.stream)
+                .poll_ready(cx)
+                .map_err(Error::io)?
+                .is_pending()
+            {
+                trace!("poll_write: waiting on socket");
+                return Ok(false);
+            }
+
+            let request = match self.poll_request(cx) {
+                Poll::Ready(Some(request)) => request,
+                Poll::Ready(None) if self.responses.is_empty() && self.state == State::Active => {
+                    trace!("poll_write: at eof, terminating");
+                    self.state = State::Terminating;
+                    let mut request = BytesMut::new();
+                    frontend::terminate(&mut request);
+                    RequestMessages::Single(FrontendMessage::Raw(request.freeze()))
+                }
+                Poll::Ready(None) => {
+                    trace!(
+                        "poll_write: at eof, pending responses {}",
+                        self.responses.len()
+                    );
+                    return Ok(true);
+                }
+                Poll::Pending => {
+                    trace!("poll_write: waiting on request");
+                    return Ok(true);
+                }
+            };
+
+            match request {
+                RequestMessages::Single(request) => {
+                    Pin::new(&mut self.stream)
+                        .start_send(request)
+                        .map_err(Error::io)?;
+                    if self.state == State::Terminating {
+                        trace!("poll_write: sent eof, closing");
+                        self.state = State::Closing;
+                    }
+                }
+            }
+        }
+    }
+
+    fn poll_flush(&mut self, cx: &mut Context<'_>) -> Result<(), Error> {
+        match Pin::new(&mut self.stream)
+            .poll_flush(cx)
+            .map_err(Error::io)?
+        {
+            Poll::Ready(()) => trace!("poll_flush: flushed"),
+            Poll::Pending => trace!("poll_flush: waiting on socket"),
+        }
+        Ok(())
+    }
+
+    fn poll_shutdown(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), Error>> {
+        if self.state != State::Closing {
+            return Poll::Pending;
+        }
+
+        match Pin::new(&mut self.stream)
+            .poll_close(cx)
+            .map_err(Error::io)?
+        {
+            Poll::Ready(()) => {
+                trace!("poll_shutdown: complete");
+                Poll::Ready(Ok(()))
+            }
+            Poll::Pending => {
+                trace!("poll_shutdown: waiting on socket");
+                Poll::Pending
+            }
+        }
+    }
+
+    /// Returns the value of a runtime parameter for this connection.
+    pub fn parameter(&self, name: &str) -> Option<&str> {
+        self.parameters.get(name).map(|s| &**s)
+    }
+
+    /// Polls for asynchronous messages from the server.
+    ///
+    /// The server can send notices as well as notifications asynchronously to the client. Applications that wish to
+    /// examine those messages should use this method to drive the connection rather than its `Future` implementation.
+    pub fn poll_message(
+        &mut self,
+        cx: &mut Context<'_>,
+    ) -> Poll<Option<Result<AsyncMessage, Error>>> {
+        let message = self.poll_read(cx)?;
+        let want_flush = self.poll_write(cx)?;
+        if want_flush {
+            self.poll_flush(cx)?;
+        }
+        match message {
+            Some(message) => Poll::Ready(Some(Ok(message))),
+            None => match self.poll_shutdown(cx) {
+                Poll::Ready(Ok(())) => Poll::Ready(None),
+                Poll::Ready(Err(e)) => Poll::Ready(Some(Err(e))),
+                Poll::Pending => Poll::Pending,
+            },
+        }
+    }
+}
+
+impl<S, T> Future for Connection<S, T>
+where
+    S: AsyncRead + AsyncWrite + Unpin,
+    T: AsyncRead + AsyncWrite + Unpin,
+{
+    type Output = Result<(), Error>;
+
+    fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), Error>> {
+        while let Some(message) = ready!(self.poll_message(cx)?) {
+            if let AsyncMessage::Notice(notice) = message {
+                info!("{}: {}", notice.severity(), notice.message());
+            }
+        }
+        Poll::Ready(Ok(()))
+    }
+}
diff --git a/libs/proxy/tokio-postgres2/src/error/mod.rs b/libs/proxy/tokio-postgres2/src/error/mod.rs
new file mode 100644
index 0000000000..6514322250
--- /dev/null
+++ b/libs/proxy/tokio-postgres2/src/error/mod.rs
@@ -0,0 +1,501 @@
+//! Errors.
+
+use fallible_iterator::FallibleIterator;
+use postgres_protocol2::message::backend::{ErrorFields, ErrorResponseBody};
+use std::error::{self, Error as _Error};
+use std::fmt;
+use std::io;
+
+pub use self::sqlstate::*;
+
+#[allow(clippy::unreadable_literal)]
+mod sqlstate;
+
+/// The severity of a Postgres error or notice.
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+pub enum Severity {
+    /// PANIC
+    Panic,
+    /// FATAL
+    Fatal,
+    /// ERROR
+    Error,
+    /// WARNING
+    Warning,
+    /// NOTICE
+    Notice,
+    /// DEBUG
+    Debug,
+    /// INFO
+    Info,
+    /// LOG
+    Log,
+}
+
+impl fmt::Display for Severity {
+    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let s = match *self {
+            Severity::Panic => "PANIC",
+            Severity::Fatal => "FATAL",
+            Severity::Error => "ERROR",
+            Severity::Warning => "WARNING",
+            Severity::Notice => "NOTICE",
+            Severity::Debug => "DEBUG",
+            Severity::Info => "INFO",
+            Severity::Log => "LOG",
+        };
+        fmt.write_str(s)
+    }
+}
+
+impl Severity {
+    fn from_str(s: &str) -> Option<Severity> {
+        match s {
+            "PANIC" => Some(Severity::Panic),
+            "FATAL" => Some(Severity::Fatal),
+            "ERROR" => Some(Severity::Error),
+            "WARNING" => Some(Severity::Warning),
+            "NOTICE" => Some(Severity::Notice),
+            "DEBUG" => Some(Severity::Debug),
+            "INFO" => Some(Severity::Info),
+            "LOG" => Some(Severity::Log),
+            _ => None,
+        }
+    }
+}
+
+/// A Postgres error or notice.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct DbError {
+    severity: String,
+    parsed_severity: Option<Severity>,
+    code: SqlState,
+    message: String,
+    detail: Option<String>,
+    hint: Option<String>,
+    position: Option<ErrorPosition>,
+    where_: Option<String>,
+    schema: Option<String>,
+    table: Option<String>,
+    column: Option<String>,
+    datatype: Option<String>,
+    constraint: Option<String>,
+    file: Option<String>,
+    line: Option<u32>,
+    routine: Option<String>,
+}
+
+impl DbError {
+    pub(crate) fn parse(fields: &mut ErrorFields<'_>) -> io::Result<DbError> {
+        let mut severity = None;
+        let mut parsed_severity = None;
+        let mut code = None;
+        let mut message = None;
+        let mut detail = None;
+        let mut hint = None;
+        let mut normal_position = None;
+        let mut internal_position = None;
+        let mut internal_query = None;
+        let mut where_ = None;
+        let mut schema = None;
+        let mut table = None;
+        let mut column = None;
+        let mut datatype = None;
+        let mut constraint = None;
+        let mut file = None;
+        let mut line = None;
+        let mut routine = None;
+
+        while let Some(field) = fields.next()? {
+            match field.type_() {
+                b'S' => severity = Some(field.value().to_owned()),
+                b'C' => code = Some(SqlState::from_code(field.value())),
+                b'M' => message = Some(field.value().to_owned()),
+                b'D' => detail = Some(field.value().to_owned()),
+                b'H' => hint = Some(field.value().to_owned()),
+                b'P' => {
+                    normal_position = Some(field.value().parse::<u32>().map_err(|_| {
+                        io::Error::new(
+                            io::ErrorKind::InvalidInput,
+                            "`P` field did not contain an integer",
+                        )
+                    })?);
+                }
+                b'p' => {
+                    internal_position = Some(field.value().parse::<u32>().map_err(|_| {
+                        io::Error::new(
+                            io::ErrorKind::InvalidInput,
+                            "`p` field did not contain an integer",
+                        )
+                    })?);
+                }
+                b'q' => internal_query = Some(field.value().to_owned()),
+                b'W' => where_ = Some(field.value().to_owned()),
+                b's' => schema = Some(field.value().to_owned()),
+                b't' => table = Some(field.value().to_owned()),
+                b'c' => column = Some(field.value().to_owned()),
+                b'd' => datatype = Some(field.value().to_owned()),
+                b'n' => constraint = Some(field.value().to_owned()),
+                b'F' => file = Some(field.value().to_owned()),
+                b'L' => {
+                    line = Some(field.value().parse::<u32>().map_err(|_| {
+                        io::Error::new(
+                            io::ErrorKind::InvalidInput,
+                            "`L` field did not contain an integer",
+                        )
+                    })?);
+                }
+                b'R' => routine = Some(field.value().to_owned()),
+                b'V' => {
+                    parsed_severity = Some(Severity::from_str(field.value()).ok_or_else(|| {
+                        io::Error::new(
+                            io::ErrorKind::InvalidInput,
+                            "`V` field contained an invalid value",
+                        )
+                    })?);
+                }
+                _ => {}
+            }
+        }
+
+        Ok(DbError {
+            severity: severity
+                .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "`S` field missing"))?,
+            parsed_severity,
+            code: code
+                .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "`C` field missing"))?,
+            message: message
+                .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "`M` field missing"))?,
+            detail,
+            hint,
+            position: match normal_position {
+                Some(position) => Some(ErrorPosition::Original(position)),
+                None => match internal_position {
+                    Some(position) => Some(ErrorPosition::Internal {
+                        position,
+                        query: internal_query.ok_or_else(|| {
+                            io::Error::new(
+                                io::ErrorKind::InvalidInput,
+                                "`q` field missing but `p` field present",
+                            )
+                        })?,
+                    }),
+                    None => None,
+                },
+            },
+            where_,
+            schema,
+            table,
+            column,
+            datatype,
+            constraint,
+            file,
+            line,
+            routine,
+        })
+    }
+
+    /// The field contents are ERROR, FATAL, or PANIC (in an error message),
+    /// or WARNING, NOTICE, DEBUG, INFO, or LOG (in a notice message), or a
+    /// localized translation of one of these.
+    pub fn severity(&self) -> &str {
+        &self.severity
+    }
+
+    /// A parsed, nonlocalized version of `severity`. (PostgreSQL 9.6+)
+    pub fn parsed_severity(&self) -> Option<Severity> {
+        self.parsed_severity
+    }
+
+    /// The SQLSTATE code for the error.
+    pub fn code(&self) -> &SqlState {
+        &self.code
+    }
+
+    /// The primary human-readable error message.
+    ///
+    /// This should be accurate but terse (typically one line).
+    pub fn message(&self) -> &str {
+        &self.message
+    }
+
+    /// An optional secondary error message carrying more detail about the
+    /// problem.
+    ///
+    /// Might run to multiple lines.
+    pub fn detail(&self) -> Option<&str> {
+        self.detail.as_deref()
+    }
+
+    /// An optional suggestion what to do about the problem.
+    ///
+    /// This is intended to differ from `detail` in that it offers advice
+    /// (potentially inappropriate) rather than hard facts. Might run to
+    /// multiple lines.
+    pub fn hint(&self) -> Option<&str> {
+        self.hint.as_deref()
+    }
+
+    /// An optional error cursor position into either the original query string
+    /// or an internally generated query.
+    pub fn position(&self) -> Option<&ErrorPosition> {
+        self.position.as_ref()
+    }
+
+    /// An indication of the context in which the error occurred.
+    ///
+    /// Presently this includes a call stack traceback of active procedural
+    /// language functions and internally-generated queries. The trace is one
+    /// entry per line, most recent first.
+    pub fn where_(&self) -> Option<&str> {
+        self.where_.as_deref()
+    }
+
+    /// If the error was associated with a specific database object, the name
+    /// of the schema containing that object, if any. (PostgreSQL 9.3+)
+    pub fn schema(&self) -> Option<&str> {
+        self.schema.as_deref()
+    }
+
+    /// If the error was associated with a specific table, the name of the
+    /// table. (Refer to the schema name field for the name of the table's
+    /// schema.) (PostgreSQL 9.3+)
+    pub fn table(&self) -> Option<&str> {
+        self.table.as_deref()
+    }
+
+    /// If the error was associated with a specific table column, the name of
+    /// the column.
+    ///
+    /// (Refer to the schema and table name fields to identify the table.)
+    /// (PostgreSQL 9.3+)
+    pub fn column(&self) -> Option<&str> {
+        self.column.as_deref()
+    }
+
+    /// If the error was associated with a specific data type, the name of the
+    /// data type. (Refer to the schema name field for the name of the data
+    /// type's schema.) (PostgreSQL 9.3+)
+    pub fn datatype(&self) -> Option<&str> {
+        self.datatype.as_deref()
+    }
+
+    /// If the error was associated with a specific constraint, the name of the
+    /// constraint.
+    ///
+    /// Refer to fields listed above for the associated table or domain.
+    /// (For this purpose, indexes are treated as constraints, even if they
+    /// weren't created with constraint syntax.) (PostgreSQL 9.3+)
+    pub fn constraint(&self) -> Option<&str> {
+        self.constraint.as_deref()
+    }
+
+    /// The file name of the source-code location where the error was reported.
+    pub fn file(&self) -> Option<&str> {
+        self.file.as_deref()
+    }
+
+    /// The line number of the source-code location where the error was
+    /// reported.
+    pub fn line(&self) -> Option<u32> {
+        self.line
+    }
+
+    /// The name of the source-code routine reporting the error.
+    pub fn routine(&self) -> Option<&str> {
+        self.routine.as_deref()
+    }
+}
+
+impl fmt::Display for DbError {
+    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(fmt, "{}: {}", self.severity, self.message)?;
+        if let Some(detail) = &self.detail {
+            write!(fmt, "\nDETAIL: {}", detail)?;
+        }
+        if let Some(hint) = &self.hint {
+            write!(fmt, "\nHINT: {}", hint)?;
+        }
+        Ok(())
+    }
+}
+
+impl error::Error for DbError {}
+
+/// Represents the position of an error in a query.
+#[derive(Clone, PartialEq, Eq, Debug)]
+pub enum ErrorPosition {
+    /// A position in the original query.
+    Original(u32),
+    /// A position in an internally generated query.
+    Internal {
+        /// The byte position.
+        position: u32,
+        /// A query generated by the Postgres server.
+        query: String,
+    },
+}
+
+#[derive(Debug, PartialEq)]
+enum Kind {
+    Io,
+    UnexpectedMessage,
+    Tls,
+    ToSql(usize),
+    FromSql(usize),
+    Column(String),
+    Closed,
+    Db,
+    Parse,
+    Encode,
+    Authentication,
+    ConfigParse,
+    Config,
+    Connect,
+    Timeout,
+}
+
+struct ErrorInner {
+    kind: Kind,
+    cause: Option<Box<dyn error::Error + Sync + Send>>,
+}
+
+/// An error communicating with the Postgres server.
+pub struct Error(Box<ErrorInner>);
+
+impl fmt::Debug for Error {
+    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
+        fmt.debug_struct("Error")
+            .field("kind", &self.0.kind)
+            .field("cause", &self.0.cause)
+            .finish()
+    }
+}
+
+impl fmt::Display for Error {
+    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match &self.0.kind {
+            Kind::Io => fmt.write_str("error communicating with the server")?,
+            Kind::UnexpectedMessage => fmt.write_str("unexpected message from server")?,
+            Kind::Tls => fmt.write_str("error performing TLS handshake")?,
+            Kind::ToSql(idx) => write!(fmt, "error serializing parameter {}", idx)?,
+            Kind::FromSql(idx) => write!(fmt, "error deserializing column {}", idx)?,
+            Kind::Column(column) => write!(fmt, "invalid column `{}`", column)?,
+            Kind::Closed => fmt.write_str("connection closed")?,
+            Kind::Db => fmt.write_str("db error")?,
+            Kind::Parse => fmt.write_str("error parsing response from server")?,
+            Kind::Encode => fmt.write_str("error encoding message to server")?,
+            Kind::Authentication => fmt.write_str("authentication error")?,
+            Kind::ConfigParse => fmt.write_str("invalid connection string")?,
+            Kind::Config => fmt.write_str("invalid configuration")?,
+            Kind::Connect => fmt.write_str("error connecting to server")?,
+            Kind::Timeout => fmt.write_str("timeout waiting for server")?,
+        };
+        if let Some(ref cause) = self.0.cause {
+            write!(fmt, ": {}", cause)?;
+        }
+        Ok(())
+    }
+}
+
+impl error::Error for Error {
+    fn source(&self) -> Option<&(dyn error::Error + 'static)> {
+        self.0.cause.as_ref().map(|e| &**e as _)
+    }
+}
+
+impl Error {
+    /// Consumes the error, returning its cause.
+    pub fn into_source(self) -> Option<Box<dyn error::Error + Sync + Send>> {
+        self.0.cause
+    }
+
+    /// Returns the source of this error if it was a `DbError`.
+    ///
+    /// This is a simple convenience method.
+    pub fn as_db_error(&self) -> Option<&DbError> {
+        self.source().and_then(|e| e.downcast_ref::<DbError>())
+    }
+
+    /// Determines if the error was associated with closed connection.
+    pub fn is_closed(&self) -> bool {
+        self.0.kind == Kind::Closed
+    }
+
+    /// Returns the SQLSTATE error code associated with the error.
+    ///
+    /// This is a convenience method that downcasts the cause to a `DbError` and returns its code.
+    pub fn code(&self) -> Option<&SqlState> {
+        self.as_db_error().map(DbError::code)
+    }
+
+    fn new(kind: Kind, cause: Option<Box<dyn error::Error + Sync + Send>>) -> Error {
+        Error(Box::new(ErrorInner { kind, cause }))
+    }
+
+    pub(crate) fn closed() -> Error {
+        Error::new(Kind::Closed, None)
+    }
+
+    pub(crate) fn unexpected_message() -> Error {
+        Error::new(Kind::UnexpectedMessage, None)
+    }
+
+    #[allow(clippy::needless_pass_by_value)]
+    pub(crate) fn db(error: ErrorResponseBody) -> Error {
+        match DbError::parse(&mut error.fields()) {
+            Ok(e) => Error::new(Kind::Db, Some(Box::new(e))),
+            Err(e) => Error::new(Kind::Parse, Some(Box::new(e))),
+        }
+    }
+
+    pub(crate) fn parse(e: io::Error) -> Error {
+        Error::new(Kind::Parse, Some(Box::new(e)))
+    }
+
+    pub(crate) fn encode(e: io::Error) -> Error {
+        Error::new(Kind::Encode, Some(Box::new(e)))
+    }
+
+    #[allow(clippy::wrong_self_convention)]
+    pub(crate) fn to_sql(e: Box<dyn error::Error + Sync + Send>, idx: usize) -> Error {
+        Error::new(Kind::ToSql(idx), Some(e))
+    }
+
+    pub(crate) fn from_sql(e: Box<dyn error::Error + Sync + Send>, idx: usize) -> Error {
+        Error::new(Kind::FromSql(idx), Some(e))
+    }
+
+    pub(crate) fn column(column: String) -> Error {
+        Error::new(Kind::Column(column), None)
+    }
+
+    pub(crate) fn tls(e: Box<dyn error::Error + Sync + Send>) -> Error {
+        Error::new(Kind::Tls, Some(e))
+    }
+
+    pub(crate) fn io(e: io::Error) -> Error {
+        Error::new(Kind::Io, Some(Box::new(e)))
+    }
+
+    pub(crate) fn authentication(e: Box<dyn error::Error + Sync + Send>) -> Error {
+        Error::new(Kind::Authentication, Some(e))
+    }
+
+    pub(crate) fn config_parse(e: Box<dyn error::Error + Sync + Send>) -> Error {
+        Error::new(Kind::ConfigParse, Some(e))
+    }
+
+    pub(crate) fn config(e: Box<dyn error::Error + Sync + Send>) -> Error {
+        Error::new(Kind::Config, Some(e))
+    }
+
+    pub(crate) fn connect(e: io::Error) -> Error {
+        Error::new(Kind::Connect, Some(Box::new(e)))
+    }
+
+    #[doc(hidden)]
+    pub fn __private_api_timeout() -> Error {
+        Error::new(Kind::Timeout, None)
+    }
+}
diff --git a/libs/proxy/tokio-postgres2/src/error/sqlstate.rs b/libs/proxy/tokio-postgres2/src/error/sqlstate.rs
new file mode 100644
index 0000000000..13a1d75f95
--- /dev/null
+++ b/libs/proxy/tokio-postgres2/src/error/sqlstate.rs
@@ -0,0 +1,1670 @@
+// Autogenerated file - DO NOT EDIT
+
+/// A SQLSTATE error code
+#[derive(PartialEq, Eq, Clone, Debug)]
+pub struct SqlState(Inner);
+
+impl SqlState {
+    /// Creates a `SqlState` from its error code.
+    pub fn from_code(s: &str) -> SqlState {
+        match SQLSTATE_MAP.get(s) {
+            Some(state) => state.clone(),
+            None => SqlState(Inner::Other(s.into())),
+        }
+    }
+
+    /// Returns the error code corresponding to the `SqlState`.
+    pub fn code(&self) -> &str {
+        match &self.0 {
+            Inner::E00000 => "00000",
+            Inner::E01000 => "01000",
+            Inner::E0100C => "0100C",
+            Inner::E01008 => "01008",
+            Inner::E01003 => "01003",
+            Inner::E01007 => "01007",
+            Inner::E01006 => "01006",
+            Inner::E01004 => "01004",
+            Inner::E01P01 => "01P01",
+            Inner::E02000 => "02000",
+            Inner::E02001 => "02001",
+            Inner::E03000 => "03000",
+            Inner::E08000 => "08000",
+            Inner::E08003 => "08003",
+            Inner::E08006 => "08006",
+            Inner::E08001 => "08001",
+            Inner::E08004 => "08004",
+            Inner::E08007 => "08007",
+            Inner::E08P01 => "08P01",
+            Inner::E09000 => "09000",
+            Inner::E0A000 => "0A000",
+            Inner::E0B000 => "0B000",
+            Inner::E0F000 => "0F000",
+            Inner::E0F001 => "0F001",
+            Inner::E0L000 => "0L000",
+            Inner::E0LP01 => "0LP01",
+            Inner::E0P000 => "0P000",
+            Inner::E0Z000 => "0Z000",
+            Inner::E0Z002 => "0Z002",
+            Inner::E20000 => "20000",
+            Inner::E21000 => "21000",
+            Inner::E22000 => "22000",
+            Inner::E2202E => "2202E",
+            Inner::E22021 => "22021",
+            Inner::E22008 => "22008",
+            Inner::E22012 => "22012",
+            Inner::E22005 => "22005",
+            Inner::E2200B => "2200B",
+            Inner::E22022 => "22022",
+            Inner::E22015 => "22015",
+            Inner::E2201E => "2201E",
+            Inner::E22014 => "22014",
+            Inner::E22016 => "22016",
+            Inner::E2201F => "2201F",
+            Inner::E2201G => "2201G",
+            Inner::E22018 => "22018",
+            Inner::E22007 => "22007",
+            Inner::E22019 => "22019",
+            Inner::E2200D => "2200D",
+            Inner::E22025 => "22025",
+            Inner::E22P06 => "22P06",
+            Inner::E22010 => "22010",
+            Inner::E22023 => "22023",
+            Inner::E22013 => "22013",
+            Inner::E2201B => "2201B",
+            Inner::E2201W => "2201W",
+            Inner::E2201X => "2201X",
+            Inner::E2202H => "2202H",
+            Inner::E2202G => "2202G",
+            Inner::E22009 => "22009",
+            Inner::E2200C => "2200C",
+            Inner::E2200G => "2200G",
+            Inner::E22004 => "22004",
+            Inner::E22002 => "22002",
+            Inner::E22003 => "22003",
+            Inner::E2200H => "2200H",
+            Inner::E22026 => "22026",
+            Inner::E22001 => "22001",
+            Inner::E22011 => "22011",
+            Inner::E22027 => "22027",
+            Inner::E22024 => "22024",
+            Inner::E2200F => "2200F",
+            Inner::E22P01 => "22P01",
+            Inner::E22P02 => "22P02",
+            Inner::E22P03 => "22P03",
+            Inner::E22P04 => "22P04",
+            Inner::E22P05 => "22P05",
+            Inner::E2200L => "2200L",
+            Inner::E2200M => "2200M",
+            Inner::E2200N => "2200N",
+            Inner::E2200S => "2200S",
+            Inner::E2200T => "2200T",
+            Inner::E22030 => "22030",
+            Inner::E22031 => "22031",
+            Inner::E22032 => "22032",
+            Inner::E22033 => "22033",
+            Inner::E22034 => "22034",
+            Inner::E22035 => "22035",
+            Inner::E22036 => "22036",
+            Inner::E22037 => "22037",
+            Inner::E22038 => "22038",
+            Inner::E22039 => "22039",
+            Inner::E2203A => "2203A",
+            Inner::E2203B => "2203B",
+            Inner::E2203C => "2203C",
+            Inner::E2203D => "2203D",
+            Inner::E2203E => "2203E",
+            Inner::E2203F => "2203F",
+            Inner::E2203G => "2203G",
+            Inner::E23000 => "23000",
+            Inner::E23001 => "23001",
+            Inner::E23502 => "23502",
+            Inner::E23503 => "23503",
+            Inner::E23505 => "23505",
+            Inner::E23514 => "23514",
+            Inner::E23P01 => "23P01",
+            Inner::E24000 => "24000",
+            Inner::E25000 => "25000",
+            Inner::E25001 => "25001",
+            Inner::E25002 => "25002",
+            Inner::E25008 => "25008",
+            Inner::E25003 => "25003",
+            Inner::E25004 => "25004",
+            Inner::E25005 => "25005",
+            Inner::E25006 => "25006",
+            Inner::E25007 => "25007",
+            Inner::E25P01 => "25P01",
+            Inner::E25P02 => "25P02",
+            Inner::E25P03 => "25P03",
+            Inner::E26000 => "26000",
+            Inner::E27000 => "27000",
+            Inner::E28000 => "28000",
+            Inner::E28P01 => "28P01",
+            Inner::E2B000 => "2B000",
+            Inner::E2BP01 => "2BP01",
+            Inner::E2D000 => "2D000",
+            Inner::E2F000 => "2F000",
+            Inner::E2F005 => "2F005",
+            Inner::E2F002 => "2F002",
+            Inner::E2F003 => "2F003",
+            Inner::E2F004 => "2F004",
+            Inner::E34000 => "34000",
+            Inner::E38000 => "38000",
+            Inner::E38001 => "38001",
+            Inner::E38002 => "38002",
+            Inner::E38003 => "38003",
+            Inner::E38004 => "38004",
+            Inner::E39000 => "39000",
+            Inner::E39001 => "39001",
+            Inner::E39004 => "39004",
+            Inner::E39P01 => "39P01",
+            Inner::E39P02 => "39P02",
+            Inner::E39P03 => "39P03",
+            Inner::E3B000 => "3B000",
+            Inner::E3B001 => "3B001",
+            Inner::E3D000 => "3D000",
+            Inner::E3F000 => "3F000",
+            Inner::E40000 => "40000",
+            Inner::E40002 => "40002",
+            Inner::E40001 => "40001",
+            Inner::E40003 => "40003",
+            Inner::E40P01 => "40P01",
+            Inner::E42000 => "42000",
+            Inner::E42601 => "42601",
+            Inner::E42501 => "42501",
+            Inner::E42846 => "42846",
+            Inner::E42803 => "42803",
+            Inner::E42P20 => "42P20",
+            Inner::E42P19 => "42P19",
+            Inner::E42830 => "42830",
+            Inner::E42602 => "42602",
+            Inner::E42622 => "42622",
+            Inner::E42939 => "42939",
+            Inner::E42804 => "42804",
+            Inner::E42P18 => "42P18",
+            Inner::E42P21 => "42P21",
+            Inner::E42P22 => "42P22",
+            Inner::E42809 => "42809",
+            Inner::E428C9 => "428C9",
+            Inner::E42703 => "42703",
+            Inner::E42883 => "42883",
+            Inner::E42P01 => "42P01",
+            Inner::E42P02 => "42P02",
+            Inner::E42704 => "42704",
+            Inner::E42701 => "42701",
+            Inner::E42P03 => "42P03",
+            Inner::E42P04 => "42P04",
+            Inner::E42723 => "42723",
+            Inner::E42P05 => "42P05",
+            Inner::E42P06 => "42P06",
+            Inner::E42P07 => "42P07",
+            Inner::E42712 => "42712",
+            Inner::E42710 => "42710",
+            Inner::E42702 => "42702",
+            Inner::E42725 => "42725",
+            Inner::E42P08 => "42P08",
+            Inner::E42P09 => "42P09",
+            Inner::E42P10 => "42P10",
+            Inner::E42611 => "42611",
+            Inner::E42P11 => "42P11",
+            Inner::E42P12 => "42P12",
+            Inner::E42P13 => "42P13",
+            Inner::E42P14 => "42P14",
+            Inner::E42P15 => "42P15",
+            Inner::E42P16 => "42P16",
+            Inner::E42P17 => "42P17",
+            Inner::E44000 => "44000",
+            Inner::E53000 => "53000",
+            Inner::E53100 => "53100",
+            Inner::E53200 => "53200",
+            Inner::E53300 => "53300",
+            Inner::E53400 => "53400",
+            Inner::E54000 => "54000",
+            Inner::E54001 => "54001",
+            Inner::E54011 => "54011",
+            Inner::E54023 => "54023",
+            Inner::E55000 => "55000",
+            Inner::E55006 => "55006",
+            Inner::E55P02 => "55P02",
+            Inner::E55P03 => "55P03",
+            Inner::E55P04 => "55P04",
+            Inner::E57000 => "57000",
+            Inner::E57014 => "57014",
+            Inner::E57P01 => "57P01",
+            Inner::E57P02 => "57P02",
+            Inner::E57P03 => "57P03",
+            Inner::E57P04 => "57P04",
+            Inner::E57P05 => "57P05",
+            Inner::E58000 => "58000",
+            Inner::E58030 => "58030",
+            Inner::E58P01 => "58P01",
+            Inner::E58P02 => "58P02",
+            Inner::E72000 => "72000",
+            Inner::EF0000 => "F0000",
+            Inner::EF0001 => "F0001",
+            Inner::EHV000 => "HV000",
+            Inner::EHV005 => "HV005",
+            Inner::EHV002 => "HV002",
+            Inner::EHV010 => "HV010",
+            Inner::EHV021 => "HV021",
+            Inner::EHV024 => "HV024",
+            Inner::EHV007 => "HV007",
+            Inner::EHV008 => "HV008",
+            Inner::EHV004 => "HV004",
+            Inner::EHV006 => "HV006",
+            Inner::EHV091 => "HV091",
+            Inner::EHV00B => "HV00B",
+            Inner::EHV00C => "HV00C",
+            Inner::EHV00D => "HV00D",
+            Inner::EHV090 => "HV090",
+            Inner::EHV00A => "HV00A",
+            Inner::EHV009 => "HV009",
+            Inner::EHV014 => "HV014",
+            Inner::EHV001 => "HV001",
+            Inner::EHV00P => "HV00P",
+            Inner::EHV00J => "HV00J",
+            Inner::EHV00K => "HV00K",
+            Inner::EHV00Q => "HV00Q",
+            Inner::EHV00R => "HV00R",
+            Inner::EHV00L => "HV00L",
+            Inner::EHV00M => "HV00M",
+            Inner::EHV00N => "HV00N",
+            Inner::EP0000 => "P0000",
+            Inner::EP0001 => "P0001",
+            Inner::EP0002 => "P0002",
+            Inner::EP0003 => "P0003",
+            Inner::EP0004 => "P0004",
+            Inner::EXX000 => "XX000",
+            Inner::EXX001 => "XX001",
+            Inner::EXX002 => "XX002",
+            Inner::Other(code) => code,
+        }
+    }
+
+    /// 00000
+    pub const SUCCESSFUL_COMPLETION: SqlState = SqlState(Inner::E00000);
+
+    /// 01000
+    pub const WARNING: SqlState = SqlState(Inner::E01000);
+
+    /// 0100C
+    pub const WARNING_DYNAMIC_RESULT_SETS_RETURNED: SqlState = SqlState(Inner::E0100C);
+
+    /// 01008
+    pub const WARNING_IMPLICIT_ZERO_BIT_PADDING: SqlState = SqlState(Inner::E01008);
+
+    /// 01003
+    pub const WARNING_NULL_VALUE_ELIMINATED_IN_SET_FUNCTION: SqlState = SqlState(Inner::E01003);
+
+    /// 01007
+    pub const WARNING_PRIVILEGE_NOT_GRANTED: SqlState = SqlState(Inner::E01007);
+
+    /// 01006
+    pub const WARNING_PRIVILEGE_NOT_REVOKED: SqlState = SqlState(Inner::E01006);
+
+    /// 01004
+    pub const WARNING_STRING_DATA_RIGHT_TRUNCATION: SqlState = SqlState(Inner::E01004);
+
+    /// 01P01
+    pub const WARNING_DEPRECATED_FEATURE: SqlState = SqlState(Inner::E01P01);
+
+    /// 02000
+    pub const NO_DATA: SqlState = SqlState(Inner::E02000);
+
+    /// 02001
+    pub const NO_ADDITIONAL_DYNAMIC_RESULT_SETS_RETURNED: SqlState = SqlState(Inner::E02001);
+
+    /// 03000
+    pub const SQL_STATEMENT_NOT_YET_COMPLETE: SqlState = SqlState(Inner::E03000);
+
+    /// 08000
+    pub const CONNECTION_EXCEPTION: SqlState = SqlState(Inner::E08000);
+
+    /// 08003
+    pub const CONNECTION_DOES_NOT_EXIST: SqlState = SqlState(Inner::E08003);
+
+    /// 08006
+    pub const CONNECTION_FAILURE: SqlState = SqlState(Inner::E08006);
+
+    /// 08001
+    pub const SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION: SqlState = SqlState(Inner::E08001);
+
+    /// 08004
+    pub const SQLSERVER_REJECTED_ESTABLISHMENT_OF_SQLCONNECTION: SqlState = SqlState(Inner::E08004);
+
+    /// 08007
+    pub const TRANSACTION_RESOLUTION_UNKNOWN: SqlState = SqlState(Inner::E08007);
+
+    /// 08P01
+    pub const PROTOCOL_VIOLATION: SqlState = SqlState(Inner::E08P01);
+
+    /// 09000
+    pub const TRIGGERED_ACTION_EXCEPTION: SqlState = SqlState(Inner::E09000);
+
+    /// 0A000
+    pub const FEATURE_NOT_SUPPORTED: SqlState = SqlState(Inner::E0A000);
+
+    /// 0B000
+    pub const INVALID_TRANSACTION_INITIATION: SqlState = SqlState(Inner::E0B000);
+
+    /// 0F000
+    pub const LOCATOR_EXCEPTION: SqlState = SqlState(Inner::E0F000);
+
+    /// 0F001
+    pub const L_E_INVALID_SPECIFICATION: SqlState = SqlState(Inner::E0F001);
+
+    /// 0L000
+    pub const INVALID_GRANTOR: SqlState = SqlState(Inner::E0L000);
+
+    /// 0LP01
+    pub const INVALID_GRANT_OPERATION: SqlState = SqlState(Inner::E0LP01);
+
+    /// 0P000
+    pub const INVALID_ROLE_SPECIFICATION: SqlState = SqlState(Inner::E0P000);
+
+    /// 0Z000
+    pub const DIAGNOSTICS_EXCEPTION: SqlState = SqlState(Inner::E0Z000);
+
+    /// 0Z002
+    pub const STACKED_DIAGNOSTICS_ACCESSED_WITHOUT_ACTIVE_HANDLER: SqlState =
+        SqlState(Inner::E0Z002);
+
+    /// 20000
+    pub const CASE_NOT_FOUND: SqlState = SqlState(Inner::E20000);
+
+    /// 21000
+    pub const CARDINALITY_VIOLATION: SqlState = SqlState(Inner::E21000);
+
+    /// 22000
+    pub const DATA_EXCEPTION: SqlState = SqlState(Inner::E22000);
+
+    /// 2202E
+    pub const ARRAY_ELEMENT_ERROR: SqlState = SqlState(Inner::E2202E);
+
+    /// 2202E
+    pub const ARRAY_SUBSCRIPT_ERROR: SqlState = SqlState(Inner::E2202E);
+
+    /// 22021
+    pub const CHARACTER_NOT_IN_REPERTOIRE: SqlState = SqlState(Inner::E22021);
+
+    /// 22008
+    pub const DATETIME_FIELD_OVERFLOW: SqlState = SqlState(Inner::E22008);
+
+    /// 22008
+    pub const DATETIME_VALUE_OUT_OF_RANGE: SqlState = SqlState(Inner::E22008);
+
+    /// 22012
+    pub const DIVISION_BY_ZERO: SqlState = SqlState(Inner::E22012);
+
+    /// 22005
+    pub const ERROR_IN_ASSIGNMENT: SqlState = SqlState(Inner::E22005);
+
+    /// 2200B
+    pub const ESCAPE_CHARACTER_CONFLICT: SqlState = SqlState(Inner::E2200B);
+
+    /// 22022
+    pub const INDICATOR_OVERFLOW: SqlState = SqlState(Inner::E22022);
+
+    /// 22015
+    pub const INTERVAL_FIELD_OVERFLOW: SqlState = SqlState(Inner::E22015);
+
+    /// 2201E
+    pub const INVALID_ARGUMENT_FOR_LOG: SqlState = SqlState(Inner::E2201E);
+
+    /// 22014
+    pub const INVALID_ARGUMENT_FOR_NTILE: SqlState = SqlState(Inner::E22014);
+
+    /// 22016
+    pub const INVALID_ARGUMENT_FOR_NTH_VALUE: SqlState = SqlState(Inner::E22016);
+
+    /// 2201F
+    pub const INVALID_ARGUMENT_FOR_POWER_FUNCTION: SqlState = SqlState(Inner::E2201F);
+
+    /// 2201G
+    pub const INVALID_ARGUMENT_FOR_WIDTH_BUCKET_FUNCTION: SqlState = SqlState(Inner::E2201G);
+
+    /// 22018
+    pub const INVALID_CHARACTER_VALUE_FOR_CAST: SqlState = SqlState(Inner::E22018);
+
+    /// 22007
+    pub const INVALID_DATETIME_FORMAT: SqlState = SqlState(Inner::E22007);
+
+    /// 22019
+    pub const INVALID_ESCAPE_CHARACTER: SqlState = SqlState(Inner::E22019);
+
+    /// 2200D
+    pub const INVALID_ESCAPE_OCTET: SqlState = SqlState(Inner::E2200D);
+
+    /// 22025
+    pub const INVALID_ESCAPE_SEQUENCE: SqlState = SqlState(Inner::E22025);
+
+    /// 22P06
+    pub const NONSTANDARD_USE_OF_ESCAPE_CHARACTER: SqlState = SqlState(Inner::E22P06);
+
+    /// 22010
+    pub const INVALID_INDICATOR_PARAMETER_VALUE: SqlState = SqlState(Inner::E22010);
+
+    /// 22023
+    pub const INVALID_PARAMETER_VALUE: SqlState = SqlState(Inner::E22023);
+
+    /// 22013
+    pub const INVALID_PRECEDING_OR_FOLLOWING_SIZE: SqlState = SqlState(Inner::E22013);
+
+    /// 2201B
+    pub const INVALID_REGULAR_EXPRESSION: SqlState = SqlState(Inner::E2201B);
+
+    /// 2201W
+    pub const INVALID_ROW_COUNT_IN_LIMIT_CLAUSE: SqlState = SqlState(Inner::E2201W);
+
+    /// 2201X
+    pub const INVALID_ROW_COUNT_IN_RESULT_OFFSET_CLAUSE: SqlState = SqlState(Inner::E2201X);
+
+    /// 2202H
+    pub const INVALID_TABLESAMPLE_ARGUMENT: SqlState = SqlState(Inner::E2202H);
+
+    /// 2202G
+    pub const INVALID_TABLESAMPLE_REPEAT: SqlState = SqlState(Inner::E2202G);
+
+    /// 22009
+    pub const INVALID_TIME_ZONE_DISPLACEMENT_VALUE: SqlState = SqlState(Inner::E22009);
+
+    /// 2200C
+    pub const INVALID_USE_OF_ESCAPE_CHARACTER: SqlState = SqlState(Inner::E2200C);
+
+    /// 2200G
+    pub const MOST_SPECIFIC_TYPE_MISMATCH: SqlState = SqlState(Inner::E2200G);
+
+    /// 22004
+    pub const NULL_VALUE_NOT_ALLOWED: SqlState = SqlState(Inner::E22004);
+
+    /// 22002
+    pub const NULL_VALUE_NO_INDICATOR_PARAMETER: SqlState = SqlState(Inner::E22002);
+
+    /// 22003
+    pub const NUMERIC_VALUE_OUT_OF_RANGE: SqlState = SqlState(Inner::E22003);
+
+    /// 2200H
+    pub const SEQUENCE_GENERATOR_LIMIT_EXCEEDED: SqlState = SqlState(Inner::E2200H);
+
+    /// 22026
+    pub const STRING_DATA_LENGTH_MISMATCH: SqlState = SqlState(Inner::E22026);
+
+    /// 22001
+    pub const STRING_DATA_RIGHT_TRUNCATION: SqlState = SqlState(Inner::E22001);
+
+    /// 22011
+    pub const SUBSTRING_ERROR: SqlState = SqlState(Inner::E22011);
+
+    /// 22027
+    pub const TRIM_ERROR: SqlState = SqlState(Inner::E22027);
+
+    /// 22024
+    pub const UNTERMINATED_C_STRING: SqlState = SqlState(Inner::E22024);
+
+    /// 2200F
+    pub const ZERO_LENGTH_CHARACTER_STRING: SqlState = SqlState(Inner::E2200F);
+
+    /// 22P01
+    pub const FLOATING_POINT_EXCEPTION: SqlState = SqlState(Inner::E22P01);
+
+    /// 22P02
+    pub const INVALID_TEXT_REPRESENTATION: SqlState = SqlState(Inner::E22P02);
+
+    /// 22P03
+    pub const INVALID_BINARY_REPRESENTATION: SqlState = SqlState(Inner::E22P03);
+
+    /// 22P04
+    pub const BAD_COPY_FILE_FORMAT: SqlState = SqlState(Inner::E22P04);
+
+    /// 22P05
+    pub const UNTRANSLATABLE_CHARACTER: SqlState = SqlState(Inner::E22P05);
+
+    /// 2200L
+    pub const NOT_AN_XML_DOCUMENT: SqlState = SqlState(Inner::E2200L);
+
+    /// 2200M
+    pub const INVALID_XML_DOCUMENT: SqlState = SqlState(Inner::E2200M);
+
+    /// 2200N
+    pub const INVALID_XML_CONTENT: SqlState = SqlState(Inner::E2200N);
+
+    /// 2200S
+    pub const INVALID_XML_COMMENT: SqlState = SqlState(Inner::E2200S);
+
+    /// 2200T
+    pub const INVALID_XML_PROCESSING_INSTRUCTION: SqlState = SqlState(Inner::E2200T);
+
+    /// 22030
+    pub const DUPLICATE_JSON_OBJECT_KEY_VALUE: SqlState = SqlState(Inner::E22030);
+
+    /// 22031
+    pub const INVALID_ARGUMENT_FOR_SQL_JSON_DATETIME_FUNCTION: SqlState = SqlState(Inner::E22031);
+
+    /// 22032
+    pub const INVALID_JSON_TEXT: SqlState = SqlState(Inner::E22032);
+
+    /// 22033
+    pub const INVALID_SQL_JSON_SUBSCRIPT: SqlState = SqlState(Inner::E22033);
+
+    /// 22034
+    pub const MORE_THAN_ONE_SQL_JSON_ITEM: SqlState = SqlState(Inner::E22034);
+
+    /// 22035
+    pub const NO_SQL_JSON_ITEM: SqlState = SqlState(Inner::E22035);
+
+    /// 22036
+    pub const NON_NUMERIC_SQL_JSON_ITEM: SqlState = SqlState(Inner::E22036);
+
+    /// 22037
+    pub const NON_UNIQUE_KEYS_IN_A_JSON_OBJECT: SqlState = SqlState(Inner::E22037);
+
+    /// 22038
+    pub const SINGLETON_SQL_JSON_ITEM_REQUIRED: SqlState = SqlState(Inner::E22038);
+
+    /// 22039
+    pub const SQL_JSON_ARRAY_NOT_FOUND: SqlState = SqlState(Inner::E22039);
+
+    /// 2203A
+    pub const SQL_JSON_MEMBER_NOT_FOUND: SqlState = SqlState(Inner::E2203A);
+
+    /// 2203B
+    pub const SQL_JSON_NUMBER_NOT_FOUND: SqlState = SqlState(Inner::E2203B);
+
+    /// 2203C
+    pub const SQL_JSON_OBJECT_NOT_FOUND: SqlState = SqlState(Inner::E2203C);
+
+    /// 2203D
+    pub const TOO_MANY_JSON_ARRAY_ELEMENTS: SqlState = SqlState(Inner::E2203D);
+
+    /// 2203E
+    pub const TOO_MANY_JSON_OBJECT_MEMBERS: SqlState = SqlState(Inner::E2203E);
+
+    /// 2203F
+    pub const SQL_JSON_SCALAR_REQUIRED: SqlState = SqlState(Inner::E2203F);
+
+    /// 2203G
+    pub const SQL_JSON_ITEM_CANNOT_BE_CAST_TO_TARGET_TYPE: SqlState = SqlState(Inner::E2203G);
+
+    /// 23000
+    pub const INTEGRITY_CONSTRAINT_VIOLATION: SqlState = SqlState(Inner::E23000);
+
+    /// 23001
+    pub const RESTRICT_VIOLATION: SqlState = SqlState(Inner::E23001);
+
+    /// 23502
+    pub const NOT_NULL_VIOLATION: SqlState = SqlState(Inner::E23502);
+
+    /// 23503
+    pub const FOREIGN_KEY_VIOLATION: SqlState = SqlState(Inner::E23503);
+
+    /// 23505
+    pub const UNIQUE_VIOLATION: SqlState = SqlState(Inner::E23505);
+
+    /// 23514
+    pub const CHECK_VIOLATION: SqlState = SqlState(Inner::E23514);
+
+    /// 23P01
+    pub const EXCLUSION_VIOLATION: SqlState = SqlState(Inner::E23P01);
+
+    /// 24000
+    pub const INVALID_CURSOR_STATE: SqlState = SqlState(Inner::E24000);
+
+    /// 25000
+    pub const INVALID_TRANSACTION_STATE: SqlState = SqlState(Inner::E25000);
+
+    /// 25001
+    pub const ACTIVE_SQL_TRANSACTION: SqlState = SqlState(Inner::E25001);
+
+    /// 25002
+    pub const BRANCH_TRANSACTION_ALREADY_ACTIVE: SqlState = SqlState(Inner::E25002);
+
+    /// 25008
+    pub const HELD_CURSOR_REQUIRES_SAME_ISOLATION_LEVEL: SqlState = SqlState(Inner::E25008);
+
+    /// 25003
+    pub const INAPPROPRIATE_ACCESS_MODE_FOR_BRANCH_TRANSACTION: SqlState = SqlState(Inner::E25003);
+
+    /// 25004
+    pub const INAPPROPRIATE_ISOLATION_LEVEL_FOR_BRANCH_TRANSACTION: SqlState =
+        SqlState(Inner::E25004);
+
+    /// 25005
+    pub const NO_ACTIVE_SQL_TRANSACTION_FOR_BRANCH_TRANSACTION: SqlState = SqlState(Inner::E25005);
+
+    /// 25006
+    pub const READ_ONLY_SQL_TRANSACTION: SqlState = SqlState(Inner::E25006);
+
+    /// 25007
+    pub const SCHEMA_AND_DATA_STATEMENT_MIXING_NOT_SUPPORTED: SqlState = SqlState(Inner::E25007);
+
+    /// 25P01
+    pub const NO_ACTIVE_SQL_TRANSACTION: SqlState = SqlState(Inner::E25P01);
+
+    /// 25P02
+    pub const IN_FAILED_SQL_TRANSACTION: SqlState = SqlState(Inner::E25P02);
+
+    /// 25P03
+    pub const IDLE_IN_TRANSACTION_SESSION_TIMEOUT: SqlState = SqlState(Inner::E25P03);
+
+    /// 26000
+    pub const INVALID_SQL_STATEMENT_NAME: SqlState = SqlState(Inner::E26000);
+
+    /// 26000
+    pub const UNDEFINED_PSTATEMENT: SqlState = SqlState(Inner::E26000);
+
+    /// 27000
+    pub const TRIGGERED_DATA_CHANGE_VIOLATION: SqlState = SqlState(Inner::E27000);
+
+    /// 28000
+    pub const INVALID_AUTHORIZATION_SPECIFICATION: SqlState = SqlState(Inner::E28000);
+
+    /// 28P01
+    pub const INVALID_PASSWORD: SqlState = SqlState(Inner::E28P01);
+
+    /// 2B000
+    pub const DEPENDENT_PRIVILEGE_DESCRIPTORS_STILL_EXIST: SqlState = SqlState(Inner::E2B000);
+
+    /// 2BP01
+    pub const DEPENDENT_OBJECTS_STILL_EXIST: SqlState = SqlState(Inner::E2BP01);
+
+    /// 2D000
+    pub const INVALID_TRANSACTION_TERMINATION: SqlState = SqlState(Inner::E2D000);
+
+    /// 2F000
+    pub const SQL_ROUTINE_EXCEPTION: SqlState = SqlState(Inner::E2F000);
+
+    /// 2F005
+    pub const S_R_E_FUNCTION_EXECUTED_NO_RETURN_STATEMENT: SqlState = SqlState(Inner::E2F005);
+
+    /// 2F002
+    pub const S_R_E_MODIFYING_SQL_DATA_NOT_PERMITTED: SqlState = SqlState(Inner::E2F002);
+
+    /// 2F003
+    pub const S_R_E_PROHIBITED_SQL_STATEMENT_ATTEMPTED: SqlState = SqlState(Inner::E2F003);
+
+    /// 2F004
+    pub const S_R_E_READING_SQL_DATA_NOT_PERMITTED: SqlState = SqlState(Inner::E2F004);
+
+    /// 34000
+    pub const INVALID_CURSOR_NAME: SqlState = SqlState(Inner::E34000);
+
+    /// 34000
+    pub const UNDEFINED_CURSOR: SqlState = SqlState(Inner::E34000);
+
+    /// 38000
+    pub const EXTERNAL_ROUTINE_EXCEPTION: SqlState = SqlState(Inner::E38000);
+
+    /// 38001
+    pub const E_R_E_CONTAINING_SQL_NOT_PERMITTED: SqlState = SqlState(Inner::E38001);
+
+    /// 38002
+    pub const E_R_E_MODIFYING_SQL_DATA_NOT_PERMITTED: SqlState = SqlState(Inner::E38002);
+
+    /// 38003
+    pub const E_R_E_PROHIBITED_SQL_STATEMENT_ATTEMPTED: SqlState = SqlState(Inner::E38003);
+
+    /// 38004
+    pub const E_R_E_READING_SQL_DATA_NOT_PERMITTED: SqlState = SqlState(Inner::E38004);
+
+    /// 39000
+    pub const EXTERNAL_ROUTINE_INVOCATION_EXCEPTION: SqlState = SqlState(Inner::E39000);
+
+    /// 39001
+    pub const E_R_I_E_INVALID_SQLSTATE_RETURNED: SqlState = SqlState(Inner::E39001);
+
+    /// 39004
+    pub const E_R_I_E_NULL_VALUE_NOT_ALLOWED: SqlState = SqlState(Inner::E39004);
+
+    /// 39P01
+    pub const E_R_I_E_TRIGGER_PROTOCOL_VIOLATED: SqlState = SqlState(Inner::E39P01);
+
+    /// 39P02
+    pub const E_R_I_E_SRF_PROTOCOL_VIOLATED: SqlState = SqlState(Inner::E39P02);
+
+    /// 39P03
+    pub const E_R_I_E_EVENT_TRIGGER_PROTOCOL_VIOLATED: SqlState = SqlState(Inner::E39P03);
+
+    /// 3B000
+    pub const SAVEPOINT_EXCEPTION: SqlState = SqlState(Inner::E3B000);
+
+    /// 3B001
+    pub const S_E_INVALID_SPECIFICATION: SqlState = SqlState(Inner::E3B001);
+
+    /// 3D000
+    pub const INVALID_CATALOG_NAME: SqlState = SqlState(Inner::E3D000);
+
+    /// 3D000
+    pub const UNDEFINED_DATABASE: SqlState = SqlState(Inner::E3D000);
+
+    /// 3F000
+    pub const INVALID_SCHEMA_NAME: SqlState = SqlState(Inner::E3F000);
+
+    /// 3F000
+    pub const UNDEFINED_SCHEMA: SqlState = SqlState(Inner::E3F000);
+
+    /// 40000
+    pub const TRANSACTION_ROLLBACK: SqlState = SqlState(Inner::E40000);
+
+    /// 40002
+    pub const T_R_INTEGRITY_CONSTRAINT_VIOLATION: SqlState = SqlState(Inner::E40002);
+
+    /// 40001
+    pub const T_R_SERIALIZATION_FAILURE: SqlState = SqlState(Inner::E40001);
+
+    /// 40003
+    pub const T_R_STATEMENT_COMPLETION_UNKNOWN: SqlState = SqlState(Inner::E40003);
+
+    /// 40P01
+    pub const T_R_DEADLOCK_DETECTED: SqlState = SqlState(Inner::E40P01);
+
+    /// 42000
+    pub const SYNTAX_ERROR_OR_ACCESS_RULE_VIOLATION: SqlState = SqlState(Inner::E42000);
+
+    /// 42601
+    pub const SYNTAX_ERROR: SqlState = SqlState(Inner::E42601);
+
+    /// 42501
+    pub const INSUFFICIENT_PRIVILEGE: SqlState = SqlState(Inner::E42501);
+
+    /// 42846
+    pub const CANNOT_COERCE: SqlState = SqlState(Inner::E42846);
+
+    /// 42803
+    pub const GROUPING_ERROR: SqlState = SqlState(Inner::E42803);
+
+    /// 42P20
+    pub const WINDOWING_ERROR: SqlState = SqlState(Inner::E42P20);
+
+    /// 42P19
+    pub const INVALID_RECURSION: SqlState = SqlState(Inner::E42P19);
+
+    /// 42830
+    pub const INVALID_FOREIGN_KEY: SqlState = SqlState(Inner::E42830);
+
+    /// 42602
+    pub const INVALID_NAME: SqlState = SqlState(Inner::E42602);
+
+    /// 42622
+    pub const NAME_TOO_LONG: SqlState = SqlState(Inner::E42622);
+
+    /// 42939
+    pub const RESERVED_NAME: SqlState = SqlState(Inner::E42939);
+
+    /// 42804
+    pub const DATATYPE_MISMATCH: SqlState = SqlState(Inner::E42804);
+
+    /// 42P18
+    pub const INDETERMINATE_DATATYPE: SqlState = SqlState(Inner::E42P18);
+
+    /// 42P21
+    pub const COLLATION_MISMATCH: SqlState = SqlState(Inner::E42P21);
+
+    /// 42P22
+    pub const INDETERMINATE_COLLATION: SqlState = SqlState(Inner::E42P22);
+
+    /// 42809
+    pub const WRONG_OBJECT_TYPE: SqlState = SqlState(Inner::E42809);
+
+    /// 428C9
+    pub const GENERATED_ALWAYS: SqlState = SqlState(Inner::E428C9);
+
+    /// 42703
+    pub const UNDEFINED_COLUMN: SqlState = SqlState(Inner::E42703);
+
+    /// 42883
+    pub const UNDEFINED_FUNCTION: SqlState = SqlState(Inner::E42883);
+
+    /// 42P01
+    pub const UNDEFINED_TABLE: SqlState = SqlState(Inner::E42P01);
+
+    /// 42P02
+    pub const UNDEFINED_PARAMETER: SqlState = SqlState(Inner::E42P02);
+
+    /// 42704
+    pub const UNDEFINED_OBJECT: SqlState = SqlState(Inner::E42704);
+
+    /// 42701
+    pub const DUPLICATE_COLUMN: SqlState = SqlState(Inner::E42701);
+
+    /// 42P03
+    pub const DUPLICATE_CURSOR: SqlState = SqlState(Inner::E42P03);
+
+    /// 42P04
+    pub const DUPLICATE_DATABASE: SqlState = SqlState(Inner::E42P04);
+
+    /// 42723
+    pub const DUPLICATE_FUNCTION: SqlState = SqlState(Inner::E42723);
+
+    /// 42P05
+    pub const DUPLICATE_PSTATEMENT: SqlState = SqlState(Inner::E42P05);
+
+    /// 42P06
+    pub const DUPLICATE_SCHEMA: SqlState = SqlState(Inner::E42P06);
+
+    /// 42P07
+    pub const DUPLICATE_TABLE: SqlState = SqlState(Inner::E42P07);
+
+    /// 42712
+    pub const DUPLICATE_ALIAS: SqlState = SqlState(Inner::E42712);
+
+    /// 42710
+    pub const DUPLICATE_OBJECT: SqlState = SqlState(Inner::E42710);
+
+    /// 42702
+    pub const AMBIGUOUS_COLUMN: SqlState = SqlState(Inner::E42702);
+
+    /// 42725
+    pub const AMBIGUOUS_FUNCTION: SqlState = SqlState(Inner::E42725);
+
+    /// 42P08
+    pub const AMBIGUOUS_PARAMETER: SqlState = SqlState(Inner::E42P08);
+
+    /// 42P09
+    pub const AMBIGUOUS_ALIAS: SqlState = SqlState(Inner::E42P09);
+
+    /// 42P10
+    pub const INVALID_COLUMN_REFERENCE: SqlState = SqlState(Inner::E42P10);
+
+    /// 42611
+    pub const INVALID_COLUMN_DEFINITION: SqlState = SqlState(Inner::E42611);
+
+    /// 42P11
+    pub const INVALID_CURSOR_DEFINITION: SqlState = SqlState(Inner::E42P11);
+
+    /// 42P12
+    pub const INVALID_DATABASE_DEFINITION: SqlState = SqlState(Inner::E42P12);
+
+    /// 42P13
+    pub const INVALID_FUNCTION_DEFINITION: SqlState = SqlState(Inner::E42P13);
+
+    /// 42P14
+    pub const INVALID_PSTATEMENT_DEFINITION: SqlState = SqlState(Inner::E42P14);
+
+    /// 42P15
+    pub const INVALID_SCHEMA_DEFINITION: SqlState = SqlState(Inner::E42P15);
+
+    /// 42P16
+    pub const INVALID_TABLE_DEFINITION: SqlState = SqlState(Inner::E42P16);
+
+    /// 42P17
+    pub const INVALID_OBJECT_DEFINITION: SqlState = SqlState(Inner::E42P17);
+
+    /// 44000
+    pub const WITH_CHECK_OPTION_VIOLATION: SqlState = SqlState(Inner::E44000);
+
+    /// 53000
+    pub const INSUFFICIENT_RESOURCES: SqlState = SqlState(Inner::E53000);
+
+    /// 53100
+    pub const DISK_FULL: SqlState = SqlState(Inner::E53100);
+
+    /// 53200
+    pub const OUT_OF_MEMORY: SqlState = SqlState(Inner::E53200);
+
+    /// 53300
+    pub const TOO_MANY_CONNECTIONS: SqlState = SqlState(Inner::E53300);
+
+    /// 53400
+    pub const CONFIGURATION_LIMIT_EXCEEDED: SqlState = SqlState(Inner::E53400);
+
+    /// 54000
+    pub const PROGRAM_LIMIT_EXCEEDED: SqlState = SqlState(Inner::E54000);
+
+    /// 54001
+    pub const STATEMENT_TOO_COMPLEX: SqlState = SqlState(Inner::E54001);
+
+    /// 54011
+    pub const TOO_MANY_COLUMNS: SqlState = SqlState(Inner::E54011);
+
+    /// 54023
+    pub const TOO_MANY_ARGUMENTS: SqlState = SqlState(Inner::E54023);
+
+    /// 55000
+    pub const OBJECT_NOT_IN_PREREQUISITE_STATE: SqlState = SqlState(Inner::E55000);
+
+    /// 55006
+    pub const OBJECT_IN_USE: SqlState = SqlState(Inner::E55006);
+
+    /// 55P02
+    pub const CANT_CHANGE_RUNTIME_PARAM: SqlState = SqlState(Inner::E55P02);
+
+    /// 55P03
+    pub const LOCK_NOT_AVAILABLE: SqlState = SqlState(Inner::E55P03);
+
+    /// 55P04
+    pub const UNSAFE_NEW_ENUM_VALUE_USAGE: SqlState = SqlState(Inner::E55P04);
+
+    /// 57000
+    pub const OPERATOR_INTERVENTION: SqlState = SqlState(Inner::E57000);
+
+    /// 57014
+    pub const QUERY_CANCELED: SqlState = SqlState(Inner::E57014);
+
+    /// 57P01
+    pub const ADMIN_SHUTDOWN: SqlState = SqlState(Inner::E57P01);
+
+    /// 57P02
+    pub const CRASH_SHUTDOWN: SqlState = SqlState(Inner::E57P02);
+
+    /// 57P03
+    pub const CANNOT_CONNECT_NOW: SqlState = SqlState(Inner::E57P03);
+
+    /// 57P04
+    pub const DATABASE_DROPPED: SqlState = SqlState(Inner::E57P04);
+
+    /// 57P05
+    pub const IDLE_SESSION_TIMEOUT: SqlState = SqlState(Inner::E57P05);
+
+    /// 58000
+    pub const SYSTEM_ERROR: SqlState = SqlState(Inner::E58000);
+
+    /// 58030
+    pub const IO_ERROR: SqlState = SqlState(Inner::E58030);
+
+    /// 58P01
+    pub const UNDEFINED_FILE: SqlState = SqlState(Inner::E58P01);
+
+    /// 58P02
+    pub const DUPLICATE_FILE: SqlState = SqlState(Inner::E58P02);
+
+    /// 72000
+    pub const SNAPSHOT_TOO_OLD: SqlState = SqlState(Inner::E72000);
+
+    /// F0000
+    pub const CONFIG_FILE_ERROR: SqlState = SqlState(Inner::EF0000);
+
+    /// F0001
+    pub const LOCK_FILE_EXISTS: SqlState = SqlState(Inner::EF0001);
+
+    /// HV000
+    pub const FDW_ERROR: SqlState = SqlState(Inner::EHV000);
+
+    /// HV005
+    pub const FDW_COLUMN_NAME_NOT_FOUND: SqlState = SqlState(Inner::EHV005);
+
+    /// HV002
+    pub const FDW_DYNAMIC_PARAMETER_VALUE_NEEDED: SqlState = SqlState(Inner::EHV002);
+
+    /// HV010
+    pub const FDW_FUNCTION_SEQUENCE_ERROR: SqlState = SqlState(Inner::EHV010);
+
+    /// HV021
+    pub const FDW_INCONSISTENT_DESCRIPTOR_INFORMATION: SqlState = SqlState(Inner::EHV021);
+
+    /// HV024
+    pub const FDW_INVALID_ATTRIBUTE_VALUE: SqlState = SqlState(Inner::EHV024);
+
+    /// HV007
+    pub const FDW_INVALID_COLUMN_NAME: SqlState = SqlState(Inner::EHV007);
+
+    /// HV008
+    pub const FDW_INVALID_COLUMN_NUMBER: SqlState = SqlState(Inner::EHV008);
+
+    /// HV004
+    pub const FDW_INVALID_DATA_TYPE: SqlState = SqlState(Inner::EHV004);
+
+    /// HV006
+    pub const FDW_INVALID_DATA_TYPE_DESCRIPTORS: SqlState = SqlState(Inner::EHV006);
+
+    /// HV091
+    pub const FDW_INVALID_DESCRIPTOR_FIELD_IDENTIFIER: SqlState = SqlState(Inner::EHV091);
+
+    /// HV00B
+    pub const FDW_INVALID_HANDLE: SqlState = SqlState(Inner::EHV00B);
+
+    /// HV00C
+    pub const FDW_INVALID_OPTION_INDEX: SqlState = SqlState(Inner::EHV00C);
+
+    /// HV00D
+    pub const FDW_INVALID_OPTION_NAME: SqlState = SqlState(Inner::EHV00D);
+
+    /// HV090
+    pub const FDW_INVALID_STRING_LENGTH_OR_BUFFER_LENGTH: SqlState = SqlState(Inner::EHV090);
+
+    /// HV00A
+    pub const FDW_INVALID_STRING_FORMAT: SqlState = SqlState(Inner::EHV00A);
+
+    /// HV009
+    pub const FDW_INVALID_USE_OF_NULL_POINTER: SqlState = SqlState(Inner::EHV009);
+
+    /// HV014
+    pub const FDW_TOO_MANY_HANDLES: SqlState = SqlState(Inner::EHV014);
+
+    /// HV001
+    pub const FDW_OUT_OF_MEMORY: SqlState = SqlState(Inner::EHV001);
+
+    /// HV00P
+    pub const FDW_NO_SCHEMAS: SqlState = SqlState(Inner::EHV00P);
+
+    /// HV00J
+    pub const FDW_OPTION_NAME_NOT_FOUND: SqlState = SqlState(Inner::EHV00J);
+
+    /// HV00K
+    pub const FDW_REPLY_HANDLE: SqlState = SqlState(Inner::EHV00K);
+
+    /// HV00Q
+    pub const FDW_SCHEMA_NOT_FOUND: SqlState = SqlState(Inner::EHV00Q);
+
+    /// HV00R
+    pub const FDW_TABLE_NOT_FOUND: SqlState = SqlState(Inner::EHV00R);
+
+    /// HV00L
+    pub const FDW_UNABLE_TO_CREATE_EXECUTION: SqlState = SqlState(Inner::EHV00L);
+
+    /// HV00M
+    pub const FDW_UNABLE_TO_CREATE_REPLY: SqlState = SqlState(Inner::EHV00M);
+
+    /// HV00N
+    pub const FDW_UNABLE_TO_ESTABLISH_CONNECTION: SqlState = SqlState(Inner::EHV00N);
+
+    /// P0000
+    pub const PLPGSQL_ERROR: SqlState = SqlState(Inner::EP0000);
+
+    /// P0001
+    pub const RAISE_EXCEPTION: SqlState = SqlState(Inner::EP0001);
+
+    /// P0002
+    pub const NO_DATA_FOUND: SqlState = SqlState(Inner::EP0002);
+
+    /// P0003
+    pub const TOO_MANY_ROWS: SqlState = SqlState(Inner::EP0003);
+
+    /// P0004
+    pub const ASSERT_FAILURE: SqlState = SqlState(Inner::EP0004);
+
+    /// XX000
+    pub const INTERNAL_ERROR: SqlState = SqlState(Inner::EXX000);
+
+    /// XX001
+    pub const DATA_CORRUPTED: SqlState = SqlState(Inner::EXX001);
+
+    /// XX002
+    pub const INDEX_CORRUPTED: SqlState = SqlState(Inner::EXX002);
+}
+
+#[derive(PartialEq, Eq, Clone, Debug)]
+#[allow(clippy::upper_case_acronyms)]
+enum Inner {
+    E00000,
+    E01000,
+    E0100C,
+    E01008,
+    E01003,
+    E01007,
+    E01006,
+    E01004,
+    E01P01,
+    E02000,
+    E02001,
+    E03000,
+    E08000,
+    E08003,
+    E08006,
+    E08001,
+    E08004,
+    E08007,
+    E08P01,
+    E09000,
+    E0A000,
+    E0B000,
+    E0F000,
+    E0F001,
+    E0L000,
+    E0LP01,
+    E0P000,
+    E0Z000,
+    E0Z002,
+    E20000,
+    E21000,
+    E22000,
+    E2202E,
+    E22021,
+    E22008,
+    E22012,
+    E22005,
+    E2200B,
+    E22022,
+    E22015,
+    E2201E,
+    E22014,
+    E22016,
+    E2201F,
+    E2201G,
+    E22018,
+    E22007,
+    E22019,
+    E2200D,
+    E22025,
+    E22P06,
+    E22010,
+    E22023,
+    E22013,
+    E2201B,
+    E2201W,
+    E2201X,
+    E2202H,
+    E2202G,
+    E22009,
+    E2200C,
+    E2200G,
+    E22004,
+    E22002,
+    E22003,
+    E2200H,
+    E22026,
+    E22001,
+    E22011,
+    E22027,
+    E22024,
+    E2200F,
+    E22P01,
+    E22P02,
+    E22P03,
+    E22P04,
+    E22P05,
+    E2200L,
+    E2200M,
+    E2200N,
+    E2200S,
+    E2200T,
+    E22030,
+    E22031,
+    E22032,
+    E22033,
+    E22034,
+    E22035,
+    E22036,
+    E22037,
+    E22038,
+    E22039,
+    E2203A,
+    E2203B,
+    E2203C,
+    E2203D,
+    E2203E,
+    E2203F,
+    E2203G,
+    E23000,
+    E23001,
+    E23502,
+    E23503,
+    E23505,
+    E23514,
+    E23P01,
+    E24000,
+    E25000,
+    E25001,
+    E25002,
+    E25008,
+    E25003,
+    E25004,
+    E25005,
+    E25006,
+    E25007,
+    E25P01,
+    E25P02,
+    E25P03,
+    E26000,
+    E27000,
+    E28000,
+    E28P01,
+    E2B000,
+    E2BP01,
+    E2D000,
+    E2F000,
+    E2F005,
+    E2F002,
+    E2F003,
+    E2F004,
+    E34000,
+    E38000,
+    E38001,
+    E38002,
+    E38003,
+    E38004,
+    E39000,
+    E39001,
+    E39004,
+    E39P01,
+    E39P02,
+    E39P03,
+    E3B000,
+    E3B001,
+    E3D000,
+    E3F000,
+    E40000,
+    E40002,
+    E40001,
+    E40003,
+    E40P01,
+    E42000,
+    E42601,
+    E42501,
+    E42846,
+    E42803,
+    E42P20,
+    E42P19,
+    E42830,
+    E42602,
+    E42622,
+    E42939,
+    E42804,
+    E42P18,
+    E42P21,
+    E42P22,
+    E42809,
+    E428C9,
+    E42703,
+    E42883,
+    E42P01,
+    E42P02,
+    E42704,
+    E42701,
+    E42P03,
+    E42P04,
+    E42723,
+    E42P05,
+    E42P06,
+    E42P07,
+    E42712,
+    E42710,
+    E42702,
+    E42725,
+    E42P08,
+    E42P09,
+    E42P10,
+    E42611,
+    E42P11,
+    E42P12,
+    E42P13,
+    E42P14,
+    E42P15,
+    E42P16,
+    E42P17,
+    E44000,
+    E53000,
+    E53100,
+    E53200,
+    E53300,
+    E53400,
+    E54000,
+    E54001,
+    E54011,
+    E54023,
+    E55000,
+    E55006,
+    E55P02,
+    E55P03,
+    E55P04,
+    E57000,
+    E57014,
+    E57P01,
+    E57P02,
+    E57P03,
+    E57P04,
+    E57P05,
+    E58000,
+    E58030,
+    E58P01,
+    E58P02,
+    E72000,
+    EF0000,
+    EF0001,
+    EHV000,
+    EHV005,
+    EHV002,
+    EHV010,
+    EHV021,
+    EHV024,
+    EHV007,
+    EHV008,
+    EHV004,
+    EHV006,
+    EHV091,
+    EHV00B,
+    EHV00C,
+    EHV00D,
+    EHV090,
+    EHV00A,
+    EHV009,
+    EHV014,
+    EHV001,
+    EHV00P,
+    EHV00J,
+    EHV00K,
+    EHV00Q,
+    EHV00R,
+    EHV00L,
+    EHV00M,
+    EHV00N,
+    EP0000,
+    EP0001,
+    EP0002,
+    EP0003,
+    EP0004,
+    EXX000,
+    EXX001,
+    EXX002,
+    Other(Box<str>),
+}
+
+#[rustfmt::skip]
+static SQLSTATE_MAP: phf::Map<&'static str, SqlState> = 
+::phf::Map {
+    key: 12913932095322966823,
+    disps: &[
+        (0, 24),
+        (0, 12),
+        (0, 74),
+        (0, 109),
+        (0, 11),
+        (0, 9),
+        (0, 0),
+        (4, 38),
+        (3, 155),
+        (0, 6),
+        (1, 242),
+        (0, 66),
+        (0, 53),
+        (5, 180),
+        (3, 221),
+        (7, 230),
+        (0, 125),
+        (1, 46),
+        (0, 11),
+        (1, 2),
+        (0, 5),
+        (0, 13),
+        (0, 171),
+        (0, 15),
+        (0, 4),
+        (0, 22),
+        (1, 85),
+        (0, 75),
+        (2, 0),
+        (1, 25),
+        (7, 47),
+        (0, 45),
+        (0, 35),
+        (0, 7),
+        (7, 124),
+        (0, 0),
+        (14, 104),
+        (1, 183),
+        (61, 50),
+        (3, 76),
+        (0, 12),
+        (0, 7),
+        (4, 189),
+        (0, 1),
+        (64, 102),
+        (0, 0),
+        (16, 192),
+        (24, 19),
+        (0, 5),
+        (0, 87),
+        (0, 89),
+        (0, 14),
+    ],
+    entries: &[
+        ("2F000", SqlState::SQL_ROUTINE_EXCEPTION),
+        ("01008", SqlState::WARNING_IMPLICIT_ZERO_BIT_PADDING),
+        ("42501", SqlState::INSUFFICIENT_PRIVILEGE),
+        ("22000", SqlState::DATA_EXCEPTION),
+        ("0100C", SqlState::WARNING_DYNAMIC_RESULT_SETS_RETURNED),
+        ("2200N", SqlState::INVALID_XML_CONTENT),
+        ("40001", SqlState::T_R_SERIALIZATION_FAILURE),
+        ("28P01", SqlState::INVALID_PASSWORD),
+        ("38000", SqlState::EXTERNAL_ROUTINE_EXCEPTION),
+        ("25006", SqlState::READ_ONLY_SQL_TRANSACTION),
+        ("2203D", SqlState::TOO_MANY_JSON_ARRAY_ELEMENTS),
+        ("42P09", SqlState::AMBIGUOUS_ALIAS),
+        ("F0000", SqlState::CONFIG_FILE_ERROR),
+        ("42P18", SqlState::INDETERMINATE_DATATYPE),
+        ("40002", SqlState::T_R_INTEGRITY_CONSTRAINT_VIOLATION),
+        ("22009", SqlState::INVALID_TIME_ZONE_DISPLACEMENT_VALUE),
+        ("42P08", SqlState::AMBIGUOUS_PARAMETER),
+        ("08000", SqlState::CONNECTION_EXCEPTION),
+        ("25P01", SqlState::NO_ACTIVE_SQL_TRANSACTION),
+        ("22024", SqlState::UNTERMINATED_C_STRING),
+        ("55000", SqlState::OBJECT_NOT_IN_PREREQUISITE_STATE),
+        ("25001", SqlState::ACTIVE_SQL_TRANSACTION),
+        ("03000", SqlState::SQL_STATEMENT_NOT_YET_COMPLETE),
+        ("42710", SqlState::DUPLICATE_OBJECT),
+        ("2D000", SqlState::INVALID_TRANSACTION_TERMINATION),
+        ("2200G", SqlState::MOST_SPECIFIC_TYPE_MISMATCH),
+        ("22022", SqlState::INDICATOR_OVERFLOW),
+        ("55006", SqlState::OBJECT_IN_USE),
+        ("53200", SqlState::OUT_OF_MEMORY),
+        ("22012", SqlState::DIVISION_BY_ZERO),
+        ("P0002", SqlState::NO_DATA_FOUND),
+        ("XX001", SqlState::DATA_CORRUPTED),
+        ("22P05", SqlState::UNTRANSLATABLE_CHARACTER),
+        ("40003", SqlState::T_R_STATEMENT_COMPLETION_UNKNOWN),
+        ("22021", SqlState::CHARACTER_NOT_IN_REPERTOIRE),
+        ("25000", SqlState::INVALID_TRANSACTION_STATE),
+        ("42P15", SqlState::INVALID_SCHEMA_DEFINITION),
+        ("0B000", SqlState::INVALID_TRANSACTION_INITIATION),
+        ("22004", SqlState::NULL_VALUE_NOT_ALLOWED),
+        ("42804", SqlState::DATATYPE_MISMATCH),
+        ("42803", SqlState::GROUPING_ERROR),
+        ("02001", SqlState::NO_ADDITIONAL_DYNAMIC_RESULT_SETS_RETURNED),
+        ("25002", SqlState::BRANCH_TRANSACTION_ALREADY_ACTIVE),
+        ("28000", SqlState::INVALID_AUTHORIZATION_SPECIFICATION),
+        ("HV009", SqlState::FDW_INVALID_USE_OF_NULL_POINTER),
+        ("22P01", SqlState::FLOATING_POINT_EXCEPTION),
+        ("2B000", SqlState::DEPENDENT_PRIVILEGE_DESCRIPTORS_STILL_EXIST),
+        ("42723", SqlState::DUPLICATE_FUNCTION),
+        ("21000", SqlState::CARDINALITY_VIOLATION),
+        ("0Z002", SqlState::STACKED_DIAGNOSTICS_ACCESSED_WITHOUT_ACTIVE_HANDLER),
+        ("23505", SqlState::UNIQUE_VIOLATION),
+        ("HV00J", SqlState::FDW_OPTION_NAME_NOT_FOUND),
+        ("23P01", SqlState::EXCLUSION_VIOLATION),
+        ("39P03", SqlState::E_R_I_E_EVENT_TRIGGER_PROTOCOL_VIOLATED),
+        ("42P10", SqlState::INVALID_COLUMN_REFERENCE),
+        ("2202H", SqlState::INVALID_TABLESAMPLE_ARGUMENT),
+        ("55P04", SqlState::UNSAFE_NEW_ENUM_VALUE_USAGE),
+        ("P0000", SqlState::PLPGSQL_ERROR),
+        ("2F005", SqlState::S_R_E_FUNCTION_EXECUTED_NO_RETURN_STATEMENT),
+        ("HV00M", SqlState::FDW_UNABLE_TO_CREATE_REPLY),
+        ("0A000", SqlState::FEATURE_NOT_SUPPORTED),
+        ("24000", SqlState::INVALID_CURSOR_STATE),
+        ("25008", SqlState::HELD_CURSOR_REQUIRES_SAME_ISOLATION_LEVEL),
+        ("01003", SqlState::WARNING_NULL_VALUE_ELIMINATED_IN_SET_FUNCTION),
+        ("42712", SqlState::DUPLICATE_ALIAS),
+        ("HV014", SqlState::FDW_TOO_MANY_HANDLES),
+        ("58030", SqlState::IO_ERROR),
+        ("2201W", SqlState::INVALID_ROW_COUNT_IN_LIMIT_CLAUSE),
+        ("22033", SqlState::INVALID_SQL_JSON_SUBSCRIPT),
+        ("2BP01", SqlState::DEPENDENT_OBJECTS_STILL_EXIST),
+        ("HV005", SqlState::FDW_COLUMN_NAME_NOT_FOUND),
+        ("25004", SqlState::INAPPROPRIATE_ISOLATION_LEVEL_FOR_BRANCH_TRANSACTION),
+        ("54000", SqlState::PROGRAM_LIMIT_EXCEEDED),
+        ("20000", SqlState::CASE_NOT_FOUND),
+        ("2203G", SqlState::SQL_JSON_ITEM_CANNOT_BE_CAST_TO_TARGET_TYPE),
+        ("22038", SqlState::SINGLETON_SQL_JSON_ITEM_REQUIRED),
+        ("22007", SqlState::INVALID_DATETIME_FORMAT),
+        ("08004", SqlState::SQLSERVER_REJECTED_ESTABLISHMENT_OF_SQLCONNECTION),
+        ("2200H", SqlState::SEQUENCE_GENERATOR_LIMIT_EXCEEDED),
+        ("HV00D", SqlState::FDW_INVALID_OPTION_NAME),
+        ("P0004", SqlState::ASSERT_FAILURE),
+        ("22018", SqlState::INVALID_CHARACTER_VALUE_FOR_CAST),
+        ("0L000", SqlState::INVALID_GRANTOR),
+        ("22P04", SqlState::BAD_COPY_FILE_FORMAT),
+        ("22031", SqlState::INVALID_ARGUMENT_FOR_SQL_JSON_DATETIME_FUNCTION),
+        ("01P01", SqlState::WARNING_DEPRECATED_FEATURE),
+        ("0LP01", SqlState::INVALID_GRANT_OPERATION),
+        ("58P02", SqlState::DUPLICATE_FILE),
+        ("26000", SqlState::INVALID_SQL_STATEMENT_NAME),
+        ("54001", SqlState::STATEMENT_TOO_COMPLEX),
+        ("22010", SqlState::INVALID_INDICATOR_PARAMETER_VALUE),
+        ("HV00C", SqlState::FDW_INVALID_OPTION_INDEX),
+        ("22008", SqlState::DATETIME_FIELD_OVERFLOW),
+        ("42P06", SqlState::DUPLICATE_SCHEMA),
+        ("25007", SqlState::SCHEMA_AND_DATA_STATEMENT_MIXING_NOT_SUPPORTED),
+        ("42P20", SqlState::WINDOWING_ERROR),
+        ("HV091", SqlState::FDW_INVALID_DESCRIPTOR_FIELD_IDENTIFIER),
+        ("HV021", SqlState::FDW_INCONSISTENT_DESCRIPTOR_INFORMATION),
+        ("42702", SqlState::AMBIGUOUS_COLUMN),
+        ("02000", SqlState::NO_DATA),
+        ("54011", SqlState::TOO_MANY_COLUMNS),
+        ("HV004", SqlState::FDW_INVALID_DATA_TYPE),
+        ("01006", SqlState::WARNING_PRIVILEGE_NOT_REVOKED),
+        ("42701", SqlState::DUPLICATE_COLUMN),
+        ("08P01", SqlState::PROTOCOL_VIOLATION),
+        ("42622", SqlState::NAME_TOO_LONG),
+        ("P0003", SqlState::TOO_MANY_ROWS),
+        ("22003", SqlState::NUMERIC_VALUE_OUT_OF_RANGE),
+        ("42P03", SqlState::DUPLICATE_CURSOR),
+        ("23001", SqlState::RESTRICT_VIOLATION),
+        ("57000", SqlState::OPERATOR_INTERVENTION),
+        ("22027", SqlState::TRIM_ERROR),
+        ("42P12", SqlState::INVALID_DATABASE_DEFINITION),
+        ("3B000", SqlState::SAVEPOINT_EXCEPTION),
+        ("2201B", SqlState::INVALID_REGULAR_EXPRESSION),
+        ("22030", SqlState::DUPLICATE_JSON_OBJECT_KEY_VALUE),
+        ("2F004", SqlState::S_R_E_READING_SQL_DATA_NOT_PERMITTED),
+        ("428C9", SqlState::GENERATED_ALWAYS),
+        ("2200S", SqlState::INVALID_XML_COMMENT),
+        ("22039", SqlState::SQL_JSON_ARRAY_NOT_FOUND),
+        ("42809", SqlState::WRONG_OBJECT_TYPE),
+        ("2201X", SqlState::INVALID_ROW_COUNT_IN_RESULT_OFFSET_CLAUSE),
+        ("39001", SqlState::E_R_I_E_INVALID_SQLSTATE_RETURNED),
+        ("25P02", SqlState::IN_FAILED_SQL_TRANSACTION),
+        ("0P000", SqlState::INVALID_ROLE_SPECIFICATION),
+        ("HV00N", SqlState::FDW_UNABLE_TO_ESTABLISH_CONNECTION),
+        ("53100", SqlState::DISK_FULL),
+        ("42601", SqlState::SYNTAX_ERROR),
+        ("23000", SqlState::INTEGRITY_CONSTRAINT_VIOLATION),
+        ("HV006", SqlState::FDW_INVALID_DATA_TYPE_DESCRIPTORS),
+        ("HV00B", SqlState::FDW_INVALID_HANDLE),
+        ("HV00Q", SqlState::FDW_SCHEMA_NOT_FOUND),
+        ("01000", SqlState::WARNING),
+        ("42883", SqlState::UNDEFINED_FUNCTION),
+        ("57P01", SqlState::ADMIN_SHUTDOWN),
+        ("22037", SqlState::NON_UNIQUE_KEYS_IN_A_JSON_OBJECT),
+        ("00000", SqlState::SUCCESSFUL_COMPLETION),
+        ("55P03", SqlState::LOCK_NOT_AVAILABLE),
+        ("42P01", SqlState::UNDEFINED_TABLE),
+        ("42830", SqlState::INVALID_FOREIGN_KEY),
+        ("22005", SqlState::ERROR_IN_ASSIGNMENT),
+        ("22025", SqlState::INVALID_ESCAPE_SEQUENCE),
+        ("XX002", SqlState::INDEX_CORRUPTED),
+        ("42P16", SqlState::INVALID_TABLE_DEFINITION),
+        ("55P02", SqlState::CANT_CHANGE_RUNTIME_PARAM),
+        ("22019", SqlState::INVALID_ESCAPE_CHARACTER),
+        ("P0001", SqlState::RAISE_EXCEPTION),
+        ("72000", SqlState::SNAPSHOT_TOO_OLD),
+        ("42P11", SqlState::INVALID_CURSOR_DEFINITION),
+        ("40P01", SqlState::T_R_DEADLOCK_DETECTED),
+        ("57P02", SqlState::CRASH_SHUTDOWN),
+        ("HV00A", SqlState::FDW_INVALID_STRING_FORMAT),
+        ("2F002", SqlState::S_R_E_MODIFYING_SQL_DATA_NOT_PERMITTED),
+        ("23503", SqlState::FOREIGN_KEY_VIOLATION),
+        ("40000", SqlState::TRANSACTION_ROLLBACK),
+        ("22032", SqlState::INVALID_JSON_TEXT),
+        ("2202E", SqlState::ARRAY_ELEMENT_ERROR),
+        ("42P19", SqlState::INVALID_RECURSION),
+        ("42611", SqlState::INVALID_COLUMN_DEFINITION),
+        ("42P13", SqlState::INVALID_FUNCTION_DEFINITION),
+        ("25003", SqlState::INAPPROPRIATE_ACCESS_MODE_FOR_BRANCH_TRANSACTION),
+        ("39P02", SqlState::E_R_I_E_SRF_PROTOCOL_VIOLATED),
+        ("XX000", SqlState::INTERNAL_ERROR),
+        ("08006", SqlState::CONNECTION_FAILURE),
+        ("57P04", SqlState::DATABASE_DROPPED),
+        ("42P07", SqlState::DUPLICATE_TABLE),
+        ("22P03", SqlState::INVALID_BINARY_REPRESENTATION),
+        ("22035", SqlState::NO_SQL_JSON_ITEM),
+        ("42P14", SqlState::INVALID_PSTATEMENT_DEFINITION),
+        ("01007", SqlState::WARNING_PRIVILEGE_NOT_GRANTED),
+        ("38004", SqlState::E_R_E_READING_SQL_DATA_NOT_PERMITTED),
+        ("42P21", SqlState::COLLATION_MISMATCH),
+        ("0Z000", SqlState::DIAGNOSTICS_EXCEPTION),
+        ("HV001", SqlState::FDW_OUT_OF_MEMORY),
+        ("0F000", SqlState::LOCATOR_EXCEPTION),
+        ("22013", SqlState::INVALID_PRECEDING_OR_FOLLOWING_SIZE),
+        ("2201E", SqlState::INVALID_ARGUMENT_FOR_LOG),
+        ("22011", SqlState::SUBSTRING_ERROR),
+        ("42602", SqlState::INVALID_NAME),
+        ("01004", SqlState::WARNING_STRING_DATA_RIGHT_TRUNCATION),
+        ("42P02", SqlState::UNDEFINED_PARAMETER),
+        ("2203C", SqlState::SQL_JSON_OBJECT_NOT_FOUND),
+        ("HV002", SqlState::FDW_DYNAMIC_PARAMETER_VALUE_NEEDED),
+        ("0F001", SqlState::L_E_INVALID_SPECIFICATION),
+        ("58P01", SqlState::UNDEFINED_FILE),
+        ("38001", SqlState::E_R_E_CONTAINING_SQL_NOT_PERMITTED),
+        ("42703", SqlState::UNDEFINED_COLUMN),
+        ("57P05", SqlState::IDLE_SESSION_TIMEOUT),
+        ("57P03", SqlState::CANNOT_CONNECT_NOW),
+        ("HV007", SqlState::FDW_INVALID_COLUMN_NAME),
+        ("22014", SqlState::INVALID_ARGUMENT_FOR_NTILE),
+        ("22P06", SqlState::NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
+        ("2203F", SqlState::SQL_JSON_SCALAR_REQUIRED),
+        ("2200F", SqlState::ZERO_LENGTH_CHARACTER_STRING),
+        ("09000", SqlState::TRIGGERED_ACTION_EXCEPTION),
+        ("2201F", SqlState::INVALID_ARGUMENT_FOR_POWER_FUNCTION),
+        ("08003", SqlState::CONNECTION_DOES_NOT_EXIST),
+        ("38002", SqlState::E_R_E_MODIFYING_SQL_DATA_NOT_PERMITTED),
+        ("F0001", SqlState::LOCK_FILE_EXISTS),
+        ("42P22", SqlState::INDETERMINATE_COLLATION),
+        ("2200C", SqlState::INVALID_USE_OF_ESCAPE_CHARACTER),
+        ("2203E", SqlState::TOO_MANY_JSON_OBJECT_MEMBERS),
+        ("23514", SqlState::CHECK_VIOLATION),
+        ("22P02", SqlState::INVALID_TEXT_REPRESENTATION),
+        ("54023", SqlState::TOO_MANY_ARGUMENTS),
+        ("2200T", SqlState::INVALID_XML_PROCESSING_INSTRUCTION),
+        ("22016", SqlState::INVALID_ARGUMENT_FOR_NTH_VALUE),
+        ("25P03", SqlState::IDLE_IN_TRANSACTION_SESSION_TIMEOUT),
+        ("3B001", SqlState::S_E_INVALID_SPECIFICATION),
+        ("08001", SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
+        ("22036", SqlState::NON_NUMERIC_SQL_JSON_ITEM),
+        ("3F000", SqlState::INVALID_SCHEMA_NAME),
+        ("39P01", SqlState::E_R_I_E_TRIGGER_PROTOCOL_VIOLATED),
+        ("22026", SqlState::STRING_DATA_LENGTH_MISMATCH),
+        ("42P17", SqlState::INVALID_OBJECT_DEFINITION),
+        ("22034", SqlState::MORE_THAN_ONE_SQL_JSON_ITEM),
+        ("HV000", SqlState::FDW_ERROR),
+        ("2200B", SqlState::ESCAPE_CHARACTER_CONFLICT),
+        ("HV008", SqlState::FDW_INVALID_COLUMN_NUMBER),
+        ("34000", SqlState::INVALID_CURSOR_NAME),
+        ("2201G", SqlState::INVALID_ARGUMENT_FOR_WIDTH_BUCKET_FUNCTION),
+        ("44000", SqlState::WITH_CHECK_OPTION_VIOLATION),
+        ("HV010", SqlState::FDW_FUNCTION_SEQUENCE_ERROR),
+        ("39004", SqlState::E_R_I_E_NULL_VALUE_NOT_ALLOWED),
+        ("22001", SqlState::STRING_DATA_RIGHT_TRUNCATION),
+        ("3D000", SqlState::INVALID_CATALOG_NAME),
+        ("25005", SqlState::NO_ACTIVE_SQL_TRANSACTION_FOR_BRANCH_TRANSACTION),
+        ("2200L", SqlState::NOT_AN_XML_DOCUMENT),
+        ("27000", SqlState::TRIGGERED_DATA_CHANGE_VIOLATION),
+        ("HV090", SqlState::FDW_INVALID_STRING_LENGTH_OR_BUFFER_LENGTH),
+        ("42939", SqlState::RESERVED_NAME),
+        ("58000", SqlState::SYSTEM_ERROR),
+        ("2200M", SqlState::INVALID_XML_DOCUMENT),
+        ("HV00L", SqlState::FDW_UNABLE_TO_CREATE_EXECUTION),
+        ("57014", SqlState::QUERY_CANCELED),
+        ("23502", SqlState::NOT_NULL_VIOLATION),
+        ("22002", SqlState::NULL_VALUE_NO_INDICATOR_PARAMETER),
+        ("HV00R", SqlState::FDW_TABLE_NOT_FOUND),
+        ("HV00P", SqlState::FDW_NO_SCHEMAS),
+        ("38003", SqlState::E_R_E_PROHIBITED_SQL_STATEMENT_ATTEMPTED),
+        ("39000", SqlState::EXTERNAL_ROUTINE_INVOCATION_EXCEPTION),
+        ("22015", SqlState::INTERVAL_FIELD_OVERFLOW),
+        ("HV00K", SqlState::FDW_REPLY_HANDLE),
+        ("HV024", SqlState::FDW_INVALID_ATTRIBUTE_VALUE),
+        ("2200D", SqlState::INVALID_ESCAPE_OCTET),
+        ("08007", SqlState::TRANSACTION_RESOLUTION_UNKNOWN),
+        ("2F003", SqlState::S_R_E_PROHIBITED_SQL_STATEMENT_ATTEMPTED),
+        ("42725", SqlState::AMBIGUOUS_FUNCTION),
+        ("2203A", SqlState::SQL_JSON_MEMBER_NOT_FOUND),
+        ("42846", SqlState::CANNOT_COERCE),
+        ("42P04", SqlState::DUPLICATE_DATABASE),
+        ("42000", SqlState::SYNTAX_ERROR_OR_ACCESS_RULE_VIOLATION),
+        ("2203B", SqlState::SQL_JSON_NUMBER_NOT_FOUND),
+        ("42P05", SqlState::DUPLICATE_PSTATEMENT),
+        ("53300", SqlState::TOO_MANY_CONNECTIONS),
+        ("53400", SqlState::CONFIGURATION_LIMIT_EXCEEDED),
+        ("42704", SqlState::UNDEFINED_OBJECT),
+        ("2202G", SqlState::INVALID_TABLESAMPLE_REPEAT),
+        ("22023", SqlState::INVALID_PARAMETER_VALUE),
+        ("53000", SqlState::INSUFFICIENT_RESOURCES),
+    ],
+};
diff --git a/libs/proxy/tokio-postgres2/src/generic_client.rs b/libs/proxy/tokio-postgres2/src/generic_client.rs
new file mode 100644
index 0000000000..768213f8ed
--- /dev/null
+++ b/libs/proxy/tokio-postgres2/src/generic_client.rs
@@ -0,0 +1,64 @@
+use crate::query::RowStream;
+use crate::types::Type;
+use crate::{Client, Error, Transaction};
+use async_trait::async_trait;
+use postgres_protocol2::Oid;
+
+mod private {
+    pub trait Sealed {}
+}
+
+/// A trait allowing abstraction over connections and transactions.
+///
+/// This trait is "sealed", and cannot be implemented outside of this crate.
+#[async_trait]
+pub trait GenericClient: private::Sealed {
+    /// Like `Client::query_raw_txt`.
+    async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
+    where
+        S: AsRef<str> + Sync + Send,
+        I: IntoIterator<Item = Option<S>> + Sync + Send,
+        I::IntoIter: ExactSizeIterator + Sync + Send;
+
+    /// Query for type information
+    async fn get_type(&self, oid: Oid) -> Result<Type, Error>;
+}
+
+impl private::Sealed for Client {}
+
+#[async_trait]
+impl GenericClient for Client {
+    async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
+    where
+        S: AsRef<str> + Sync + Send,
+        I: IntoIterator<Item = Option<S>> + Sync + Send,
+        I::IntoIter: ExactSizeIterator + Sync + Send,
+    {
+        self.query_raw_txt(statement, params).await
+    }
+
+    /// Query for type information
+    async fn get_type(&self, oid: Oid) -> Result<Type, Error> {
+        self.get_type(oid).await
+    }
+}
+
+impl private::Sealed for Transaction<'_> {}
+
+#[async_trait]
+#[allow(clippy::needless_lifetimes)]
+impl GenericClient for Transaction<'_> {
+    async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
+    where
+        S: AsRef<str> + Sync + Send,
+        I: IntoIterator<Item = Option<S>> + Sync + Send,
+        I::IntoIter: ExactSizeIterator + Sync + Send,
+    {
+        self.query_raw_txt(statement, params).await
+    }
+
+    /// Query for type information
+    async fn get_type(&self, oid: Oid) -> Result<Type, Error> {
+        self.client().get_type(oid).await
+    }
+}
diff --git a/libs/proxy/tokio-postgres2/src/lib.rs b/libs/proxy/tokio-postgres2/src/lib.rs
new file mode 100644
index 0000000000..72ba8172b2
--- /dev/null
+++ b/libs/proxy/tokio-postgres2/src/lib.rs
@@ -0,0 +1,148 @@
+//! An asynchronous, pipelined, PostgreSQL client.
+#![warn(rust_2018_idioms, clippy::all, missing_docs)]
+
+pub use crate::cancel_token::CancelToken;
+pub use crate::client::Client;
+pub use crate::config::Config;
+pub use crate::connection::Connection;
+use crate::error::DbError;
+pub use crate::error::Error;
+pub use crate::generic_client::GenericClient;
+pub use crate::query::RowStream;
+pub use crate::row::{Row, SimpleQueryRow};
+pub use crate::simple_query::SimpleQueryStream;
+pub use crate::statement::{Column, Statement};
+use crate::tls::MakeTlsConnect;
+pub use crate::tls::NoTls;
+pub use crate::to_statement::ToStatement;
+pub use crate::transaction::Transaction;
+pub use crate::transaction_builder::{IsolationLevel, TransactionBuilder};
+use crate::types::ToSql;
+use postgres_protocol2::message::backend::ReadyForQueryBody;
+use tokio::net::TcpStream;
+
+/// After executing a query, the connection will be in one of these states
+#[derive(Clone, Copy, Debug, PartialEq)]
+#[repr(u8)]
+pub enum ReadyForQueryStatus {
+    /// Connection state is unknown
+    Unknown,
+    /// Connection is idle (no transactions)
+    Idle = b'I',
+    /// Connection is in a transaction block
+    Transaction = b'T',
+    /// Connection is in a failed transaction block
+    FailedTransaction = b'E',
+}
+
+impl From<ReadyForQueryBody> for ReadyForQueryStatus {
+    fn from(value: ReadyForQueryBody) -> Self {
+        match value.status() {
+            b'I' => Self::Idle,
+            b'T' => Self::Transaction,
+            b'E' => Self::FailedTransaction,
+            _ => Self::Unknown,
+        }
+    }
+}
+
+mod cancel_query;
+mod cancel_query_raw;
+mod cancel_token;
+mod client;
+mod codec;
+pub mod config;
+mod connect;
+mod connect_raw;
+mod connect_socket;
+mod connect_tls;
+mod connection;
+pub mod error;
+mod generic_client;
+pub mod maybe_tls_stream;
+mod prepare;
+mod query;
+pub mod row;
+mod simple_query;
+mod statement;
+pub mod tls;
+mod to_statement;
+mod transaction;
+mod transaction_builder;
+pub mod types;
+
+/// A convenience function which parses a connection string and connects to the database.
+///
+/// See the documentation for [`Config`] for details on the connection string format.
+///
+/// Requires the `runtime` Cargo feature (enabled by default).
+///
+/// [`Config`]: config/struct.Config.html
+pub async fn connect<T>(
+    config: &str,
+    tls: T,
+) -> Result<(Client, Connection<TcpStream, T::Stream>), Error>
+where
+    T: MakeTlsConnect<TcpStream>,
+{
+    let config = config.parse::<Config>()?;
+    config.connect(tls).await
+}
+
+/// An asynchronous notification.
+#[derive(Clone, Debug)]
+pub struct Notification {
+    process_id: i32,
+    channel: String,
+    payload: String,
+}
+
+impl Notification {
+    /// The process ID of the notifying backend process.
+    pub fn process_id(&self) -> i32 {
+        self.process_id
+    }
+
+    /// The name of the channel that the notify has been raised on.
+    pub fn channel(&self) -> &str {
+        &self.channel
+    }
+
+    /// The "payload" string passed from the notifying process.
+    pub fn payload(&self) -> &str {
+        &self.payload
+    }
+}
+
+/// An asynchronous message from the server.
+#[allow(clippy::large_enum_variant)]
+#[derive(Debug, Clone)]
+#[non_exhaustive]
+pub enum AsyncMessage {
+    /// A notice.
+    ///
+    /// Notices use the same format as errors, but aren't "errors" per-se.
+    Notice(DbError),
+    /// A notification.
+    ///
+    /// Connections can subscribe to notifications with the `LISTEN` command.
+    Notification(Notification),
+}
+
+/// Message returned by the `SimpleQuery` stream.
+#[derive(Debug)]
+#[non_exhaustive]
+pub enum SimpleQueryMessage {
+    /// A row of data.
+    Row(SimpleQueryRow),
+    /// A statement in the query has completed.
+    ///
+    /// The number of rows modified or selected is returned.
+    CommandComplete(u64),
+}
+
+fn slice_iter<'a>(
+    s: &'a [&'a (dyn ToSql + Sync)],
+) -> impl ExactSizeIterator<Item = &'a (dyn ToSql + Sync)> + 'a {
+    s.iter().map(|s| *s as _)
+}
diff --git a/libs/proxy/tokio-postgres2/src/maybe_tls_stream.rs b/libs/proxy/tokio-postgres2/src/maybe_tls_stream.rs
new file mode 100644
index 0000000000..9a7e248997
--- /dev/null
+++ b/libs/proxy/tokio-postgres2/src/maybe_tls_stream.rs
@@ -0,0 +1,77 @@
+//! MaybeTlsStream.
+//!
+//! Represents a stream that may or may not be encrypted with TLS.
+use crate::tls::{ChannelBinding, TlsStream};
+use std::io;
+use std::pin::Pin;
+use std::task::{Context, Poll};
+use tokio::io::{AsyncRead, AsyncWrite, ReadBuf};
+
+/// A stream that may or may not be encrypted with TLS.
+pub enum MaybeTlsStream<S, T> {
+    /// An unencrypted stream.
+    Raw(S),
+    /// An encrypted stream.
+    Tls(T),
+}
+
+impl<S, T> AsyncRead for MaybeTlsStream<S, T>
+where
+    S: AsyncRead + Unpin,
+    T: AsyncRead + Unpin,
+{
+    fn poll_read(
+        mut self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+        buf: &mut ReadBuf<'_>,
+    ) -> Poll<io::Result<()>> {
+        match &mut *self {
+            MaybeTlsStream::Raw(s) => Pin::new(s).poll_read(cx, buf),
+            MaybeTlsStream::Tls(s) => Pin::new(s).poll_read(cx, buf),
+        }
+    }
+}
+
+impl<S, T> AsyncWrite for MaybeTlsStream<S, T>
+where
+    S: AsyncWrite + Unpin,
+    T: AsyncWrite + Unpin,
+{
+    fn poll_write(
+        mut self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+        buf: &[u8],
+    ) -> Poll<io::Result<usize>> {
+        match &mut *self {
+            MaybeTlsStream::Raw(s) => Pin::new(s).poll_write(cx, buf),
+            MaybeTlsStream::Tls(s) => Pin::new(s).poll_write(cx, buf),
+        }
+    }
+
+    fn poll_flush(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
+        match &mut *self {
+            MaybeTlsStream::Raw(s) => Pin::new(s).poll_flush(cx),
+            MaybeTlsStream::Tls(s) => Pin::new(s).poll_flush(cx),
+        }
+    }
+
+    fn poll_shutdown(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
+        match &mut *self {
+            MaybeTlsStream::Raw(s) => Pin::new(s).poll_shutdown(cx),
+            MaybeTlsStream::Tls(s) => Pin::new(s).poll_shutdown(cx),
+        }
+    }
+}
+
+impl<S, T> TlsStream for MaybeTlsStream<S, T>
+where
+    S: AsyncRead + AsyncWrite + Unpin,
+    T: TlsStream + Unpin,
+{
+    fn channel_binding(&self) -> ChannelBinding {
+        match self {
+            MaybeTlsStream::Raw(_) => ChannelBinding::none(),
+            MaybeTlsStream::Tls(s) => s.channel_binding(),
+        }
+    }
+}
diff --git a/libs/proxy/tokio-postgres2/src/prepare.rs b/libs/proxy/tokio-postgres2/src/prepare.rs
new file mode 100644
index 0000000000..da0c755c5b
--- /dev/null
+++ b/libs/proxy/tokio-postgres2/src/prepare.rs
@@ -0,0 +1,262 @@
+use crate::client::InnerClient;
+use crate::codec::FrontendMessage;
+use crate::connection::RequestMessages;
+use crate::error::SqlState;
+use crate::types::{Field, Kind, Oid, Type};
+use crate::{query, slice_iter};
+use crate::{Column, Error, Statement};
+use bytes::Bytes;
+use fallible_iterator::FallibleIterator;
+use futures_util::{pin_mut, TryStreamExt};
+use log::debug;
+use postgres_protocol2::message::backend::Message;
+use postgres_protocol2::message::frontend;
+use std::future::Future;
+use std::pin::Pin;
+use std::sync::atomic::{AtomicUsize, Ordering};
+use std::sync::Arc;
+
+pub(crate) const TYPEINFO_QUERY: &str = "\
+SELECT t.typname, t.typtype, t.typelem, r.rngsubtype, t.typbasetype, n.nspname, t.typrelid
+FROM pg_catalog.pg_type t
+LEFT OUTER JOIN pg_catalog.pg_range r ON r.rngtypid = t.oid
+INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid
+WHERE t.oid = $1
+";
+
+// Range types weren't added until Postgres 9.2, so pg_range may not exist
+const TYPEINFO_FALLBACK_QUERY: &str = "\
+SELECT t.typname, t.typtype, t.typelem, NULL::OID, t.typbasetype, n.nspname, t.typrelid
+FROM pg_catalog.pg_type t
+INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid
+WHERE t.oid = $1
+";
+
+const TYPEINFO_ENUM_QUERY: &str = "\
+SELECT enumlabel
+FROM pg_catalog.pg_enum
+WHERE enumtypid = $1
+ORDER BY enumsortorder
+";
+
+// Postgres 9.0 didn't have enumsortorder
+const TYPEINFO_ENUM_FALLBACK_QUERY: &str = "\
+SELECT enumlabel
+FROM pg_catalog.pg_enum
+WHERE enumtypid = $1
+ORDER BY oid
+";
+
+pub(crate) const TYPEINFO_COMPOSITE_QUERY: &str = "\
+SELECT attname, atttypid
+FROM pg_catalog.pg_attribute
+WHERE attrelid = $1
+AND NOT attisdropped
+AND attnum > 0
+ORDER BY attnum
+";
+
+static NEXT_ID: AtomicUsize = AtomicUsize::new(0);
+
+pub async fn prepare(
+    client: &Arc<InnerClient>,
+    query: &str,
+    types: &[Type],
+) -> Result<Statement, Error> {
+    let name = format!("s{}", NEXT_ID.fetch_add(1, Ordering::SeqCst));
+    let buf = encode(client, &name, query, types)?;
+    let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?;
+
+    match responses.next().await? {
+        Message::ParseComplete => {}
+        _ => return Err(Error::unexpected_message()),
+    }
+
+    let parameter_description = match responses.next().await? {
+        Message::ParameterDescription(body) => body,
+        _ => return Err(Error::unexpected_message()),
+    };
+
+    let row_description = match responses.next().await? {
+        Message::RowDescription(body) => Some(body),
+        Message::NoData => None,
+        _ => return Err(Error::unexpected_message()),
+    };
+
+    let mut parameters = vec![];
+    let mut it = parameter_description.parameters();
+    while let Some(oid) = it.next().map_err(Error::parse)? {
+        let type_ = get_type(client, oid).await?;
+        parameters.push(type_);
+    }
+
+    let mut columns = vec![];
+    if let Some(row_description) = row_description {
+        let mut it = row_description.fields();
+        while let Some(field) = it.next().map_err(Error::parse)? {
+            let type_ = get_type(client, field.type_oid()).await?;
+            let column = Column::new(field.name().to_string(), type_, field);
+            columns.push(column);
+        }
+    }
+
+    Ok(Statement::new(client, name, parameters, columns))
+}
+
+fn prepare_rec<'a>(
+    client: &'a Arc<InnerClient>,
+    query: &'a str,
+    types: &'a [Type],
+) -> Pin<Box<dyn Future<Output = Result<Statement, Error>> + 'a + Send>> {
+    Box::pin(prepare(client, query, types))
+}
+
+fn encode(client: &InnerClient, name: &str, query: &str, types: &[Type]) -> Result<Bytes, Error> {
+    if types.is_empty() {
+        debug!("preparing query {}: {}", name, query);
+    } else {
+        debug!("preparing query {} with types {:?}: {}", name, types, query);
+    }
+
+    client.with_buf(|buf| {
+        frontend::parse(name, query, types.iter().map(Type::oid), buf).map_err(Error::encode)?;
+        frontend::describe(b'S', name, buf).map_err(Error::encode)?;
+        frontend::sync(buf);
+        Ok(buf.split().freeze())
+    })
+}
+
+pub async fn get_type(client: &Arc<InnerClient>, oid: Oid) -> Result<Type, Error> {
+    if let Some(type_) = Type::from_oid(oid) {
+        return Ok(type_);
+    }
+
+    if let Some(type_) = client.type_(oid) {
+        return Ok(type_);
+    }
+
+    let stmt = typeinfo_statement(client).await?;
+
+    let rows = query::query(client, stmt, slice_iter(&[&oid])).await?;
+    pin_mut!(rows);
+
+    let row = match rows.try_next().await? {
+        Some(row) => row,
+        None => return Err(Error::unexpected_message()),
+    };
+
+    let name: String = row.try_get(0)?;
+    let type_: i8 = row.try_get(1)?;
+    let elem_oid: Oid = row.try_get(2)?;
+    let rngsubtype: Option<Oid> = row.try_get(3)?;
+    let basetype: Oid = row.try_get(4)?;
+    let schema: String = row.try_get(5)?;
+    let relid: Oid = row.try_get(6)?;
+
+    let kind = if type_ == b'e' as i8 {
+        let variants = get_enum_variants(client, oid).await?;
+        Kind::Enum(variants)
+    } else if type_ == b'p' as i8 {
+        Kind::Pseudo
+    } else if basetype != 0 {
+        let type_ = get_type_rec(client, basetype).await?;
+        Kind::Domain(type_)
+    } else if elem_oid != 0 {
+        let type_ = get_type_rec(client, elem_oid).await?;
+        Kind::Array(type_)
+    } else if relid != 0 {
+        let fields = get_composite_fields(client, relid).await?;
+        Kind::Composite(fields)
+    } else if let Some(rngsubtype) = rngsubtype {
+        let type_ = get_type_rec(client, rngsubtype).await?;
+        Kind::Range(type_)
+    } else {
+        Kind::Simple
+    };
+
+    let type_ = Type::new(name, oid, kind, schema);
+    client.set_type(oid, &type_);
+
+    Ok(type_)
+}
+
+fn get_type_rec<'a>(
+    client: &'a Arc<InnerClient>,
+    oid: Oid,
+) -> Pin<Box<dyn Future<Output = Result<Type, Error>> + Send + 'a>> {
+    Box::pin(get_type(client, oid))
+}
+
+async fn typeinfo_statement(client: &Arc<InnerClient>) -> Result<Statement, Error> {
+    if let Some(stmt) = client.typeinfo() {
+        return Ok(stmt);
+    }
+
+    let stmt = match prepare_rec(client, TYPEINFO_QUERY, &[]).await {
+        Ok(stmt) => stmt,
+        Err(ref e) if e.code() == Some(&SqlState::UNDEFINED_TABLE) => {
+            prepare_rec(client, TYPEINFO_FALLBACK_QUERY, &[]).await?
+        }
+        Err(e) => return Err(e),
+    };
+
+    client.set_typeinfo(&stmt);
+    Ok(stmt)
+}
+
+async fn get_enum_variants(client: &Arc<InnerClient>, oid: Oid) -> Result<Vec<String>, Error> {
+    let stmt = typeinfo_enum_statement(client).await?;
+
+    query::query(client, stmt, slice_iter(&[&oid]))
+        .await?
+        .and_then(|row| async move { row.try_get(0) })
+        .try_collect()
+        .await
+}
+
+async fn typeinfo_enum_statement(client: &Arc<InnerClient>) -> Result<Statement, Error> {
+    if let Some(stmt) = client.typeinfo_enum() {
+        return Ok(stmt);
+    }
+
+    let stmt = match prepare_rec(client, TYPEINFO_ENUM_QUERY, &[]).await {
+        Ok(stmt) => stmt,
+        Err(ref e) if e.code() == Some(&SqlState::UNDEFINED_COLUMN) => {
+            prepare_rec(client, TYPEINFO_ENUM_FALLBACK_QUERY, &[]).await?
+        }
+        Err(e) => return Err(e),
+    };
+
+    client.set_typeinfo_enum(&stmt);
+    Ok(stmt)
+}
+
+async fn get_composite_fields(client: &Arc<InnerClient>, oid: Oid) -> Result<Vec<Field>, Error> {
+    let stmt = typeinfo_composite_statement(client).await?;
+
+    let rows = query::query(client, stmt, slice_iter(&[&oid]))
+        .await?
+        .try_collect::<Vec<_>>()
+        .await?;
+
+    let mut fields = vec![];
+    for row in rows {
+        let name = row.try_get(0)?;
+        let oid = row.try_get(1)?;
+        let type_ = get_type_rec(client, oid).await?;
+        fields.push(Field::new(name, type_));
+    }
+
+    Ok(fields)
+}
+
+async fn typeinfo_composite_statement(client: &Arc<InnerClient>) -> Result<Statement, Error> {
+    if let Some(stmt) = client.typeinfo_composite() {
+        return Ok(stmt);
+    }
+
+    let stmt = prepare_rec(client, TYPEINFO_COMPOSITE_QUERY, &[]).await?;
+
+    client.set_typeinfo_composite(&stmt);
+    Ok(stmt)
+}
diff --git a/libs/proxy/tokio-postgres2/src/query.rs b/libs/proxy/tokio-postgres2/src/query.rs
new file mode 100644
index 0000000000..534195a707
--- /dev/null
+++ b/libs/proxy/tokio-postgres2/src/query.rs
@@ -0,0 +1,340 @@
+use crate::client::{InnerClient, Responses};
+use crate::codec::FrontendMessage;
+use crate::connection::RequestMessages;
+use crate::types::IsNull;
+use crate::{Column, Error, ReadyForQueryStatus, Row, Statement};
+use bytes::{BufMut, Bytes, BytesMut};
+use fallible_iterator::FallibleIterator;
+use futures_util::{ready, Stream};
+use log::{debug, log_enabled, Level};
+use pin_project_lite::pin_project;
+use postgres_protocol2::message::backend::Message;
+use postgres_protocol2::message::frontend;
+use postgres_types2::{Format, ToSql, Type};
+use std::fmt;
+use std::marker::PhantomPinned;
+use std::pin::Pin;
+use std::sync::Arc;
+use std::task::{Context, Poll};
+
+struct BorrowToSqlParamsDebug<'a>(&'a [&'a (dyn ToSql + Sync)]);
+
+impl fmt::Debug for BorrowToSqlParamsDebug<'_> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_list().entries(self.0.iter()).finish()
+    }
+}
+
+pub async fn query<'a, I>(
+    client: &InnerClient,
+    statement: Statement,
+    params: I,
+) -> Result<RowStream, Error>
+where
+    I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
+    I::IntoIter: ExactSizeIterator,
+{
+    let buf = if log_enabled!(Level::Debug) {
+        let params = params.into_iter().collect::<Vec<_>>();
+        debug!(
+            "executing statement {} with parameters: {:?}",
+            statement.name(),
+            BorrowToSqlParamsDebug(params.as_slice()),
+        );
+        encode(client, &statement, params)?
+    } else {
+        encode(client, &statement, params)?
+    };
+    let responses = start(client, buf).await?;
+    Ok(RowStream {
+        statement,
+        responses,
+        command_tag: None,
+        status: ReadyForQueryStatus::Unknown,
+        output_format: Format::Binary,
+        _p: PhantomPinned,
+    })
+}
+
+pub async fn query_txt<S, I>(
+    client: &Arc<InnerClient>,
+    query: &str,
+    params: I,
+) -> Result<RowStream, Error>
+where
+    S: AsRef<str>,
+    I: IntoIterator<Item = Option<S>>,
+    I::IntoIter: ExactSizeIterator,
+{
+    let params = params.into_iter();
+
+    let buf = client.with_buf(|buf| {
+        frontend::parse(
+            "",                 // unnamed prepared statement
+            query,              // query to parse
+            std::iter::empty(), // give no type info
+            buf,
+        )
+        .map_err(Error::encode)?;
+        frontend::describe(b'S', "", buf).map_err(Error::encode)?;
+        // Bind, pass params as text, retrieve as binary
+        match frontend::bind(
+            "",                 // empty string selects the unnamed portal
+            "",                 // unnamed prepared statement
+            std::iter::empty(), // all parameters use the default format (text)
+            params,
+            |param, buf| match param {
+                Some(param) => {
+                    buf.put_slice(param.as_ref().as_bytes());
+                    Ok(postgres_protocol2::IsNull::No)
+                }
+                None => Ok(postgres_protocol2::IsNull::Yes),
+            },
+            Some(0), // all text
+            buf,
+        ) {
+            Ok(()) => Ok(()),
+            Err(frontend::BindError::Conversion(e)) => Err(Error::to_sql(e, 0)),
+            Err(frontend::BindError::Serialization(e)) => Err(Error::encode(e)),
+        }?;
+
+        // Execute
+        frontend::execute("", 0, buf).map_err(Error::encode)?;
+        // Sync
+        frontend::sync(buf);
+
+        Ok(buf.split().freeze())
+    })?;
+
+    // now read the responses
+    let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?;
+
+    match responses.next().await? {
+        Message::ParseComplete => {}
+        _ => return Err(Error::unexpected_message()),
+    }
+
+    let parameter_description = match responses.next().await? {
+        Message::ParameterDescription(body) => body,
+        _ => return Err(Error::unexpected_message()),
+    };
+
+    let row_description = match responses.next().await? {
+        Message::RowDescription(body) => Some(body),
+        Message::NoData => None,
+        _ => return Err(Error::unexpected_message()),
+    };
+
+    match responses.next().await? {
+        Message::BindComplete => {}
+        _ => return Err(Error::unexpected_message()),
+    }
+
+    let mut parameters = vec![];
+    let mut it = parameter_description.parameters();
+    while let Some(oid) = it.next().map_err(Error::parse)? {
+        let type_ = Type::from_oid(oid).unwrap_or(Type::UNKNOWN);
+        parameters.push(type_);
+    }
+
+    let mut columns = vec![];
+    if let Some(row_description) = row_description {
+        let mut it = row_description.fields();
+        while let Some(field) = it.next().map_err(Error::parse)? {
+            let type_ = Type::from_oid(field.type_oid()).unwrap_or(Type::UNKNOWN);
+            let column = Column::new(field.name().to_string(), type_, field);
+            columns.push(column);
+        }
+    }
+
+    Ok(RowStream {
+        statement: Statement::new_anonymous(parameters, columns),
+        responses,
+        command_tag: None,
+        status: ReadyForQueryStatus::Unknown,
+        output_format: Format::Text,
+        _p: PhantomPinned,
+    })
+}
+
+pub async fn execute<'a, I>(
+    client: &InnerClient,
+    statement: Statement,
+    params: I,
+) -> Result<u64, Error>
+where
+    I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
+    I::IntoIter: ExactSizeIterator,
+{
+    let buf = if log_enabled!(Level::Debug) {
+        let params = params.into_iter().collect::<Vec<_>>();
+        debug!(
+            "executing statement {} with parameters: {:?}",
+            statement.name(),
+            BorrowToSqlParamsDebug(params.as_slice()),
+        );
+        encode(client, &statement, params)?
+    } else {
+        encode(client, &statement, params)?
+    };
+    let mut responses = start(client, buf).await?;
+
+    let mut rows = 0;
+    loop {
+        match responses.next().await? {
+            Message::DataRow(_) => {}
+            Message::CommandComplete(body) => {
+                rows = body
+                    .tag()
+                    .map_err(Error::parse)?
+                    .rsplit(' ')
+                    .next()
+                    .unwrap()
+                    .parse()
+                    .unwrap_or(0);
+            }
+            Message::EmptyQueryResponse => rows = 0,
+            Message::ReadyForQuery(_) => return Ok(rows),
+            _ => return Err(Error::unexpected_message()),
+        }
+    }
+}
+
+async fn start(client: &InnerClient, buf: Bytes) -> Result<Responses, Error> {
+    let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?;
+
+    match responses.next().await? {
+        Message::BindComplete => {}
+        _ => return Err(Error::unexpected_message()),
+    }
+
+    Ok(responses)
+}
+
+pub fn encode<'a, I>(client: &InnerClient, statement: &Statement, params: I) -> Result<Bytes, Error>
+where
+    I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
+    I::IntoIter: ExactSizeIterator,
+{
+    client.with_buf(|buf| {
+        encode_bind(statement, params, "", buf)?;
+        frontend::execute("", 0, buf).map_err(Error::encode)?;
+        frontend::sync(buf);
+        Ok(buf.split().freeze())
+    })
+}
+
+pub fn encode_bind<'a, I>(
+    statement: &Statement,
+    params: I,
+    portal: &str,
+    buf: &mut BytesMut,
+) -> Result<(), Error>
+where
+    I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
+    I::IntoIter: ExactSizeIterator,
+{
+    let param_types = statement.params();
+    let params = params.into_iter();
+
+    assert!(
+        param_types.len() == params.len(),
+        "expected {} parameters but got {}",
+        param_types.len(),
+        params.len()
+    );
+
+    let (param_formats, params): (Vec<_>, Vec<_>) = params
+        .zip(param_types.iter())
+        .map(|(p, ty)| (p.encode_format(ty) as i16, p))
+        .unzip();
+
+    let params = params.into_iter();
+
+    let mut error_idx = 0;
+    let r = frontend::bind(
+        portal,
+        statement.name(),
+        param_formats,
+        params.zip(param_types).enumerate(),
+        |(idx, (param, ty)), buf| match param.to_sql_checked(ty, buf) {
+            Ok(IsNull::No) => Ok(postgres_protocol2::IsNull::No),
+            Ok(IsNull::Yes) => Ok(postgres_protocol2::IsNull::Yes),
+            Err(e) => {
+                error_idx = idx;
+                Err(e)
+            }
+        },
+        Some(1),
+        buf,
+    );
+    match r {
+        Ok(()) => Ok(()),
+        Err(frontend::BindError::Conversion(e)) => Err(Error::to_sql(e, error_idx)),
+        Err(frontend::BindError::Serialization(e)) => Err(Error::encode(e)),
+    }
+}
+
+pin_project! {
+    /// A stream of table rows.
+    pub struct RowStream {
+        statement: Statement,
+        responses: Responses,
+        command_tag: Option<String>,
+        output_format: Format,
+        status: ReadyForQueryStatus,
+        #[pin]
+        _p: PhantomPinned,
+    }
+}
+
+impl Stream for RowStream {
+    type Item = Result<Row, Error>;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        let this = self.project();
+        loop {
+            match ready!(this.responses.poll_next(cx)?) {
+                Message::DataRow(body) => {
+                    return Poll::Ready(Some(Ok(Row::new(
+                        this.statement.clone(),
+                        body,
+                        *this.output_format,
+                    )?)))
+                }
+                Message::EmptyQueryResponse | Message::PortalSuspended => {}
+                Message::CommandComplete(body) => {
+                    if let Ok(tag) = body.tag() {
+                        *this.command_tag = Some(tag.to_string());
+                    }
+                }
+                Message::ReadyForQuery(status) => {
+                    *this.status = status.into();
+                    return Poll::Ready(None);
+                }
+                _ => return Poll::Ready(Some(Err(Error::unexpected_message()))),
+            }
+        }
+    }
+}
+
+impl RowStream {
+    /// Returns information about the columns of data in the row.
+    pub fn columns(&self) -> &[Column] {
+        self.statement.columns()
+    }
+
+    /// Returns the command tag of this query.
+    ///
+    /// This is only available after the stream has been exhausted.
+    pub fn command_tag(&self) -> Option<String> {
+        self.command_tag.clone()
+    }
+
+    /// Returns if the connection is ready for querying, with the status of the connection.
+    ///
+    /// This might be available only after the stream has been exhausted.
+    pub fn ready_status(&self) -> ReadyForQueryStatus {
+        self.status
+    }
+}
diff --git a/libs/proxy/tokio-postgres2/src/row.rs b/libs/proxy/tokio-postgres2/src/row.rs
new file mode 100644
index 0000000000..10e130707d
--- /dev/null
+++ b/libs/proxy/tokio-postgres2/src/row.rs
@@ -0,0 +1,300 @@
+//! Rows.
+
+use crate::row::sealed::{AsName, Sealed};
+use crate::simple_query::SimpleColumn;
+use crate::statement::Column;
+use crate::types::{FromSql, Type, WrongType};
+use crate::{Error, Statement};
+use fallible_iterator::FallibleIterator;
+use postgres_protocol2::message::backend::DataRowBody;
+use postgres_types2::{Format, WrongFormat};
+use std::fmt;
+use std::ops::Range;
+use std::str;
+use std::sync::Arc;
+
+mod sealed {
+    pub trait Sealed {}
+
+    pub trait AsName {
+        fn as_name(&self) -> &str;
+    }
+}
+
+impl AsName for Column {
+    fn as_name(&self) -> &str {
+        self.name()
+    }
+}
+
+impl AsName for String {
+    fn as_name(&self) -> &str {
+        self
+    }
+}
+
+/// A trait implemented by types that can index into columns of a row.
+///
+/// This cannot be implemented outside of this crate.
+pub trait RowIndex: Sealed {
+    #[doc(hidden)]
+    fn __idx<T>(&self, columns: &[T]) -> Option<usize>
+    where
+        T: AsName;
+}
+
+impl Sealed for usize {}
+
+impl RowIndex for usize {
+    #[inline]
+    fn __idx<T>(&self, columns: &[T]) -> Option<usize>
+    where
+        T: AsName,
+    {
+        if *self >= columns.len() {
+            None
+        } else {
+            Some(*self)
+        }
+    }
+}
+
+impl Sealed for str {}
+
+impl RowIndex for str {
+    #[inline]
+    fn __idx<T>(&self, columns: &[T]) -> Option<usize>
+    where
+        T: AsName,
+    {
+        if let Some(idx) = columns.iter().position(|d| d.as_name() == self) {
+            return Some(idx);
+        };
+
+        // FIXME ASCII-only case insensitivity isn't really the right thing to
+        // do. Postgres itself uses a dubious wrapper around tolower and JDBC
+        // uses the US locale.
+        columns
+            .iter()
+            .position(|d| d.as_name().eq_ignore_ascii_case(self))
+    }
+}
+
+impl<T> Sealed for &T where T: ?Sized + Sealed {}
+
+impl<T> RowIndex for &T
+where
+    T: ?Sized + RowIndex,
+{
+    #[inline]
+    fn __idx<U>(&self, columns: &[U]) -> Option<usize>
+    where
+        U: AsName,
+    {
+        T::__idx(*self, columns)
+    }
+}
+
+/// A row of data returned from the database by a query.
+pub struct Row {
+    statement: Statement,
+    output_format: Format,
+    body: DataRowBody,
+    ranges: Vec<Option<Range<usize>>>,
+}
+
+impl fmt::Debug for Row {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("Row")
+            .field("columns", &self.columns())
+            .finish()
+    }
+}
+
+impl Row {
+    pub(crate) fn new(
+        statement: Statement,
+        body: DataRowBody,
+        output_format: Format,
+    ) -> Result<Row, Error> {
+        let ranges = body.ranges().collect().map_err(Error::parse)?;
+        Ok(Row {
+            statement,
+            body,
+            ranges,
+            output_format,
+        })
+    }
+
+    /// Returns information about the columns of data in the row.
+    pub fn columns(&self) -> &[Column] {
+        self.statement.columns()
+    }
+
+    /// Determines if the row contains no values.
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// Returns the number of values in the row.
+    pub fn len(&self) -> usize {
+        self.columns().len()
+    }
+
+    /// Deserializes a value from the row.
+    ///
+    /// The value can be specified either by its numeric index in the row, or by its column name.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the index is out of bounds or if the value cannot be converted to the specified type.
+    pub fn get<'a, I, T>(&'a self, idx: I) -> T
+    where
+        I: RowIndex + fmt::Display,
+        T: FromSql<'a>,
+    {
+        match self.get_inner(&idx) {
+            Ok(ok) => ok,
+            Err(err) => panic!("error retrieving column {}: {}", idx, err),
+        }
+    }
+
+    /// Like `Row::get`, but returns a `Result` rather than panicking.
+    pub fn try_get<'a, I, T>(&'a self, idx: I) -> Result<T, Error>
+    where
+        I: RowIndex + fmt::Display,
+        T: FromSql<'a>,
+    {
+        self.get_inner(&idx)
+    }
+
+    fn get_inner<'a, I, T>(&'a self, idx: &I) -> Result<T, Error>
+    where
+        I: RowIndex + fmt::Display,
+        T: FromSql<'a>,
+    {
+        let idx = match idx.__idx(self.columns()) {
+            Some(idx) => idx,
+            None => return Err(Error::column(idx.to_string())),
+        };
+
+        let ty = self.columns()[idx].type_();
+        if !T::accepts(ty) {
+            return Err(Error::from_sql(
+                Box::new(WrongType::new::<T>(ty.clone())),
+                idx,
+            ));
+        }
+
+        FromSql::from_sql_nullable(ty, self.col_buffer(idx)).map_err(|e| Error::from_sql(e, idx))
+    }
+
+    /// Get the raw bytes for the column at the given index.
+    fn col_buffer(&self, idx: usize) -> Option<&[u8]> {
+        let range = self.ranges.get(idx)?.to_owned()?;
+        Some(&self.body.buffer()[range])
+    }
+
+    /// Interpret the column at the given index as text
+    ///
+    /// Useful when using query_raw_txt() which sets text transfer mode
+    pub fn as_text(&self, idx: usize) -> Result<Option<&str>, Error> {
+        if self.output_format == Format::Text {
+            match self.col_buffer(idx) {
+                Some(raw) => {
+                    FromSql::from_sql(&Type::TEXT, raw).map_err(|e| Error::from_sql(e, idx))
+                }
+                None => Ok(None),
+            }
+        } else {
+            Err(Error::from_sql(Box::new(WrongFormat {}), idx))
+        }
+    }
+
+    /// Row byte size
+    pub fn body_len(&self) -> usize {
+        self.body.buffer().len()
+    }
+}
+
+impl AsName for SimpleColumn {
+    fn as_name(&self) -> &str {
+        self.name()
+    }
+}
+
+/// A row of data returned from the database by a simple query.
+#[derive(Debug)]
+pub struct SimpleQueryRow {
+    columns: Arc<[SimpleColumn]>,
+    body: DataRowBody,
+    ranges: Vec<Option<Range<usize>>>,
+}
+
+impl SimpleQueryRow {
+    #[allow(clippy::new_ret_no_self)]
+    pub(crate) fn new(
+        columns: Arc<[SimpleColumn]>,
+        body: DataRowBody,
+    ) -> Result<SimpleQueryRow, Error> {
+        let ranges = body.ranges().collect().map_err(Error::parse)?;
+        Ok(SimpleQueryRow {
+            columns,
+            body,
+            ranges,
+        })
+    }
+
+    /// Returns information about the columns of data in the row.
+    pub fn columns(&self) -> &[SimpleColumn] {
+        &self.columns
+    }
+
+    /// Determines if the row contains no values.
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// Returns the number of values in the row.
+    pub fn len(&self) -> usize {
+        self.columns.len()
+    }
+
+    /// Returns a value from the row.
+    ///
+    /// The value can be specified either by its numeric index in the row, or by its column name.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the index is out of bounds or if the value cannot be converted to the specified type.
+    pub fn get<I>(&self, idx: I) -> Option<&str>
+    where
+        I: RowIndex + fmt::Display,
+    {
+        match self.get_inner(&idx) {
+            Ok(ok) => ok,
+            Err(err) => panic!("error retrieving column {}: {}", idx, err),
+        }
+    }
+
+    /// Like `SimpleQueryRow::get`, but returns a `Result` rather than panicking.
+    pub fn try_get<I>(&self, idx: I) -> Result<Option<&str>, Error>
+    where
+        I: RowIndex + fmt::Display,
+    {
+        self.get_inner(&idx)
+    }
+
+    fn get_inner<I>(&self, idx: &I) -> Result<Option<&str>, Error>
+    where
+        I: RowIndex + fmt::Display,
+    {
+        let idx = match idx.__idx(&self.columns) {
+            Some(idx) => idx,
+            None => return Err(Error::column(idx.to_string())),
+        };
+
+        let buf = self.ranges[idx].clone().map(|r| &self.body.buffer()[r]);
+        FromSql::from_sql_nullable(&Type::TEXT, buf).map_err(|e| Error::from_sql(e, idx))
+    }
+}
diff --git a/libs/proxy/tokio-postgres2/src/simple_query.rs b/libs/proxy/tokio-postgres2/src/simple_query.rs
new file mode 100644
index 0000000000..fb2550377b
--- /dev/null
+++ b/libs/proxy/tokio-postgres2/src/simple_query.rs
@@ -0,0 +1,142 @@
+use crate::client::{InnerClient, Responses};
+use crate::codec::FrontendMessage;
+use crate::connection::RequestMessages;
+use crate::{Error, ReadyForQueryStatus, SimpleQueryMessage, SimpleQueryRow};
+use bytes::Bytes;
+use fallible_iterator::FallibleIterator;
+use futures_util::{ready, Stream};
+use log::debug;
+use pin_project_lite::pin_project;
+use postgres_protocol2::message::backend::Message;
+use postgres_protocol2::message::frontend;
+use std::marker::PhantomPinned;
+use std::pin::Pin;
+use std::sync::Arc;
+use std::task::{Context, Poll};
+
+/// Information about a column of a single query row.
+#[derive(Debug)]
+pub struct SimpleColumn {
+    name: String,
+}
+
+impl SimpleColumn {
+    pub(crate) fn new(name: String) -> SimpleColumn {
+        SimpleColumn { name }
+    }
+
+    /// Returns the name of the column.
+    pub fn name(&self) -> &str {
+        &self.name
+    }
+}
+
+pub async fn simple_query(client: &InnerClient, query: &str) -> Result<SimpleQueryStream, Error> {
+    debug!("executing simple query: {}", query);
+
+    let buf = encode(client, query)?;
+    let responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?;
+
+    Ok(SimpleQueryStream {
+        responses,
+        columns: None,
+        status: ReadyForQueryStatus::Unknown,
+        _p: PhantomPinned,
+    })
+}
+
+pub async fn batch_execute(
+    client: &InnerClient,
+    query: &str,
+) -> Result<ReadyForQueryStatus, Error> {
+    debug!("executing statement batch: {}", query);
+
+    let buf = encode(client, query)?;
+    let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?;
+
+    loop {
+        match responses.next().await? {
+            Message::ReadyForQuery(status) => return Ok(status.into()),
+            Message::CommandComplete(_)
+            | Message::EmptyQueryResponse
+            | Message::RowDescription(_)
+            | Message::DataRow(_) => {}
+            _ => return Err(Error::unexpected_message()),
+        }
+    }
+}
+
+pub(crate) fn encode(client: &InnerClient, query: &str) -> Result<Bytes, Error> {
+    client.with_buf(|buf| {
+        frontend::query(query, buf).map_err(Error::encode)?;
+        Ok(buf.split().freeze())
+    })
+}
+
+pin_project! {
+    /// A stream of simple query results.
+    pub struct SimpleQueryStream {
+        responses: Responses,
+        columns: Option<Arc<[SimpleColumn]>>,
+        status: ReadyForQueryStatus,
+        #[pin]
+        _p: PhantomPinned,
+    }
+}
+
+impl SimpleQueryStream {
+    /// Returns if the connection is ready for querying, with the status of the connection.
+    ///
+    /// This might be available only after the stream has been exhausted.
+    pub fn ready_status(&self) -> ReadyForQueryStatus {
+        self.status
+    }
+}
+
+impl Stream for SimpleQueryStream {
+    type Item = Result<SimpleQueryMessage, Error>;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        let this = self.project();
+        loop {
+            match ready!(this.responses.poll_next(cx)?) {
+                Message::CommandComplete(body) => {
+                    let rows = body
+                        .tag()
+                        .map_err(Error::parse)?
+                        .rsplit(' ')
+                        .next()
+                        .unwrap()
+                        .parse()
+                        .unwrap_or(0);
+                    return Poll::Ready(Some(Ok(SimpleQueryMessage::CommandComplete(rows))));
+                }
+                Message::EmptyQueryResponse => {
+                    return Poll::Ready(Some(Ok(SimpleQueryMessage::CommandComplete(0))));
+                }
+                Message::RowDescription(body) => {
+                    let columns = body
+                        .fields()
+                        .map(|f| Ok(SimpleColumn::new(f.name().to_string())))
+                        .collect::<Vec<_>>()
+                        .map_err(Error::parse)?
+                        .into();
+
+                    *this.columns = Some(columns);
+                }
+                Message::DataRow(body) => {
+                    let row = match &this.columns {
+                        Some(columns) => SimpleQueryRow::new(columns.clone(), body)?,
+                        None => return Poll::Ready(Some(Err(Error::unexpected_message()))),
+                    };
+                    return Poll::Ready(Some(Ok(SimpleQueryMessage::Row(row))));
+                }
+                Message::ReadyForQuery(s) => {
+                    *this.status = s.into();
+                    return Poll::Ready(None);
+                }
+                _ => return Poll::Ready(Some(Err(Error::unexpected_message()))),
+            }
+        }
+    }
+}
diff --git a/libs/proxy/tokio-postgres2/src/statement.rs b/libs/proxy/tokio-postgres2/src/statement.rs
new file mode 100644
index 0000000000..22e160fc05
--- /dev/null
+++ b/libs/proxy/tokio-postgres2/src/statement.rs
@@ -0,0 +1,157 @@
+use crate::client::InnerClient;
+use crate::codec::FrontendMessage;
+use crate::connection::RequestMessages;
+use crate::types::Type;
+use postgres_protocol2::{
+    message::{backend::Field, frontend},
+    Oid,
+};
+use std::{
+    fmt,
+    sync::{Arc, Weak},
+};
+
+struct StatementInner {
+    client: Weak<InnerClient>,
+    name: String,
+    params: Vec<Type>,
+    columns: Vec<Column>,
+}
+
+impl Drop for StatementInner {
+    fn drop(&mut self) {
+        if let Some(client) = self.client.upgrade() {
+            let buf = client.with_buf(|buf| {
+                frontend::close(b'S', &self.name, buf).unwrap();
+                frontend::sync(buf);
+                buf.split().freeze()
+            });
+            let _ = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)));
+        }
+    }
+}
+
+/// A prepared statement.
+///
+/// Prepared statements can only be used with the connection that created them.
+#[derive(Clone)]
+pub struct Statement(Arc<StatementInner>);
+
+impl Statement {
+    pub(crate) fn new(
+        inner: &Arc<InnerClient>,
+        name: String,
+        params: Vec<Type>,
+        columns: Vec<Column>,
+    ) -> Statement {
+        Statement(Arc::new(StatementInner {
+            client: Arc::downgrade(inner),
+            name,
+            params,
+            columns,
+        }))
+    }
+
+    pub(crate) fn new_anonymous(params: Vec<Type>, columns: Vec<Column>) -> Statement {
+        Statement(Arc::new(StatementInner {
+            client: Weak::new(),
+            name: String::new(),
+            params,
+            columns,
+        }))
+    }
+
+    pub(crate) fn name(&self) -> &str {
+        &self.0.name
+    }
+
+    /// Returns the expected types of the statement's parameters.
+    pub fn params(&self) -> &[Type] {
+        &self.0.params
+    }
+
+    /// Returns information about the columns returned when the statement is queried.
+    pub fn columns(&self) -> &[Column] {
+        &self.0.columns
+    }
+}
+
+/// Information about a column of a query.
+pub struct Column {
+    name: String,
+    type_: Type,
+
+    // raw fields from RowDescription
+    table_oid: Oid,
+    column_id: i16,
+    format: i16,
+
+    // that better be stored in self.type_, but that is more radical refactoring
+    type_oid: Oid,
+    type_size: i16,
+    type_modifier: i32,
+}
+
+impl Column {
+    pub(crate) fn new(name: String, type_: Type, raw_field: Field<'_>) -> Column {
+        Column {
+            name,
+            type_,
+            table_oid: raw_field.table_oid(),
+            column_id: raw_field.column_id(),
+            format: raw_field.format(),
+            type_oid: raw_field.type_oid(),
+            type_size: raw_field.type_size(),
+            type_modifier: raw_field.type_modifier(),
+        }
+    }
+
+    /// Returns the name of the column.
+    pub fn name(&self) -> &str {
+        &self.name
+    }
+
+    /// Returns the type of the column.
+    pub fn type_(&self) -> &Type {
+        &self.type_
+    }
+
+    /// Returns the table OID of the column.
+    pub fn table_oid(&self) -> Oid {
+        self.table_oid
+    }
+
+    /// Returns the column ID of the column.
+    pub fn column_id(&self) -> i16 {
+        self.column_id
+    }
+
+    /// Returns the format of the column.
+    pub fn format(&self) -> i16 {
+        self.format
+    }
+
+    /// Returns the type OID of the column.
+    pub fn type_oid(&self) -> Oid {
+        self.type_oid
+    }
+
+    /// Returns the type size of the column.
+    pub fn type_size(&self) -> i16 {
+        self.type_size
+    }
+
+    /// Returns the type modifier of the column.
+    pub fn type_modifier(&self) -> i32 {
+        self.type_modifier
+    }
+}
+
+impl fmt::Debug for Column {
+    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
+        fmt.debug_struct("Column")
+            .field("name", &self.name)
+            .field("type", &self.type_)
+            .finish()
+    }
+}
diff --git a/libs/proxy/tokio-postgres2/src/tls.rs b/libs/proxy/tokio-postgres2/src/tls.rs
new file mode 100644
index 0000000000..dc8140719f
--- /dev/null
+++ b/libs/proxy/tokio-postgres2/src/tls.rs
@@ -0,0 +1,162 @@
+//! TLS support.
+
+use std::error::Error;
+use std::future::Future;
+use std::pin::Pin;
+use std::task::{Context, Poll};
+use std::{fmt, io};
+use tokio::io::{AsyncRead, AsyncWrite, ReadBuf};
+
+pub(crate) mod private {
+    pub struct ForcePrivateApi;
+}
+
+/// Channel binding information returned from a TLS handshake.
+pub struct ChannelBinding {
+    pub(crate) tls_server_end_point: Option<Vec<u8>>,
+}
+
+impl ChannelBinding {
+    /// Creates a `ChannelBinding` containing no information.
+    pub fn none() -> ChannelBinding {
+        ChannelBinding {
+            tls_server_end_point: None,
+        }
+    }
+
+    /// Creates a `ChannelBinding` containing `tls-server-end-point` channel binding information.
+    pub fn tls_server_end_point(tls_server_end_point: Vec<u8>) -> ChannelBinding {
+        ChannelBinding {
+            tls_server_end_point: Some(tls_server_end_point),
+        }
+    }
+}
+
+/// A constructor of `TlsConnect`ors.
+///
+/// Requires the `runtime` Cargo feature (enabled by default).
+pub trait MakeTlsConnect<S> {
+    /// The stream type created by the `TlsConnect` implementation.
+    type Stream: TlsStream + Unpin;
+    /// The `TlsConnect` implementation created by this type.
+    type TlsConnect: TlsConnect<S, Stream = Self::Stream>;
+    /// The error type returned by the `TlsConnect` implementation.
+    type Error: Into<Box<dyn Error + Sync + Send>>;
+
+    /// Creates a new `TlsConnect`or.
+    ///
+    /// The domain name is provided for certificate verification and SNI.
+    fn make_tls_connect(&mut self, domain: &str) -> Result<Self::TlsConnect, Self::Error>;
+}
+
+/// An asynchronous function wrapping a stream in a TLS session.
+pub trait TlsConnect<S> {
+    /// The stream returned by the future.
+    type Stream: TlsStream + Unpin;
+    /// The error returned by the future.
+    type Error: Into<Box<dyn Error + Sync + Send>>;
+    /// The future returned by the connector.
+    type Future: Future<Output = Result<Self::Stream, Self::Error>>;
+
+    /// Returns a future performing a TLS handshake over the stream.
+    fn connect(self, stream: S) -> Self::Future;
+
+    #[doc(hidden)]
+    fn can_connect(&self, _: private::ForcePrivateApi) -> bool {
+        true
+    }
+}
+
+/// A TLS-wrapped connection to a PostgreSQL database.
+pub trait TlsStream: AsyncRead + AsyncWrite {
+    /// Returns channel binding information for the session.
+    fn channel_binding(&self) -> ChannelBinding;
+}
+
+/// A `MakeTlsConnect` and `TlsConnect` implementation which simply returns an error.
+///
+/// This can be used when `sslmode` is `none` or `prefer`.
+#[derive(Debug, Copy, Clone)]
+pub struct NoTls;
+
+impl<S> MakeTlsConnect<S> for NoTls {
+    type Stream = NoTlsStream;
+    type TlsConnect = NoTls;
+    type Error = NoTlsError;
+
+    fn make_tls_connect(&mut self, _: &str) -> Result<NoTls, NoTlsError> {
+        Ok(NoTls)
+    }
+}
+
+impl<S> TlsConnect<S> for NoTls {
+    type Stream = NoTlsStream;
+    type Error = NoTlsError;
+    type Future = NoTlsFuture;
+
+    fn connect(self, _: S) -> NoTlsFuture {
+        NoTlsFuture(())
+    }
+
+    fn can_connect(&self, _: private::ForcePrivateApi) -> bool {
+        false
+    }
+}
+
+/// The future returned by `NoTls`.
+pub struct NoTlsFuture(());
+
+impl Future for NoTlsFuture {
+    type Output = Result<NoTlsStream, NoTlsError>;
+
+    fn poll(self: Pin<&mut Self>, _: &mut Context<'_>) -> Poll<Self::Output> {
+        Poll::Ready(Err(NoTlsError(())))
+    }
+}
+
+/// The TLS "stream" type produced by the `NoTls` connector.
+///
+/// Since `NoTls` doesn't support TLS, this type is uninhabited.
+pub enum NoTlsStream {}
+
+impl AsyncRead for NoTlsStream {
+    fn poll_read(
+        self: Pin<&mut Self>,
+        _: &mut Context<'_>,
+        _: &mut ReadBuf<'_>,
+    ) -> Poll<io::Result<()>> {
+        match *self {}
+    }
+}
+
+impl AsyncWrite for NoTlsStream {
+    fn poll_write(self: Pin<&mut Self>, _: &mut Context<'_>, _: &[u8]) -> Poll<io::Result<usize>> {
+        match *self {}
+    }
+
+    fn poll_flush(self: Pin<&mut Self>, _: &mut Context<'_>) -> Poll<io::Result<()>> {
+        match *self {}
+    }
+
+    fn poll_shutdown(self: Pin<&mut Self>, _: &mut Context<'_>) -> Poll<io::Result<()>> {
+        match *self {}
+    }
+}
+
+impl TlsStream for NoTlsStream {
+    fn channel_binding(&self) -> ChannelBinding {
+        match *self {}
+    }
+}
+
+/// The error returned by `NoTls`.
+#[derive(Debug)]
+pub struct NoTlsError(());
+
+impl fmt::Display for NoTlsError {
+    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
+        fmt.write_str("no TLS implementation configured")
+    }
+}
+
+impl Error for NoTlsError {}
diff --git a/libs/proxy/tokio-postgres2/src/to_statement.rs b/libs/proxy/tokio-postgres2/src/to_statement.rs
new file mode 100644
index 0000000000..427f77dd79
--- /dev/null
+++ b/libs/proxy/tokio-postgres2/src/to_statement.rs
@@ -0,0 +1,57 @@
+use crate::to_statement::private::{Sealed, ToStatementType};
+use crate::Statement;
+
+mod private {
+    use crate::{Client, Error, Statement};
+
+    pub trait Sealed {}
+
+    pub enum ToStatementType<'a> {
+        Statement(&'a Statement),
+        Query(&'a str),
+    }
+
+    impl<'a> ToStatementType<'a> {
+        pub async fn into_statement(self, client: &Client) -> Result<Statement, Error> {
+            match self {
+                ToStatementType::Statement(s) => Ok(s.clone()),
+                ToStatementType::Query(s) => client.prepare(s).await,
+            }
+        }
+    }
+}
+
+/// A trait abstracting over prepared and unprepared statements.
+///
+/// Many methods are generic over this bound, so that they support both a raw query string as well as a statement which
+/// was prepared previously.
+///
+/// This trait is "sealed" and cannot be implemented by anything outside this crate.
+pub trait ToStatement: Sealed {
+    #[doc(hidden)]
+    fn __convert(&self) -> ToStatementType<'_>;
+}
+
+impl ToStatement for Statement {
+    fn __convert(&self) -> ToStatementType<'_> {
+        ToStatementType::Statement(self)
+    }
+}
+
+impl Sealed for Statement {}
+
+impl ToStatement for str {
+    fn __convert(&self) -> ToStatementType<'_> {
+        ToStatementType::Query(self)
+    }
+}
+
+impl Sealed for str {}
+
+impl ToStatement for String {
+    fn __convert(&self) -> ToStatementType<'_> {
+        ToStatementType::Query(self)
+    }
+}
+
+impl Sealed for String {}
diff --git a/libs/proxy/tokio-postgres2/src/transaction.rs b/libs/proxy/tokio-postgres2/src/transaction.rs
new file mode 100644
index 0000000000..03a57e4947
--- /dev/null
+++ b/libs/proxy/tokio-postgres2/src/transaction.rs
@@ -0,0 +1,74 @@
+use crate::codec::FrontendMessage;
+use crate::connection::RequestMessages;
+use crate::query::RowStream;
+use crate::{CancelToken, Client, Error, ReadyForQueryStatus};
+use postgres_protocol2::message::frontend;
+
+/// A representation of a PostgreSQL database transaction.
+///
+/// Transactions will implicitly roll back when dropped. Use the `commit` method to commit the changes made in the
+/// transaction. Transactions can be nested, with inner transactions implemented via safepoints.
+pub struct Transaction<'a> {
+    client: &'a mut Client,
+    done: bool,
+}
+
+impl Drop for Transaction<'_> {
+    fn drop(&mut self) {
+        if self.done {
+            return;
+        }
+
+        let buf = self.client.inner().with_buf(|buf| {
+            frontend::query("ROLLBACK", buf).unwrap();
+            buf.split().freeze()
+        });
+        let _ = self
+            .client
+            .inner()
+            .send(RequestMessages::Single(FrontendMessage::Raw(buf)));
+    }
+}
+
+impl<'a> Transaction<'a> {
+    pub(crate) fn new(client: &'a mut Client) -> Transaction<'a> {
+        Transaction {
+            client,
+            done: false,
+        }
+    }
+
+    /// Consumes the transaction, committing all changes made within it.
+    pub async fn commit(mut self) -> Result<ReadyForQueryStatus, Error> {
+        self.done = true;
+        self.client.batch_execute("COMMIT").await
+    }
+
+    /// Rolls the transaction back, discarding all changes made within it.
+    ///
+    /// This is equivalent to `Transaction`'s `Drop` implementation, but provides any error encountered to the caller.
+    pub async fn rollback(mut self) -> Result<ReadyForQueryStatus, Error> {
+        self.done = true;
+        self.client.batch_execute("ROLLBACK").await
+    }
+
+    /// Like `Client::query_raw_txt`.
+    pub async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
+    where
+        S: AsRef<str>,
+        I: IntoIterator<Item = Option<S>>,
+        I::IntoIter: ExactSizeIterator,
+    {
+        self.client.query_raw_txt(statement, params).await
+    }
+
+    /// Like `Client::cancel_token`.
+    pub fn cancel_token(&self) -> CancelToken {
+        self.client.cancel_token()
+    }
+
+    /// Returns a reference to the underlying `Client`.
+    pub fn client(&self) -> &Client {
+        self.client
+    }
+}
diff --git a/libs/proxy/tokio-postgres2/src/transaction_builder.rs b/libs/proxy/tokio-postgres2/src/transaction_builder.rs
new file mode 100644
index 0000000000..9718ac588c
--- /dev/null
+++ b/libs/proxy/tokio-postgres2/src/transaction_builder.rs
@@ -0,0 +1,113 @@
+use crate::{Client, Error, Transaction};
+
+/// The isolation level of a database transaction.
+#[derive(Debug, Copy, Clone)]
+#[non_exhaustive]
+pub enum IsolationLevel {
+    /// Equivalent to `ReadCommitted`.
+    ReadUncommitted,
+
+    /// An individual statement in the transaction will see rows committed before it began.
+    ReadCommitted,
+
+    /// All statements in the transaction will see the same view of rows committed before the first query in the
+    /// transaction.
+    RepeatableRead,
+
+    /// The reads and writes in this transaction must be able to be committed as an atomic "unit" with respect to reads
+    /// and writes of all other concurrent serializable transactions without interleaving.
+    Serializable,
+}
+
+/// A builder for database transactions.
+pub struct TransactionBuilder<'a> {
+    client: &'a mut Client,
+    isolation_level: Option<IsolationLevel>,
+    read_only: Option<bool>,
+    deferrable: Option<bool>,
+}
+
+impl<'a> TransactionBuilder<'a> {
+    pub(crate) fn new(client: &'a mut Client) -> TransactionBuilder<'a> {
+        TransactionBuilder {
+            client,
+            isolation_level: None,
+            read_only: None,
+            deferrable: None,
+        }
+    }
+
+    /// Sets the isolation level of the transaction.
+    pub fn isolation_level(mut self, isolation_level: IsolationLevel) -> Self {
+        self.isolation_level = Some(isolation_level);
+        self
+    }
+
+    /// Sets the access mode of the transaction.
+    pub fn read_only(mut self, read_only: bool) -> Self {
+        self.read_only = Some(read_only);
+        self
+    }
+
+    /// Sets the deferrability of the transaction.
+    ///
+    /// If the transaction is also serializable and read only, creation of the transaction may block, but when it
+    /// completes the transaction is able to run with less overhead and a guarantee that it will not be aborted due to
+    /// serialization failure.
+    pub fn deferrable(mut self, deferrable: bool) -> Self {
+        self.deferrable = Some(deferrable);
+        self
+    }
+
+    /// Begins the transaction.
+    ///
+    /// The transaction will roll back by default - use the `commit` method to commit it.
+    pub async fn start(self) -> Result<Transaction<'a>, Error> {
+        let mut query = "START TRANSACTION".to_string();
+        let mut first = true;
+
+        if let Some(level) = self.isolation_level {
+            first = false;
+
+            query.push_str(" ISOLATION LEVEL ");
+            let level = match level {
+                IsolationLevel::ReadUncommitted => "READ UNCOMMITTED",
+                IsolationLevel::ReadCommitted => "READ COMMITTED",
+                IsolationLevel::RepeatableRead => "REPEATABLE READ",
+                IsolationLevel::Serializable => "SERIALIZABLE",
+            };
+            query.push_str(level);
+        }
+
+        if let Some(read_only) = self.read_only {
+            if !first {
+                query.push(',');
+            }
+            first = false;
+
+            let s = if read_only {
+                " READ ONLY"
+            } else {
+                " READ WRITE"
+            };
+            query.push_str(s);
+        }
+
+        if let Some(deferrable) = self.deferrable {
+            if !first {
+                query.push(',');
+            }
+
+            let s = if deferrable {
+                " DEFERRABLE"
+            } else {
+                " NOT DEFERRABLE"
+            };
+            query.push_str(s);
+        }
+
+        self.client.batch_execute(&query).await?;
+
+        Ok(Transaction::new(self.client))
+    }
+}
diff --git a/libs/proxy/tokio-postgres2/src/types.rs b/libs/proxy/tokio-postgres2/src/types.rs
new file mode 100644
index 0000000000..e571d7ee00
--- /dev/null
+++ b/libs/proxy/tokio-postgres2/src/types.rs
@@ -0,0 +1,6 @@
+//! Types.
+//!
+//! This module is a reexport of the `postgres_types` crate.
+
+#[doc(inline)]
+pub use postgres_types2::*;
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index 1665d6361a..0d774d529d 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -55,6 +55,7 @@ parquet.workspace = true
 parquet_derive.workspace = true
 pin-project-lite.workspace = true
 postgres_backend.workspace = true
+postgres-protocol = { package = "postgres-protocol2", path = "../libs/proxy/postgres-protocol2" }
 pq_proto.workspace = true
 prometheus.workspace = true
 rand.workspace = true
@@ -80,8 +81,7 @@ subtle.workspace = true
 thiserror.workspace = true
 tikv-jemallocator.workspace = true
 tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] }
-tokio-postgres = { workspace = true, features = ["with-serde_json-1"] }
-tokio-postgres-rustls.workspace = true
+tokio-postgres = { package = "tokio-postgres2", path = "../libs/proxy/tokio-postgres2" }
 tokio-rustls.workspace = true
 tokio-util.workspace = true
 tokio = { workspace = true, features = ["signal"] }
@@ -96,7 +96,6 @@ utils.workspace = true
 uuid.workspace = true
 rustls-native-certs.workspace = true
 x509-parser.workspace = true
-postgres-protocol.workspace = true
 redis.workspace = true
 zerocopy.workspace = true
 
@@ -117,6 +116,5 @@ tokio-tungstenite.workspace = true
 pbkdf2 = { workspace = true, features = ["simple", "std"] }
 rcgen.workspace = true
 rstest.workspace = true
-tokio-postgres-rustls.workspace = true
 walkdir.workspace = true
 rand_distr = "0.4"
diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index 8408d4720b..2abe88ac88 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -13,7 +13,6 @@ use rustls::pki_types::InvalidDnsNameError;
 use thiserror::Error;
 use tokio::net::TcpStream;
 use tokio_postgres::tls::MakeTlsConnect;
-use tokio_postgres_rustls::MakeRustlsConnect;
 use tracing::{debug, error, info, warn};
 
 use crate::auth::parse_endpoint_param;
@@ -24,6 +23,7 @@ use crate::control_plane::errors::WakeComputeError;
 use crate::control_plane::messages::MetricsAuxInfo;
 use crate::error::{ReportableError, UserFacingError};
 use crate::metrics::{Metrics, NumDbConnectionsGuard};
+use crate::postgres_rustls::MakeRustlsConnect;
 use crate::proxy::neon_option;
 use crate::types::Host;
 
@@ -244,7 +244,6 @@ impl ConnCfg {
             let port = ports.get(i).or_else(|| ports.first()).unwrap_or(&5432);
             let host = match host {
                 Host::Tcp(host) => host.as_str(),
-                Host::Unix(_) => continue, // unix sockets are not welcome here
             };
 
             match connect_once(host, *port).await {
@@ -315,7 +314,7 @@ impl ConnCfg {
         };
         let client_config = client_config.with_no_client_auth();
 
-        let mut mk_tls = tokio_postgres_rustls::MakeRustlsConnect::new(client_config);
+        let mut mk_tls = crate::postgres_rustls::MakeRustlsConnect::new(client_config);
         let tls = <MakeRustlsConnect as MakeTlsConnect<tokio::net::TcpStream>>::make_tls_connect(
             &mut mk_tls,
             host,
diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs
index 5c19a23e36..4a063a5faa 100644
--- a/proxy/src/context/mod.rs
+++ b/proxy/src/context/mod.rs
@@ -414,6 +414,7 @@ impl RequestContextInner {
                     outcome,
                 });
         }
+
         if let Some(tx) = self.sender.take() {
             // If type changes, this error handling needs to be updated.
             let tx: mpsc::UnboundedSender<RequestData> = tx;
diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs
index ad7e1d2771..ba69f9cf2d 100644
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -88,6 +88,7 @@ pub mod jemalloc;
 pub mod logging;
 pub mod metrics;
 pub mod parse;
+pub mod postgres_rustls;
 pub mod protocol2;
 pub mod proxy;
 pub mod rate_limiter;
diff --git a/proxy/src/postgres_rustls/mod.rs b/proxy/src/postgres_rustls/mod.rs
new file mode 100644
index 0000000000..31e7915e89
--- /dev/null
+++ b/proxy/src/postgres_rustls/mod.rs
@@ -0,0 +1,158 @@
+use std::convert::TryFrom;
+use std::sync::Arc;
+
+use rustls::pki_types::ServerName;
+use rustls::ClientConfig;
+use tokio::io::{AsyncRead, AsyncWrite};
+use tokio_postgres::tls::MakeTlsConnect;
+
+mod private {
+    use std::future::Future;
+    use std::io;
+    use std::pin::Pin;
+    use std::task::{Context, Poll};
+
+    use rustls::pki_types::ServerName;
+    use tokio::io::{AsyncRead, AsyncWrite, ReadBuf};
+    use tokio_postgres::tls::{ChannelBinding, TlsConnect};
+    use tokio_rustls::client::TlsStream;
+    use tokio_rustls::TlsConnector;
+
+    use crate::config::TlsServerEndPoint;
+
+    pub struct TlsConnectFuture<S> {
+        inner: tokio_rustls::Connect<S>,
+    }
+
+    impl<S> Future for TlsConnectFuture<S>
+    where
+        S: AsyncRead + AsyncWrite + Unpin,
+    {
+        type Output = io::Result<RustlsStream<S>>;
+
+        fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
+            Pin::new(&mut self.inner).poll(cx).map_ok(RustlsStream)
+        }
+    }
+
+    pub struct RustlsConnect(pub RustlsConnectData);
+
+    pub struct RustlsConnectData {
+        pub hostname: ServerName<'static>,
+        pub connector: TlsConnector,
+    }
+
+    impl<S> TlsConnect<S> for RustlsConnect
+    where
+        S: AsyncRead + AsyncWrite + Unpin + Send + 'static,
+    {
+        type Stream = RustlsStream<S>;
+        type Error = io::Error;
+        type Future = TlsConnectFuture<S>;
+
+        fn connect(self, stream: S) -> Self::Future {
+            TlsConnectFuture {
+                inner: self.0.connector.connect(self.0.hostname, stream),
+            }
+        }
+    }
+
+    pub struct RustlsStream<S>(TlsStream<S>);
+
+    impl<S> tokio_postgres::tls::TlsStream for RustlsStream<S>
+    where
+        S: AsyncRead + AsyncWrite + Unpin,
+    {
+        fn channel_binding(&self) -> ChannelBinding {
+            let (_, session) = self.0.get_ref();
+            match session.peer_certificates() {
+                Some([cert, ..]) => TlsServerEndPoint::new(cert)
+                    .ok()
+                    .and_then(|cb| match cb {
+                        TlsServerEndPoint::Sha256(hash) => Some(hash),
+                        TlsServerEndPoint::Undefined => None,
+                    })
+                    .map_or_else(ChannelBinding::none, |hash| {
+                        ChannelBinding::tls_server_end_point(hash.to_vec())
+                    }),
+                _ => ChannelBinding::none(),
+            }
+        }
+    }
+
+    impl<S> AsyncRead for RustlsStream<S>
+    where
+        S: AsyncRead + AsyncWrite + Unpin,
+    {
+        fn poll_read(
+            mut self: Pin<&mut Self>,
+            cx: &mut Context<'_>,
+            buf: &mut ReadBuf<'_>,
+        ) -> Poll<tokio::io::Result<()>> {
+            Pin::new(&mut self.0).poll_read(cx, buf)
+        }
+    }
+
+    impl<S> AsyncWrite for RustlsStream<S>
+    where
+        S: AsyncRead + AsyncWrite + Unpin,
+    {
+        fn poll_write(
+            mut self: Pin<&mut Self>,
+            cx: &mut Context<'_>,
+            buf: &[u8],
+        ) -> Poll<tokio::io::Result<usize>> {
+            Pin::new(&mut self.0).poll_write(cx, buf)
+        }
+
+        fn poll_flush(
+            mut self: Pin<&mut Self>,
+            cx: &mut Context<'_>,
+        ) -> Poll<tokio::io::Result<()>> {
+            Pin::new(&mut self.0).poll_flush(cx)
+        }
+
+        fn poll_shutdown(
+            mut self: Pin<&mut Self>,
+            cx: &mut Context<'_>,
+        ) -> Poll<tokio::io::Result<()>> {
+            Pin::new(&mut self.0).poll_shutdown(cx)
+        }
+    }
+}
+
+/// A `MakeTlsConnect` implementation using `rustls`.
+///
+/// That way you can connect to PostgreSQL using `rustls` as the TLS stack.
+#[derive(Clone)]
+pub struct MakeRustlsConnect {
+    config: Arc<ClientConfig>,
+}
+
+impl MakeRustlsConnect {
+    /// Creates a new `MakeRustlsConnect` from the provided `ClientConfig`.
+    #[must_use]
+    pub fn new(config: ClientConfig) -> Self {
+        Self {
+            config: Arc::new(config),
+        }
+    }
+}
+
+impl<S> MakeTlsConnect<S> for MakeRustlsConnect
+where
+    S: AsyncRead + AsyncWrite + Unpin + Send + 'static,
+{
+    type Stream = private::RustlsStream<S>;
+    type TlsConnect = private::RustlsConnect;
+    type Error = rustls::pki_types::InvalidDnsNameError;
+
+    fn make_tls_connect(&mut self, hostname: &str) -> Result<Self::TlsConnect, Self::Error> {
+        ServerName::try_from(hostname).map(|dns_name| {
+            private::RustlsConnect(private::RustlsConnectData {
+                hostname: dns_name.to_owned(),
+                connector: Arc::clone(&self.config).into(),
+            })
+        })
+    }
+}
diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs
index 3de8ca8736..2c2c2964b6 100644
--- a/proxy/src/proxy/tests/mod.rs
+++ b/proxy/src/proxy/tests/mod.rs
@@ -14,7 +14,6 @@ use rustls::pki_types;
 use tokio::io::DuplexStream;
 use tokio_postgres::config::SslMode;
 use tokio_postgres::tls::{MakeTlsConnect, NoTls};
-use tokio_postgres_rustls::MakeRustlsConnect;
 
 use super::connect_compute::ConnectMechanism;
 use super::retry::CouldRetry;
@@ -29,6 +28,7 @@ use crate::control_plane::{
     self, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo, NodeInfoCache,
 };
 use crate::error::ErrorKind;
+use crate::postgres_rustls::MakeRustlsConnect;
 use crate::types::{BranchId, EndpointId, ProjectId};
 use crate::{sasl, scram};
 
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index 3037e20888..75909f3358 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -333,7 +333,7 @@ impl PoolingBackend {
             debug!("setting up backend session state");
 
             // initiates the auth session
-            if let Err(e) = client.query("select auth.init()", &[]).await {
+            if let Err(e) = client.execute("select auth.init()", &[]).await {
                 discard.discard();
                 return Err(e.into());
             }
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index bd262f45ed..c302eac568 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -6,9 +6,10 @@ use std::task::{ready, Poll};
 use futures::future::poll_fn;
 use futures::Future;
 use smallvec::SmallVec;
+use tokio::net::TcpStream;
 use tokio::time::Instant;
 use tokio_postgres::tls::NoTlsStream;
-use tokio_postgres::{AsyncMessage, Socket};
+use tokio_postgres::AsyncMessage;
 use tokio_util::sync::CancellationToken;
 use tracing::{error, info, info_span, warn, Instrument};
 #[cfg(test)]
@@ -57,7 +58,7 @@ pub(crate) fn poll_client<C: ClientInnerExt>(
     ctx: &RequestContext,
     conn_info: ConnInfo,
     client: C,
-    mut connection: tokio_postgres::Connection<Socket, NoTlsStream>,
+    mut connection: tokio_postgres::Connection<TcpStream, NoTlsStream>,
     conn_id: uuid::Uuid,
     aux: MetricsAuxInfo,
 ) -> Client<C> {
diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs
index 9abe35db08..db9ac49dae 100644
--- a/proxy/src/serverless/local_conn_pool.rs
+++ b/proxy/src/serverless/local_conn_pool.rs
@@ -24,10 +24,11 @@ use p256::ecdsa::{Signature, SigningKey};
 use parking_lot::RwLock;
 use serde_json::value::RawValue;
 use signature::Signer;
+use tokio::net::TcpStream;
 use tokio::time::Instant;
 use tokio_postgres::tls::NoTlsStream;
 use tokio_postgres::types::ToSql;
-use tokio_postgres::{AsyncMessage, Socket};
+use tokio_postgres::AsyncMessage;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info, info_span, warn, Instrument};
 
@@ -163,7 +164,7 @@ pub(crate) fn poll_client<C: ClientInnerExt>(
     ctx: &RequestContext,
     conn_info: ConnInfo,
     client: C,
-    mut connection: tokio_postgres::Connection<Socket, NoTlsStream>,
+    mut connection: tokio_postgres::Connection<TcpStream, NoTlsStream>,
     key: SigningKey,
     conn_id: uuid::Uuid,
     aux: MetricsAuxInfo,
@@ -286,11 +287,11 @@ impl ClientInnerCommon<tokio_postgres::Client> {
             let token = resign_jwt(&local_data.key, payload, local_data.jti)?;
 
             // initiates the auth session
-            self.inner.simple_query("discard all").await?;
+            self.inner.batch_execute("discard all").await?;
             self.inner
-                .query(
+                .execute(
                     "select auth.jwt_session_init($1)",
-                    &[&token as &(dyn ToSql + Sync)],
+                    &[&&*token as &(dyn ToSql + Sync)],
                 )
                 .await?;
 
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index a73d9d6352..c0a3abc377 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -60,7 +60,6 @@ num-integer = { version = "0.1", features = ["i128"] }
 num-traits = { version = "0.2", features = ["i128", "libm"] }
 once_cell = { version = "1" }
 parquet = { version = "53", default-features = false, features = ["zstd"] }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon", default-features = false, features = ["with-serde_json-1"] }
 prost = { version = "0.13", features = ["prost-derive"] }
 rand = { version = "0.8", features = ["small_rng"] }
 regex = { version = "1" }
@@ -79,8 +78,7 @@ subtle = { version = "2" }
 sync_wrapper = { version = "0.1", default-features = false, features = ["futures"] }
 tikv-jemalloc-sys = { version = "0.6", features = ["stats"] }
 time = { version = "0.3", features = ["macros", "serde-well-known"] }
-tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon", features = ["with-serde_json-1"] }
+tokio = { version = "1", features = ["full", "test-util"] }
 tokio-rustls = { version = "0.26", default-features = false, features = ["logging", "ring", "tls12"] }
 tokio-stream = { version = "0.1", features = ["net"] }
 tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] }

From ea3798e3b30f808f2851f58ff2390150b89959c6 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 29 Nov 2024 13:27:49 +0000
Subject: [PATCH 010/117] storage controller: use proper ScheduleContext when
 evacuating a node (#9908)

## Problem

When picking locations for a shard, we should use a ScheduleContext that
includes all the other shards in the tenant, so that we apply proper
anti-affinity between shards. If we don't do this, then it can lead to
unstable scheduling, where we place a shard somewhere that the optimizer
will then immediately move it away from.

We didn't always do this, because it was a bit awkward to accumulate the
context for a tenant rather than just walking tenants.

This was a TODO in `handle_node_availability_transition`:
```
                        // TODO: populate a ScheduleContext including all shards in the same tenant_id (only matters
                        // for tenants without secondary locations: if they have a secondary location, then this
                        // schedule() call is just promoting an existing secondary)
```

This is a precursor to https://github.com/neondatabase/neon/issues/8264,
where the current imperfect scheduling during node evacuation hampers
testing.

## Summary of changes

- Add an iterator type that yields each shard along with a
schedulecontext that includes all the other shards from the same tenant
- Use the iterator to replace hand-crafted logic in optimize_all_plan
(functionally identical)
- Use the iterator in `handle_node_availability_transition` to apply
proper anti-affinity during node evacuation.
---
 storage_controller/src/scheduler.rs           |  17 +-
 storage_controller/src/service.rs             | 200 +++++++-----------
 .../src/service/context_iterator.rs           | 139 ++++++++++++
 storage_controller/src/tenant_shard.rs        |  11 +-
 4 files changed, 245 insertions(+), 122 deletions(-)
 create mode 100644 storage_controller/src/service/context_iterator.rs

diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs
index 2414d95eb8..ecc6b11e47 100644
--- a/storage_controller/src/scheduler.rs
+++ b/storage_controller/src/scheduler.rs
@@ -305,7 +305,7 @@ impl std::ops::Add for AffinityScore {
 
 /// Hint for whether this is a sincere attempt to schedule, or a speculative
 /// check for where we _would_ schedule (done during optimization)
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub(crate) enum ScheduleMode {
     Normal,
     Speculative,
@@ -319,7 +319,7 @@ impl Default for ScheduleMode {
 
 // For carrying state between multiple calls to [`TenantShard::schedule`], e.g. when calling
 // it for many shards in the same tenant.
-#[derive(Debug, Default)]
+#[derive(Debug, Default, Clone)]
 pub(crate) struct ScheduleContext {
     /// Sparse map of nodes: omitting a node implicitly makes its affinity [`AffinityScore::FREE`]
     pub(crate) nodes: HashMap<NodeId, AffinityScore>,
@@ -331,6 +331,14 @@ pub(crate) struct ScheduleContext {
 }
 
 impl ScheduleContext {
+    pub(crate) fn new(mode: ScheduleMode) -> Self {
+        Self {
+            nodes: HashMap::new(),
+            attached_nodes: HashMap::new(),
+            mode,
+        }
+    }
+
     /// Input is a list of nodes we would like to avoid using again within this context.  The more
     /// times a node is passed into this call, the less inclined we are to use it.
     pub(crate) fn avoid(&mut self, nodes: &[NodeId]) {
@@ -355,6 +363,11 @@ impl ScheduleContext {
     pub(crate) fn get_node_attachments(&self, node_id: NodeId) -> usize {
         self.attached_nodes.get(&node_id).copied().unwrap_or(0)
     }
+
+    #[cfg(test)]
+    pub(crate) fn attach_count(&self) -> usize {
+        self.attached_nodes.values().sum()
+    }
 }
 
 pub(crate) enum RefCountUpdate {
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 446c476b99..636ccf11a1 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -1,3 +1,6 @@
+pub mod chaos_injector;
+mod context_iterator;
+
 use hyper::Uri;
 use std::{
     borrow::Cow,
@@ -95,7 +98,7 @@ use crate::{
     },
 };
 
-pub mod chaos_injector;
+use context_iterator::TenantShardContextIterator;
 
 // For operations that should be quick, like attaching a new tenant
 const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5);
@@ -5498,49 +5501,51 @@ impl Service {
 
                 let mut tenants_affected: usize = 0;
 
-                for (tenant_shard_id, tenant_shard) in tenants {
-                    if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) {
-                        // When a node goes offline, we set its observed configuration to None, indicating unknown: we will
-                        // not assume our knowledge of the node's configuration is accurate until it comes back online
-                        observed_loc.conf = None;
-                    }
+                for (_tenant_id, mut schedule_context, shards) in
+                    TenantShardContextIterator::new(tenants, ScheduleMode::Normal)
+                {
+                    for tenant_shard in shards {
+                        let tenant_shard_id = tenant_shard.tenant_shard_id;
+                        if let Some(observed_loc) =
+                            tenant_shard.observed.locations.get_mut(&node_id)
+                        {
+                            // When a node goes offline, we set its observed configuration to None, indicating unknown: we will
+                            // not assume our knowledge of the node's configuration is accurate until it comes back online
+                            observed_loc.conf = None;
+                        }
 
-                    if nodes.len() == 1 {
-                        // Special case for single-node cluster: there is no point trying to reschedule
-                        // any tenant shards: avoid doing so, in order to avoid spewing warnings about
-                        // failures to schedule them.
-                        continue;
-                    }
+                        if nodes.len() == 1 {
+                            // Special case for single-node cluster: there is no point trying to reschedule
+                            // any tenant shards: avoid doing so, in order to avoid spewing warnings about
+                            // failures to schedule them.
+                            continue;
+                        }
 
-                    if !nodes
-                        .values()
-                        .any(|n| matches!(n.may_schedule(), MaySchedule::Yes(_)))
-                    {
-                        // Special case for when all nodes are unavailable and/or unschedulable: there is no point
-                        // trying to reschedule since there's nowhere else to go. Without this
-                        // branch we incorrectly detach tenants in response to node unavailability.
-                        continue;
-                    }
+                        if !nodes
+                            .values()
+                            .any(|n| matches!(n.may_schedule(), MaySchedule::Yes(_)))
+                        {
+                            // Special case for when all nodes are unavailable and/or unschedulable: there is no point
+                            // trying to reschedule since there's nowhere else to go. Without this
+                            // branch we incorrectly detach tenants in response to node unavailability.
+                            continue;
+                        }
 
-                    if tenant_shard.intent.demote_attached(scheduler, node_id) {
-                        tenant_shard.sequence = tenant_shard.sequence.next();
+                        if tenant_shard.intent.demote_attached(scheduler, node_id) {
+                            tenant_shard.sequence = tenant_shard.sequence.next();
 
-                        // TODO: populate a ScheduleContext including all shards in the same tenant_id (only matters
-                        // for tenants without secondary locations: if they have a secondary location, then this
-                        // schedule() call is just promoting an existing secondary)
-                        let mut schedule_context = ScheduleContext::default();
-
-                        match tenant_shard.schedule(scheduler, &mut schedule_context) {
-                            Err(e) => {
-                                // It is possible that some tenants will become unschedulable when too many pageservers
-                                // go offline: in this case there isn't much we can do other than make the issue observable.
-                                // TODO: give TenantShard a scheduling error attribute to be queried later.
-                                tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", node_id);
-                            }
-                            Ok(()) => {
-                                if self.maybe_reconcile_shard(tenant_shard, nodes).is_some() {
-                                    tenants_affected += 1;
-                                };
+                            match tenant_shard.schedule(scheduler, &mut schedule_context) {
+                                Err(e) => {
+                                    // It is possible that some tenants will become unschedulable when too many pageservers
+                                    // go offline: in this case there isn't much we can do other than make the issue observable.
+                                    // TODO: give TenantShard a scheduling error attribute to be queried later.
+                                    tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", node_id);
+                                }
+                                Ok(()) => {
+                                    if self.maybe_reconcile_shard(tenant_shard, nodes).is_some() {
+                                        tenants_affected += 1;
+                                    };
+                                }
                             }
                         }
                     }
@@ -6011,14 +6016,8 @@ impl Service {
         let (nodes, tenants, _scheduler) = locked.parts_mut();
         let pageservers = nodes.clone();
 
-        let mut schedule_context = ScheduleContext::default();
-
         let mut reconciles_spawned = 0;
-        for (tenant_shard_id, shard) in tenants.iter_mut() {
-            if tenant_shard_id.is_shard_zero() {
-                schedule_context = ScheduleContext::default();
-            }
-
+        for shard in tenants.values_mut() {
             // Skip checking if this shard is already enqueued for reconciliation
             if shard.delayed_reconcile && self.reconciler_concurrency.available_permits() == 0 {
                 // If there is something delayed, then return a nonzero count so that
@@ -6033,8 +6032,6 @@ impl Service {
             if self.maybe_reconcile_shard(shard, &pageservers).is_some() {
                 reconciles_spawned += 1;
             }
-
-            schedule_context.avoid(&shard.intent.all_pageservers());
         }
 
         reconciles_spawned
@@ -6103,95 +6100,62 @@ impl Service {
     }
 
     fn optimize_all_plan(&self) -> Vec<(TenantShardId, ScheduleOptimization)> {
-        let mut schedule_context = ScheduleContext::default();
-
-        let mut tenant_shards: Vec<&TenantShard> = Vec::new();
-
         // How many candidate optimizations we will generate, before evaluating them for readniess: setting
         // this higher than the execution limit gives us a chance to execute some work even if the first
         // few optimizations we find are not ready.
         const MAX_OPTIMIZATIONS_PLAN_PER_PASS: usize = 8;
 
         let mut work = Vec::new();
-
         let mut locked = self.inner.write().unwrap();
         let (nodes, tenants, scheduler) = locked.parts_mut();
-        for (tenant_shard_id, shard) in tenants.iter() {
-            if tenant_shard_id.is_shard_zero() {
-                // Reset accumulators on the first shard in a tenant
-                schedule_context = ScheduleContext::default();
-                schedule_context.mode = ScheduleMode::Speculative;
-                tenant_shards.clear();
-            }
 
-            if work.len() >= MAX_OPTIMIZATIONS_PLAN_PER_PASS {
-                break;
-            }
-
-            match shard.get_scheduling_policy() {
-                ShardSchedulingPolicy::Active => {
-                    // Ok to do optimization
+        for (_tenant_id, schedule_context, shards) in
+            TenantShardContextIterator::new(tenants, ScheduleMode::Speculative)
+        {
+            for shard in shards {
+                if work.len() >= MAX_OPTIMIZATIONS_PLAN_PER_PASS {
+                    break;
                 }
-                ShardSchedulingPolicy::Essential
-                | ShardSchedulingPolicy::Pause
-                | ShardSchedulingPolicy::Stop => {
-                    // Policy prevents optimizing this shard.
-                    continue;
+                match shard.get_scheduling_policy() {
+                    ShardSchedulingPolicy::Active => {
+                        // Ok to do optimization
+                    }
+                    ShardSchedulingPolicy::Essential
+                    | ShardSchedulingPolicy::Pause
+                    | ShardSchedulingPolicy::Stop => {
+                        // Policy prevents optimizing this shard.
+                        continue;
+                    }
                 }
-            }
 
-            // Accumulate the schedule context for all the shards in a tenant: we must have
-            // the total view of all shards before we can try to optimize any of them.
-            schedule_context.avoid(&shard.intent.all_pageservers());
-            if let Some(attached) = shard.intent.get_attached() {
-                schedule_context.push_attached(*attached);
-            }
-            tenant_shards.push(shard);
-
-            // Once we have seen the last shard in the tenant, proceed to search across all shards
-            // in the tenant for optimizations
-            if shard.shard.number.0 == shard.shard.count.count() - 1 {
-                if tenant_shards.iter().any(|s| s.reconciler.is_some()) {
+                if !matches!(shard.splitting, SplitState::Idle)
+                    || matches!(shard.policy, PlacementPolicy::Detached)
+                    || shard.reconciler.is_some()
+                {
                     // Do not start any optimizations while another change to the tenant is ongoing: this
                     // is not necessary for correctness, but simplifies operations and implicitly throttles
                     // optimization changes to happen in a "trickle" over time.
                     continue;
                 }
 
-                if tenant_shards.iter().any(|s| {
-                    !matches!(s.splitting, SplitState::Idle)
-                        || matches!(s.policy, PlacementPolicy::Detached)
-                }) {
-                    // Never attempt to optimize a tenant that is currently being split, or
-                    // a tenant that is meant to be detached
-                    continue;
-                }
-
                 // TODO: optimization calculations are relatively expensive: create some fast-path for
                 // the common idle case (avoiding the search on tenants that we have recently checked)
-
-                for shard in &tenant_shards {
-                    if let Some(optimization) =
-                        // If idle, maybe ptimize attachments: if a shard has a secondary location that is preferable to
-                        // its primary location based on soft constraints, cut it over.
-                        shard.optimize_attachment(nodes, &schedule_context)
-                    {
-                        work.push((shard.tenant_shard_id, optimization));
-                        break;
-                    } else if let Some(optimization) =
-                        // If idle, maybe optimize secondary locations: if a shard has a secondary location that would be
-                        // better placed on another node, based on ScheduleContext, then adjust it.  This
-                        // covers cases like after a shard split, where we might have too many shards
-                        // in the same tenant with secondary locations on the node where they originally split.
-                        shard.optimize_secondary(scheduler, &schedule_context)
-                    {
-                        work.push((shard.tenant_shard_id, optimization));
-                        break;
-                    }
-
-                    // TODO: extend this mechanism to prefer attaching on nodes with fewer attached
-                    // tenants (i.e. extend schedule state to distinguish attached from secondary counts),
-                    // for the total number of attachments on a node (not just within a tenant.)
+                if let Some(optimization) =
+                    // If idle, maybe ptimize attachments: if a shard has a secondary location that is preferable to
+                    // its primary location based on soft constraints, cut it over.
+                    shard.optimize_attachment(nodes, &schedule_context)
+                {
+                    work.push((shard.tenant_shard_id, optimization));
+                    break;
+                } else if let Some(optimization) =
+                    // If idle, maybe optimize secondary locations: if a shard has a secondary location that would be
+                    // better placed on another node, based on ScheduleContext, then adjust it.  This
+                    // covers cases like after a shard split, where we might have too many shards
+                    // in the same tenant with secondary locations on the node where they originally split.
+                    shard.optimize_secondary(scheduler, &schedule_context)
+                {
+                    work.push((shard.tenant_shard_id, optimization));
+                    break;
                 }
             }
         }
diff --git a/storage_controller/src/service/context_iterator.rs b/storage_controller/src/service/context_iterator.rs
new file mode 100644
index 0000000000..d38010a27e
--- /dev/null
+++ b/storage_controller/src/service/context_iterator.rs
@@ -0,0 +1,139 @@
+use std::collections::BTreeMap;
+
+use utils::id::TenantId;
+use utils::shard::TenantShardId;
+
+use crate::scheduler::{ScheduleContext, ScheduleMode};
+use crate::tenant_shard::TenantShard;
+
+/// When making scheduling decisions, it is useful to have the ScheduleContext for a whole
+/// tenant while considering the individual shards within it.  This iterator is a helper
+/// that gathers all the shards in a tenant and then yields them together with a ScheduleContext
+/// for the tenant.
+pub(super) struct TenantShardContextIterator<'a> {
+    schedule_mode: ScheduleMode,
+    inner: std::collections::btree_map::IterMut<'a, TenantShardId, TenantShard>,
+}
+
+impl<'a> TenantShardContextIterator<'a> {
+    pub(super) fn new(
+        tenants: &'a mut BTreeMap<TenantShardId, TenantShard>,
+        schedule_mode: ScheduleMode,
+    ) -> Self {
+        Self {
+            schedule_mode,
+            inner: tenants.iter_mut(),
+        }
+    }
+}
+
+impl<'a> Iterator for TenantShardContextIterator<'a> {
+    type Item = (TenantId, ScheduleContext, Vec<&'a mut TenantShard>);
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let mut tenant_shards = Vec::new();
+        let mut schedule_context = ScheduleContext::new(self.schedule_mode.clone());
+        loop {
+            let (tenant_shard_id, shard) = self.inner.next()?;
+
+            if tenant_shard_id.is_shard_zero() {
+                // Cleared on last shard of previous tenant
+                assert!(tenant_shards.is_empty());
+            }
+
+            // Accumulate the schedule context for all the shards in a tenant
+            schedule_context.avoid(&shard.intent.all_pageservers());
+            if let Some(attached) = shard.intent.get_attached() {
+                schedule_context.push_attached(*attached);
+            }
+            tenant_shards.push(shard);
+
+            if tenant_shard_id.shard_number.0 == tenant_shard_id.shard_count.count() - 1 {
+                return Some((tenant_shard_id.tenant_id, schedule_context, tenant_shards));
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::{collections::BTreeMap, str::FromStr};
+
+    use pageserver_api::controller_api::PlacementPolicy;
+    use utils::shard::{ShardCount, ShardNumber};
+
+    use crate::{
+        scheduler::test_utils::make_test_nodes, service::Scheduler,
+        tenant_shard::tests::make_test_tenant_with_id,
+    };
+
+    use super::*;
+
+    #[test]
+    fn test_context_iterator() {
+        // Hand-crafted tenant IDs to ensure they appear in the expected order when put into
+        // a btreemap & iterated
+        let mut t_1_shards = make_test_tenant_with_id(
+            TenantId::from_str("af0480929707ee75372337efaa5ecf96").unwrap(),
+            PlacementPolicy::Attached(1),
+            ShardCount(1),
+            None,
+        );
+        let t_2_shards = make_test_tenant_with_id(
+            TenantId::from_str("bf0480929707ee75372337efaa5ecf96").unwrap(),
+            PlacementPolicy::Attached(1),
+            ShardCount(4),
+            None,
+        );
+        let mut t_3_shards = make_test_tenant_with_id(
+            TenantId::from_str("cf0480929707ee75372337efaa5ecf96").unwrap(),
+            PlacementPolicy::Attached(1),
+            ShardCount(1),
+            None,
+        );
+
+        let t1_id = t_1_shards[0].tenant_shard_id.tenant_id;
+        let t2_id = t_2_shards[0].tenant_shard_id.tenant_id;
+        let t3_id = t_3_shards[0].tenant_shard_id.tenant_id;
+
+        let mut tenants = BTreeMap::new();
+        tenants.insert(t_1_shards[0].tenant_shard_id, t_1_shards.pop().unwrap());
+        for shard in t_2_shards {
+            tenants.insert(shard.tenant_shard_id, shard);
+        }
+        tenants.insert(t_3_shards[0].tenant_shard_id, t_3_shards.pop().unwrap());
+
+        let nodes = make_test_nodes(3, &[]);
+        let mut scheduler = Scheduler::new(nodes.values());
+        let mut context = ScheduleContext::default();
+        for shard in tenants.values_mut() {
+            shard.schedule(&mut scheduler, &mut context).unwrap();
+        }
+
+        let mut iter = TenantShardContextIterator::new(&mut tenants, ScheduleMode::Speculative);
+        let (tenant_id, context, shards) = iter.next().unwrap();
+        assert_eq!(tenant_id, t1_id);
+        assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0));
+        assert_eq!(shards.len(), 1);
+        assert_eq!(context.attach_count(), 1);
+
+        let (tenant_id, context, shards) = iter.next().unwrap();
+        assert_eq!(tenant_id, t2_id);
+        assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0));
+        assert_eq!(shards[1].tenant_shard_id.shard_number, ShardNumber(1));
+        assert_eq!(shards[2].tenant_shard_id.shard_number, ShardNumber(2));
+        assert_eq!(shards[3].tenant_shard_id.shard_number, ShardNumber(3));
+        assert_eq!(shards.len(), 4);
+        assert_eq!(context.attach_count(), 4);
+
+        let (tenant_id, context, shards) = iter.next().unwrap();
+        assert_eq!(tenant_id, t3_id);
+        assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0));
+        assert_eq!(shards.len(), 1);
+        assert_eq!(context.attach_count(), 1);
+
+        for shard in tenants.values_mut() {
+            shard.intent.clear(&mut scheduler);
+        }
+    }
+}
diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs
index 27c97d3b86..2eb98ee825 100644
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -1574,13 +1574,20 @@ pub(crate) mod tests {
         )
     }
 
-    fn make_test_tenant(
+    pub(crate) fn make_test_tenant(
         policy: PlacementPolicy,
         shard_count: ShardCount,
         preferred_az: Option<AvailabilityZone>,
     ) -> Vec<TenantShard> {
-        let tenant_id = TenantId::generate();
+        make_test_tenant_with_id(TenantId::generate(), policy, shard_count, preferred_az)
+    }
 
+    pub(crate) fn make_test_tenant_with_id(
+        tenant_id: TenantId,
+        policy: PlacementPolicy,
+        shard_count: ShardCount,
+        preferred_az: Option<AvailabilityZone>,
+    ) -> Vec<TenantShard> {
         (0..shard_count.count())
             .map(|i| {
                 let shard_number = ShardNumber(i);

From a6073b5013fb1513e1f9937642fb3610f62854dc Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Fri, 29 Nov 2024 15:38:04 +0200
Subject: [PATCH 011/117] safekeeper: use jemalloc (#9780)

## Problem

To add Safekeeper heap profiling in #9778, we need to switch to an
allocator that supports it. Pageserver and proxy already use jemalloc.

Touches #9534.

## Summary of changes

Use jemalloc in Safekeeper.
---
 Cargo.lock                        |  1 +
 safekeeper/Cargo.toml             |  1 +
 safekeeper/benches/receive_wal.rs | 30 +++++++++++++++++++++++++++++-
 safekeeper/src/bin/safekeeper.rs  |  3 +++
 4 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/Cargo.lock b/Cargo.lock
index f05c6311dd..abe69525c9 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5409,6 +5409,7 @@ dependencies = [
  "strum",
  "strum_macros",
  "thiserror",
+ "tikv-jemallocator",
  "tokio",
  "tokio-io-timeout",
  "tokio-postgres",
diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml
index 635a9222e1..0422c46ab1 100644
--- a/safekeeper/Cargo.toml
+++ b/safekeeper/Cargo.toml
@@ -41,6 +41,7 @@ serde_json.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
 thiserror.workspace = true
+tikv-jemallocator.workspace = true
 tokio = { workspace = true, features = ["fs"] }
 tokio-util = { workspace = true }
 tokio-io-timeout.workspace = true
diff --git a/safekeeper/benches/receive_wal.rs b/safekeeper/benches/receive_wal.rs
index c637b4fb24..8c4281cf52 100644
--- a/safekeeper/benches/receive_wal.rs
+++ b/safekeeper/benches/receive_wal.rs
@@ -6,6 +6,7 @@ mod benchutils;
 use std::io::Write as _;
 
 use benchutils::Env;
+use bytes::BytesMut;
 use camino_tempfile::tempfile;
 use criterion::{criterion_group, criterion_main, BatchSize, Bencher, Criterion};
 use itertools::Itertools as _;
@@ -23,6 +24,9 @@ const KB: usize = 1024;
 const MB: usize = 1024 * KB;
 const GB: usize = 1024 * MB;
 
+#[global_allocator]
+static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
+
 // Register benchmarks with Criterion.
 criterion_group!(
     name = benches;
@@ -30,7 +34,8 @@ criterion_group!(
     targets = bench_process_msg,
     bench_wal_acceptor,
     bench_wal_acceptor_throughput,
-    bench_file_write
+    bench_file_write,
+    bench_bytes_reserve,
 );
 criterion_main!(benches);
 
@@ -341,3 +346,26 @@ fn bench_file_write(c: &mut Criterion) {
         Ok(())
     }
 }
+
+/// Benchmarks the cost of memory allocations when receiving WAL messages. This emulates the logic
+/// in FeMessage::parse, which extends the read buffer. It is primarily intended to test jemalloc.
+fn bench_bytes_reserve(c: &mut Criterion) {
+    let mut g = c.benchmark_group("bytes_reserve");
+    for size in [1, 64, KB, 8 * KB, 128 * KB] {
+        g.throughput(criterion::Throughput::Bytes(size as u64));
+        g.bench_function(format!("size={size}"), |b| run_bench(b, size).unwrap());
+    }
+
+    fn run_bench(b: &mut Bencher, size: usize) -> anyhow::Result<()> {
+        let mut bytes = BytesMut::new();
+        let data = vec![0; size];
+
+        b.iter(|| {
+            bytes.reserve(size);
+            bytes.extend_from_slice(&data);
+            bytes.split_to(size).freeze();
+        });
+
+        Ok(())
+    }
+}
diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index 1248428d33..3659bcd7e0 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -48,6 +48,9 @@ use utils::{
     tcp_listener,
 };
 
+#[global_allocator]
+static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
+
 const PID_FILE_NAME: &str = "safekeeper.pid";
 const ID_FILE_NAME: &str = "safekeeper.id";
 

From 538e2312a617c65d489d391892c70b2e4d7407b5 Mon Sep 17 00:00:00 2001
From: Alexey Kondratov <kondratov.aleksey@gmail.com>
Date: Fri, 29 Nov 2024 14:55:56 +0100
Subject: [PATCH 012/117] feat(compute_ctl): Always set application_name
 (#9934)

## Problem

It was not always possible to judge what exactly some `cloud_admin`
connections were doing because we didn't consistently set
`application_name` everywhere.

## Summary of changes

Unify the way we connect to Postgres:
1. Switch to building configs everywhere
2. Always set `application_name` and make naming consistent

Follow-up for #9919
Part of neondatabase/cloud#20948
---
 compute_tools/src/bin/compute_ctl.rs      | 10 ++++-
 compute_tools/src/catalog.rs              |  7 +---
 compute_tools/src/checker.rs              |  3 +-
 compute_tools/src/compute.rs              | 49 ++++++++++++++++-------
 compute_tools/src/http/api.rs             | 11 +++--
 compute_tools/src/installed_extensions.rs | 14 +++----
 compute_tools/src/monitor.rs              | 13 +++---
 7 files changed, 64 insertions(+), 43 deletions(-)

diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs
index 6b670de2ea..b178d7abd6 100644
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -37,6 +37,7 @@ use std::collections::HashMap;
 use std::fs::File;
 use std::path::Path;
 use std::process::exit;
+use std::str::FromStr;
 use std::sync::atomic::Ordering;
 use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};
 use std::{thread, time::Duration};
@@ -322,8 +323,15 @@ fn wait_spec(
     } else {
         spec_set = false;
     }
+    let connstr = Url::parse(connstr).context("cannot parse connstr as a URL")?;
+    let conn_conf = postgres::config::Config::from_str(connstr.as_str())
+        .context("cannot build postgres config from connstr")?;
+    let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr.as_str())
+        .context("cannot build tokio postgres config from connstr")?;
     let compute_node = ComputeNode {
-        connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
+        connstr,
+        conn_conf,
+        tokio_conn_conf,
         pgdata: pgdata.to_string(),
         pgbin: pgbin.to_string(),
         pgversion: get_pg_version_string(pgbin),
diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs
index 08ae8bf44d..72198a9479 100644
--- a/compute_tools/src/catalog.rs
+++ b/compute_tools/src/catalog.rs
@@ -6,7 +6,6 @@ use tokio::{
     process::Command,
     spawn,
 };
-use tokio_postgres::connect;
 use tokio_stream::{self as stream, StreamExt};
 use tokio_util::codec::{BytesCodec, FramedRead};
 use tracing::warn;
@@ -16,10 +15,8 @@ use crate::pg_helpers::{get_existing_dbs_async, get_existing_roles_async, postgr
 use compute_api::responses::CatalogObjects;
 
 pub async fn get_dbs_and_roles(compute: &Arc<ComputeNode>) -> anyhow::Result<CatalogObjects> {
-    let connstr = compute.connstr.clone();
-
-    let (client, connection): (tokio_postgres::Client, _) =
-        connect(connstr.as_str(), NoTls).await?;
+    let conf = compute.get_tokio_conn_conf(Some("compute_ctl:get_dbs_and_roles"));
+    let (client, connection): (tokio_postgres::Client, _) = conf.connect(NoTls).await?;
 
     spawn(async move {
         if let Err(e) = connection.await {
diff --git a/compute_tools/src/checker.rs b/compute_tools/src/checker.rs
index cec2b1bed8..62d61a8bc9 100644
--- a/compute_tools/src/checker.rs
+++ b/compute_tools/src/checker.rs
@@ -9,7 +9,8 @@ use crate::compute::ComputeNode;
 #[instrument(skip_all)]
 pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
     // Connect to the database.
-    let (client, connection) = tokio_postgres::connect(compute.connstr.as_str(), NoTls).await?;
+    let conf = compute.get_tokio_conn_conf(Some("compute_ctl:availability_checker"));
+    let (client, connection) = conf.connect(NoTls).await?;
     if client.is_closed() {
         return Err(anyhow!("connection to postgres closed"));
     }
diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 1a026a4014..da1caf1a9b 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -20,8 +20,9 @@ use futures::future::join_all;
 use futures::stream::FuturesUnordered;
 use futures::StreamExt;
 use nix::unistd::Pid;
+use postgres;
 use postgres::error::SqlState;
-use postgres::{Client, NoTls};
+use postgres::NoTls;
 use tracing::{debug, error, info, instrument, warn};
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;
@@ -58,6 +59,10 @@ pub static PG_PID: AtomicU32 = AtomicU32::new(0);
 pub struct ComputeNode {
     // Url type maintains proper escaping
     pub connstr: url::Url,
+    // We connect to Postgres from many different places, so build configs once
+    // and reuse them where needed.
+    pub conn_conf: postgres::config::Config,
+    pub tokio_conn_conf: tokio_postgres::config::Config,
     pub pgdata: String,
     pub pgbin: String,
     pub pgversion: String,
@@ -800,10 +805,10 @@ impl ComputeNode {
     /// version. In the future, it may upgrade all 3rd-party extensions.
     #[instrument(skip_all)]
     pub fn post_apply_config(&self) -> Result<()> {
-        let connstr = self.connstr.clone();
+        let conf = self.get_conn_conf(Some("compute_ctl:post_apply_config"));
         thread::spawn(move || {
             let func = || {
-                let mut client = Client::connect(connstr.as_str(), NoTls)?;
+                let mut client = conf.connect(NoTls)?;
                 handle_neon_extension_upgrade(&mut client)
                     .context("handle_neon_extension_upgrade")?;
                 Ok::<_, anyhow::Error>(())
@@ -815,12 +820,27 @@ impl ComputeNode {
         Ok(())
     }
 
+    pub fn get_conn_conf(&self, application_name: Option<&str>) -> postgres::Config {
+        let mut conf = self.conn_conf.clone();
+        if let Some(application_name) = application_name {
+            conf.application_name(application_name);
+        }
+        conf
+    }
+
+    pub fn get_tokio_conn_conf(&self, application_name: Option<&str>) -> tokio_postgres::Config {
+        let mut conf = self.tokio_conn_conf.clone();
+        if let Some(application_name) = application_name {
+            conf.application_name(application_name);
+        }
+        conf
+    }
+
     async fn get_maintenance_client(
         conf: &tokio_postgres::Config,
     ) -> Result<tokio_postgres::Client> {
         let mut conf = conf.clone();
-
-        conf.application_name("apply_config");
+        conf.application_name("compute_ctl:apply_config");
 
         let (client, conn) = match conf.connect(NoTls).await {
             // If connection fails, it may be the old node with `zenith_admin` superuser.
@@ -837,6 +857,7 @@ impl ComputeNode {
                         e
                     );
                     let mut zenith_admin_conf = postgres::config::Config::from(conf.clone());
+                    zenith_admin_conf.application_name("compute_ctl:apply_config");
                     zenith_admin_conf.user("zenith_admin");
 
                     let mut client =
@@ -1134,8 +1155,7 @@ impl ComputeNode {
     /// Do initial configuration of the already started Postgres.
     #[instrument(skip_all)]
     pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> {
-        let mut conf = tokio_postgres::Config::from_str(self.connstr.as_str()).unwrap();
-        conf.application_name("apply_config");
+        let conf = self.get_tokio_conn_conf(Some("compute_ctl:apply_config"));
 
         let conf = Arc::new(conf);
         let spec = Arc::new(
@@ -1161,7 +1181,7 @@ impl ComputeNode {
         thread::spawn(move || {
             let conf = conf.as_ref().clone();
             let mut conf = postgres::config::Config::from(conf);
-            conf.application_name("migrations");
+            conf.application_name("compute_ctl:migrations");
 
             let mut client = conf.connect(NoTls)?;
             handle_migrations(&mut client).context("apply_config handle_migrations")
@@ -1369,9 +1389,9 @@ impl ComputeNode {
             }
             self.post_apply_config()?;
 
-            let connstr = self.connstr.clone();
+            let conf = self.get_conn_conf(None);
             thread::spawn(move || {
-                let res = get_installed_extensions(&connstr);
+                let res = get_installed_extensions(conf);
                 match res {
                     Ok(extensions) => {
                         info!(
@@ -1510,7 +1530,8 @@ impl ComputeNode {
     /// Select `pg_stat_statements` data and return it as a stringified JSON
     pub async fn collect_insights(&self) -> String {
         let mut result_rows: Vec<String> = Vec::new();
-        let connect_result = tokio_postgres::connect(self.connstr.as_str(), NoTls).await;
+        let conf = self.get_tokio_conn_conf(Some("compute_ctl:collect_insights"));
+        let connect_result = conf.connect(NoTls).await;
         let (client, connection) = connect_result.unwrap();
         tokio::spawn(async move {
             if let Err(e) = connection.await {
@@ -1636,10 +1657,9 @@ LIMIT 100",
         privileges: &[Privilege],
         role_name: &PgIdent,
     ) -> Result<()> {
-        use tokio_postgres::config::Config;
         use tokio_postgres::NoTls;
 
-        let mut conf = Config::from_str(self.connstr.as_str()).unwrap();
+        let mut conf = self.get_tokio_conn_conf(Some("compute_ctl:set_role_grants"));
         conf.dbname(db_name);
 
         let (db_client, conn) = conf
@@ -1676,10 +1696,9 @@ LIMIT 100",
         db_name: &PgIdent,
         ext_version: ExtVersion,
     ) -> Result<ExtVersion> {
-        use tokio_postgres::config::Config;
         use tokio_postgres::NoTls;
 
-        let mut conf = Config::from_str(self.connstr.as_str()).unwrap();
+        let mut conf = self.get_tokio_conn_conf(Some("compute_ctl:install_extension"));
         conf.dbname(db_name);
 
         let (db_client, conn) = conf
diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs
index a6c6cff20a..7fa6426d8f 100644
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -295,12 +295,11 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
                 return Response::new(Body::from(msg));
             }
 
-            let connstr = compute.connstr.clone();
-            let res = task::spawn_blocking(move || {
-                installed_extensions::get_installed_extensions(&connstr)
-            })
-            .await
-            .unwrap();
+            let conf = compute.get_conn_conf(None);
+            let res =
+                task::spawn_blocking(move || installed_extensions::get_installed_extensions(conf))
+                    .await
+                    .unwrap();
 
             match res {
                 Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())),
diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs
index f473c29a55..5f62f08858 100644
--- a/compute_tools/src/installed_extensions.rs
+++ b/compute_tools/src/installed_extensions.rs
@@ -10,8 +10,6 @@ use metrics::core::Collector;
 use metrics::{register_uint_gauge_vec, UIntGaugeVec};
 use once_cell::sync::Lazy;
 
-use crate::pg_helpers::postgres_conf_for_db;
-
 /// We don't reuse get_existing_dbs() just for code clarity
 /// and to make database listing query here more explicit.
 ///
@@ -41,14 +39,16 @@ fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
 ///
 /// Same extension can be installed in multiple databases with different versions,
 /// we only keep the highest and lowest version across all databases.
-pub fn get_installed_extensions(connstr: &url::Url) -> Result<InstalledExtensions> {
-    let mut client = Client::connect(connstr.as_str(), NoTls)?;
+pub fn get_installed_extensions(mut conf: postgres::config::Config) -> Result<InstalledExtensions> {
+    conf.application_name("compute_ctl:get_installed_extensions");
+    let mut client = conf.connect(NoTls)?;
+
     let databases: Vec<String> = list_dbs(&mut client)?;
 
     let mut extensions_map: HashMap<String, InstalledExtension> = HashMap::new();
     for db in databases.iter() {
-        let config = postgres_conf_for_db(connstr, db)?;
-        let mut db_client = config.connect(NoTls)?;
+        conf.dbname(db);
+        let mut db_client = conf.connect(NoTls)?;
         let extensions: Vec<(String, String)> = db_client
             .query(
                 "SELECT extname, extversion FROM pg_catalog.pg_extension;",
@@ -82,7 +82,7 @@ pub fn get_installed_extensions(connstr: &url::Url) -> Result<InstalledExtension
     }
 
     let res = InstalledExtensions {
-        extensions: extensions_map.values().cloned().collect(),
+        extensions: extensions_map.into_values().collect(),
     };
 
     Ok(res)
diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs
index d7127aac32..184f380a8d 100644
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -17,11 +17,8 @@ const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);
 // should be handled gracefully.
 fn watch_compute_activity(compute: &ComputeNode) {
     // Suppose that `connstr` doesn't change
-    let mut connstr = compute.connstr.clone();
-    connstr
-        .query_pairs_mut()
-        .append_pair("application_name", "compute_activity_monitor");
-    let connstr = connstr.as_str();
+    let connstr = compute.connstr.clone();
+    let conf = compute.get_conn_conf(Some("compute_ctl:activity_monitor"));
 
     // During startup and configuration we connect to every Postgres database,
     // but we don't want to count this as some user activity. So wait until
@@ -29,7 +26,7 @@ fn watch_compute_activity(compute: &ComputeNode) {
     wait_for_postgres_start(compute);
 
     // Define `client` outside of the loop to reuse existing connection if it's active.
-    let mut client = Client::connect(connstr, NoTls);
+    let mut client = conf.connect(NoTls);
 
     let mut sleep = false;
     let mut prev_active_time: Option<f64> = None;
@@ -57,7 +54,7 @@ fn watch_compute_activity(compute: &ComputeNode) {
                     info!("connection to Postgres is closed, trying to reconnect");
 
                     // Connection is closed, reconnect and try again.
-                    client = Client::connect(connstr, NoTls);
+                    client = conf.connect(NoTls);
                     continue;
                 }
 
@@ -196,7 +193,7 @@ fn watch_compute_activity(compute: &ComputeNode) {
                 debug!("could not connect to Postgres: {}, retrying", e);
 
                 // Establish a new connection and try again.
-                client = Client::connect(connstr, NoTls);
+                client = conf.connect(NoTls);
             }
         }
     }

From d5624cc50521098d16a49ad92a735184a48981ae Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Fri, 29 Nov 2024 15:11:44 +0000
Subject: [PATCH 013/117] pageserver: download small objects using a smaller
 timeout (#9938)

## Problem

It appears that the Azure storage API tends to hang TCP connections more
than S3 does.

Currently we use a 2 minute timeout for all downloads. This is large
because sometimes the objects we download are large. However, waiting 2
minutes when doing something like downloading a manifest on tenant
attach is problematic, because when someone is doing a "create tenant,
create timeline" workflow, that 2 minutes is long enough for them
reasonably to give up creating that timeline.

Rather than propagate oversized timeouts further up the stack, we should
use a different timeout for objects that we expect to be small.

Closes: https://github.com/neondatabase/neon/issues/9836

## Summary of changes

- Add a `small_timeout` configuration attribute to remote storage,
defaulting to 30 seconds (still a very generous period to do something
like download an index)
- Add a DownloadKind parameter to DownloadOpts, so that callers can
indicate whether they expect the object to be small or large.
- In the azure client, use small timeout for HEAD requests, and for GET
requests if DownloadKind::Small is used.
- Use DownloadKind::Small for manifests, indices, and heatmap downloads.

This PR intentionally does not make the equivalent change to the S3
client, to reduce blast radius in case this has unexpected consequences
(we could accomplish the same thing by editing lots of configs, but just
skipping the code is simpler for right now)
---
 libs/remote_storage/src/azure_blob.rs         | 23 ++++++++++++++---
 libs/remote_storage/src/config.rs             | 25 ++++++++++++++++---
 libs/remote_storage/src/lib.rs                | 20 ++++++++++++++-
 libs/remote_storage/tests/test_real_azure.rs  |  3 ++-
 libs/remote_storage/tests/test_real_s3.rs     |  1 +
 pageserver/src/deletion_queue.rs              |  1 +
 pageserver/src/tenant.rs                      |  1 +
 .../tenant/remote_timeline_client/download.rs | 21 +++++++++++++---
 pageserver/src/tenant/secondary/downloader.rs |  3 ++-
 .../import_pgdata/importbucket_client.rs      |  4 ++-
 proxy/src/context/parquet.rs                  |  2 ++
 11 files changed, 89 insertions(+), 15 deletions(-)

diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs
index 840917ef68..8d1962fa29 100644
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -35,6 +35,7 @@ use utils::backoff;
 use utils::backoff::exponential_backoff_duration_seconds;
 
 use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind};
+use crate::DownloadKind;
 use crate::{
     config::AzureConfig, error::Cancelled, ConcurrencyLimiter, Download, DownloadError,
     DownloadOpts, Listing, ListingMode, ListingObject, RemotePath, RemoteStorage, StorageMetadata,
@@ -49,10 +50,17 @@ pub struct AzureBlobStorage {
     concurrency_limiter: ConcurrencyLimiter,
     // Per-request timeout. Accessible for tests.
     pub timeout: Duration,
+
+    // Alternative timeout used for metadata objects which are expected to be small
+    pub small_timeout: Duration,
 }
 
 impl AzureBlobStorage {
-    pub fn new(azure_config: &AzureConfig, timeout: Duration) -> Result<Self> {
+    pub fn new(
+        azure_config: &AzureConfig,
+        timeout: Duration,
+        small_timeout: Duration,
+    ) -> Result<Self> {
         debug!(
             "Creating azure remote storage for azure container {}",
             azure_config.container_name
@@ -94,6 +102,7 @@ impl AzureBlobStorage {
             max_keys_per_list_response,
             concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()),
             timeout,
+            small_timeout,
         })
     }
 
@@ -133,6 +142,7 @@ impl AzureBlobStorage {
     async fn download_for_builder(
         &self,
         builder: GetBlobBuilder,
+        timeout: Duration,
         cancel: &CancellationToken,
     ) -> Result<Download, DownloadError> {
         let kind = RequestKind::Get;
@@ -156,7 +166,7 @@ impl AzureBlobStorage {
                 .map_err(to_download_error);
 
             // apply per request timeout
-            let response = tokio_stream::StreamExt::timeout(response, self.timeout);
+            let response = tokio_stream::StreamExt::timeout(response, timeout);
 
             // flatten
             let response = response.map(|res| match res {
@@ -415,7 +425,7 @@ impl RemoteStorage for AzureBlobStorage {
         let blob_client = self.client.blob_client(self.relative_path_to_name(key));
         let properties_future = blob_client.get_properties().into_future();
 
-        let properties_future = tokio::time::timeout(self.timeout, properties_future);
+        let properties_future = tokio::time::timeout(self.small_timeout, properties_future);
 
         let res = tokio::select! {
             res = properties_future => res,
@@ -521,7 +531,12 @@ impl RemoteStorage for AzureBlobStorage {
             });
         }
 
-        self.download_for_builder(builder, cancel).await
+        let timeout = match opts.kind {
+            DownloadKind::Small => self.small_timeout,
+            DownloadKind::Large => self.timeout,
+        };
+
+        self.download_for_builder(builder, timeout, cancel).await
     }
 
     async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> {
diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs
index e99ae4f747..f6ef31077c 100644
--- a/libs/remote_storage/src/config.rs
+++ b/libs/remote_storage/src/config.rs
@@ -24,6 +24,13 @@ pub struct RemoteStorageConfig {
         skip_serializing_if = "is_default_timeout"
     )]
     pub timeout: Duration,
+    /// Alternative timeout used for metadata objects which are expected to be small
+    #[serde(
+        with = "humantime_serde",
+        default = "default_small_timeout",
+        skip_serializing_if = "is_default_small_timeout"
+    )]
+    pub small_timeout: Duration,
 }
 
 impl RemoteStorageKind {
@@ -40,10 +47,18 @@ fn default_timeout() -> Duration {
     RemoteStorageConfig::DEFAULT_TIMEOUT
 }
 
+fn default_small_timeout() -> Duration {
+    RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT
+}
+
 fn is_default_timeout(d: &Duration) -> bool {
     *d == RemoteStorageConfig::DEFAULT_TIMEOUT
 }
 
+fn is_default_small_timeout(d: &Duration) -> bool {
+    *d == RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT
+}
+
 /// A kind of a remote storage to connect to, with its connection configuration.
 #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)]
 #[serde(untagged)]
@@ -184,6 +199,7 @@ fn serialize_storage_class<S: serde::Serializer>(
 
 impl RemoteStorageConfig {
     pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120);
+    pub const DEFAULT_SMALL_TIMEOUT: Duration = std::time::Duration::from_secs(30);
 
     pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<RemoteStorageConfig> {
         Ok(utils::toml_edit_ext::deserialize_item(toml)?)
@@ -219,7 +235,8 @@ timeout = '5s'";
                 storage: RemoteStorageKind::LocalFs {
                     local_path: Utf8PathBuf::from(".")
                 },
-                timeout: Duration::from_secs(5)
+                timeout: Duration::from_secs(5),
+                small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT
             }
         );
     }
@@ -247,7 +264,8 @@ timeout = '5s'";
                     max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE,
                     upload_storage_class: Some(StorageClass::IntelligentTiering),
                 }),
-                timeout: Duration::from_secs(7)
+                timeout: Duration::from_secs(7),
+                small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT
             }
         );
     }
@@ -299,7 +317,8 @@ timeout = '5s'";
                     concurrency_limit: default_remote_storage_azure_concurrency_limit(),
                     max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE,
                 }),
-                timeout: Duration::from_secs(7)
+                timeout: Duration::from_secs(7),
+                small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT
             }
         );
     }
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index 719608dd5f..0ece29d99e 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -178,6 +178,15 @@ pub struct DownloadOpts {
     /// The end of the byte range to download, or unbounded. Must be after the
     /// start bound.
     pub byte_end: Bound<u64>,
+    /// Indicate whether we're downloading something small or large: this indirectly controls
+    /// timeouts: for something like an index/manifest/heatmap, we should time out faster than
+    /// for layer files
+    pub kind: DownloadKind,
+}
+
+pub enum DownloadKind {
+    Large,
+    Small,
 }
 
 impl Default for DownloadOpts {
@@ -186,6 +195,7 @@ impl Default for DownloadOpts {
             etag: Default::default(),
             byte_start: Bound::Unbounded,
             byte_end: Bound::Unbounded,
+            kind: DownloadKind::Large,
         }
     }
 }
@@ -584,6 +594,10 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
 impl GenericRemoteStorage {
     pub async fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
         let timeout = storage_config.timeout;
+
+        // If somkeone overrides timeout to be small without adjusting small_timeout, then adjust it automatically
+        let small_timeout = std::cmp::min(storage_config.small_timeout, timeout);
+
         Ok(match &storage_config.storage {
             RemoteStorageKind::LocalFs { local_path: path } => {
                 info!("Using fs root '{path}' as a remote storage");
@@ -606,7 +620,11 @@ impl GenericRemoteStorage {
                     .unwrap_or("<AZURE_STORAGE_ACCOUNT>");
                 info!("Using azure container '{}' in account '{storage_account}' in region '{}' as a remote storage, prefix in container: '{:?}'",
                       azure_config.container_name, azure_config.container_region, azure_config.prefix_in_container);
-                Self::AzureBlob(Arc::new(AzureBlobStorage::new(azure_config, timeout)?))
+                Self::AzureBlob(Arc::new(AzureBlobStorage::new(
+                    azure_config,
+                    timeout,
+                    small_timeout,
+                )?))
             }
         })
     }
diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs
index 3a20649490..92d579fec8 100644
--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -219,7 +219,8 @@ async fn create_azure_client(
             concurrency_limit: NonZeroUsize::new(100).unwrap(),
             max_keys_per_list_response,
         }),
-        timeout: Duration::from_secs(120),
+        timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
+        small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT,
     };
     Ok(Arc::new(
         GenericRemoteStorage::from_config(&remote_storage_config)
diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs
index 3e99a65fac..e60ec18c93 100644
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -396,6 +396,7 @@ async fn create_s3_client(
             upload_storage_class: None,
         }),
         timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
+        small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT,
     };
     Ok(Arc::new(
         GenericRemoteStorage::from_config(&remote_storage_config)
diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs
index e74c8ecf5a..1d508f5fe9 100644
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -838,6 +838,7 @@ mod test {
                 local_path: remote_fs_dir.clone(),
             },
             timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
+            small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT,
         };
         let storage = GenericRemoteStorage::from_config(&storage_config)
             .await
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 339a3ca1bb..cd0690bb1a 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -5423,6 +5423,7 @@ pub(crate) mod harness {
                     local_path: remote_fs_dir.clone(),
                 },
                 timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
+                small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT,
             };
             let remote_storage = GenericRemoteStorage::from_config(&config).await.unwrap();
             let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone()));
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index d632e595ad..739615be9c 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -30,7 +30,9 @@ use crate::tenant::Generation;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
 use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
 use crate::TEMP_FILE_SUFFIX;
-use remote_storage::{DownloadError, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath};
+use remote_storage::{
+    DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath,
+};
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::{TenantId, TimelineId};
 use utils::pausable_failpoint;
@@ -345,12 +347,13 @@ pub async fn list_remote_timelines(
 async fn do_download_remote_path_retry_forever(
     storage: &GenericRemoteStorage,
     remote_path: &RemotePath,
+    download_opts: DownloadOpts,
     cancel: &CancellationToken,
 ) -> Result<(Vec<u8>, SystemTime), DownloadError> {
     download_retry_forever(
         || async {
             let download = storage
-                .download(remote_path, &DownloadOpts::default(), cancel)
+                .download(remote_path, &download_opts, cancel)
                 .await?;
 
             let mut bytes = Vec::new();
@@ -377,8 +380,13 @@ async fn do_download_tenant_manifest(
 ) -> Result<(TenantManifest, Generation, SystemTime), DownloadError> {
     let remote_path = remote_tenant_manifest_path(tenant_shard_id, generation);
 
+    let download_opts = DownloadOpts {
+        kind: DownloadKind::Small,
+        ..Default::default()
+    };
+
     let (manifest_bytes, manifest_bytes_mtime) =
-        do_download_remote_path_retry_forever(storage, &remote_path, cancel).await?;
+        do_download_remote_path_retry_forever(storage, &remote_path, download_opts, cancel).await?;
 
     let tenant_manifest = TenantManifest::from_json_bytes(&manifest_bytes)
         .with_context(|| format!("deserialize tenant manifest file at {remote_path:?}"))
@@ -398,8 +406,13 @@ async fn do_download_index_part(
         timeline_id.expect("A timeline ID is always provided when downloading an index");
     let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);
 
+    let download_opts = DownloadOpts {
+        kind: DownloadKind::Small,
+        ..Default::default()
+    };
+
     let (index_part_bytes, index_part_mtime) =
-        do_download_remote_path_retry_forever(storage, &remote_path, cancel).await?;
+        do_download_remote_path_retry_forever(storage, &remote_path, download_opts, cancel).await?;
 
     let index_part: IndexPart = serde_json::from_slice(&index_part_bytes)
         .with_context(|| format!("deserialize index part file at {remote_path:?}"))
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index 7443261a9c..8d771dc405 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -49,7 +49,7 @@ use futures::Future;
 use metrics::UIntGauge;
 use pageserver_api::models::SecondaryProgress;
 use pageserver_api::shard::TenantShardId;
-use remote_storage::{DownloadError, DownloadOpts, Etag, GenericRemoteStorage};
+use remote_storage::{DownloadError, DownloadKind, DownloadOpts, Etag, GenericRemoteStorage};
 
 use tokio_util::sync::CancellationToken;
 use tracing::{info_span, instrument, warn, Instrument};
@@ -946,6 +946,7 @@ impl<'a> TenantDownloader<'a> {
         let cancel = &self.secondary_state.cancel;
         let opts = DownloadOpts {
             etag: prev_etag.cloned(),
+            kind: DownloadKind::Small,
             ..Default::default()
         };
 
diff --git a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs
index 8d5ab1780f..bc4d148a29 100644
--- a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs
@@ -4,7 +4,8 @@ use anyhow::Context;
 use bytes::Bytes;
 use postgres_ffi::ControlFileData;
 use remote_storage::{
-    Download, DownloadError, DownloadOpts, GenericRemoteStorage, Listing, ListingObject, RemotePath,
+    Download, DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, Listing,
+    ListingObject, RemotePath,
 };
 use serde::de::DeserializeOwned;
 use tokio_util::sync::CancellationToken;
@@ -239,6 +240,7 @@ impl RemoteStorageWrapper {
                     .download(
                         path,
                         &DownloadOpts {
+                            kind: DownloadKind::Large,
                             etag: None,
                             byte_start: Bound::Included(start_inclusive),
                             byte_end: Bound::Excluded(end_exclusive)
diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs
index e328c6de79..b375eb886e 100644
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -486,6 +486,7 @@ mod tests {
                     upload_storage_class: None,
                 }),
                 timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
+                small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT,
             })
         );
         assert_eq!(parquet_upload.parquet_upload_row_group_size, 100);
@@ -545,6 +546,7 @@ mod tests {
                 local_path: tmpdir.to_path_buf(),
             },
             timeout: std::time::Duration::from_secs(120),
+            small_timeout: std::time::Duration::from_secs(30),
         };
         let storage = GenericRemoteStorage::from_config(&remote_storage_config)
             .await

From c848f25ec25e04afba9f2b0509372504b35cafe9 Mon Sep 17 00:00:00 2001
From: Gleb Novikov <NanoBjorn@users.noreply.github.com>
Date: Fri, 29 Nov 2024 17:58:36 +0000
Subject: [PATCH 014/117] Fixed fast_import pgbin in calling get_pg_version
 (#9933)

Was working on https://github.com/neondatabase/cloud/pull/20795 and
discovered that fast_import is not working normally.
---
 compute_tools/src/bin/fast_import.rs | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs
index 6716cc6234..b6db3eb11a 100644
--- a/compute_tools/src/bin/fast_import.rs
+++ b/compute_tools/src/bin/fast_import.rs
@@ -21,7 +21,7 @@
 //! - Build the image with the following command:
 //!
 //! ```bash
-//! docker buildx build --build-arg DEBIAN_FLAVOR=bullseye-slim --build-arg GIT_VERSION=local --build-arg PG_VERSION=v14 --build-arg BUILD_TAG="$(date --iso-8601=s -u)"  -t localhost:3030/localregistry/compute-node-v14:latest -f compute/Dockerfile.com
+//! docker buildx build --platform linux/amd64 --build-arg DEBIAN_VERSION=bullseye --build-arg GIT_VERSION=local --build-arg PG_VERSION=v14 --build-arg BUILD_TAG="$(date --iso-8601=s -u)" -t localhost:3030/localregistry/compute-node-v14:latest -f compute/compute-node.Dockerfile .
 //! docker push localhost:3030/localregistry/compute-node-v14:latest
 //! ```
 
@@ -132,7 +132,8 @@ pub(crate) async fn main() -> anyhow::Result<()> {
     //
     //  Initialize pgdata
     //
-    let pg_version = match get_pg_version(pg_bin_dir.as_str()) {
+    let pgbin = pg_bin_dir.join("postgres");
+    let pg_version = match get_pg_version(pgbin.as_ref()) {
         PostgresMajorVersion::V14 => 14,
         PostgresMajorVersion::V15 => 15,
         PostgresMajorVersion::V16 => 16,
@@ -155,7 +156,7 @@ pub(crate) async fn main() -> anyhow::Result<()> {
     //
     // Launch postgres process
     //
-    let mut postgres_proc = tokio::process::Command::new(pg_bin_dir.join("postgres"))
+    let mut postgres_proc = tokio::process::Command::new(pgbin)
         .arg("-D")
         .arg(&pgdata_dir)
         .args(["-c", "wal_level=minimal"])

From 973a8d2680f968e83e5668e69c87636189146e54 Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <matthias@neon.tech>
Date: Fri, 29 Nov 2024 20:10:26 +0100
Subject: [PATCH 015/117] Fix timeout value used in XLogWaitForReplayOf (#9937)

The previous value assumed usec precision, while the timeout used is in
milliseconds, causing replica backends to wait for (potentially) many
hours for WAL replay without the expected progress reports in logs.

This fixes the issue.

Reported-By: Alexander Lakhin <exclusion@gmail.com>

## Problem


https://github.com/neondatabase/postgres/pull/279#issuecomment-2507671817

The timeout value was configured with the assumption the indicated value
would be microseconds, where it's actually milliseconds. That causes the
backend to wait for much longer (2h46m40s) before it emits the "I'm
waiting for recovery" message. While we do have wait events configured
on this, it's not great to have stuck backends without clear logs, so
this fixes the timeout value in all our PostgreSQL branches.

## PG PRs

* PG14: https://github.com/neondatabase/postgres/pull/542
* PG15: https://github.com/neondatabase/postgres/pull/543
* PG16: https://github.com/neondatabase/postgres/pull/544
* PG17: https://github.com/neondatabase/postgres/pull/545
---
 vendor/postgres-v14   | 2 +-
 vendor/postgres-v15   | 2 +-
 vendor/postgres-v16   | 2 +-
 vendor/postgres-v17   | 2 +-
 vendor/revisions.json | 8 ++++----
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index 284ae56be2..c1989c934d 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit 284ae56be2397fd3eaf20777fa220b2d0ad968f5
+Subproject commit c1989c934d46e04e78b3c496c8a34bcd40ddceeb
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index aed79ee87b..d929b9a8b9 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit aed79ee87b94779cc52ec13e3b74eba6ada93f05
+Subproject commit d929b9a8b9f32f6fe5a0eac3e6e963f0e44e27e6
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index f5cfc6fa89..13e9e35394 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit f5cfc6fa898544050e821ac688adafece1ac3cff
+Subproject commit 13e9e3539419003e79bd9aa29e1bc44f3fd555dd
diff --git a/vendor/postgres-v17 b/vendor/postgres-v17
index 3c15b6565f..faebe5e5af 160000
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
@@ -1 +1 @@
-Subproject commit 3c15b6565f6c8d36d169ed9ea7412cf90cfb2a8f
+Subproject commit faebe5e5aff5687908504453623778f8515529db
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 4dae88e73d..abeddcadf7 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,18 +1,18 @@
 {
   "v17": [
     "17.2",
-    "3c15b6565f6c8d36d169ed9ea7412cf90cfb2a8f"
+    "faebe5e5aff5687908504453623778f8515529db"
   ],
   "v16": [
     "16.6",
-    "f5cfc6fa898544050e821ac688adafece1ac3cff"
+    "13e9e3539419003e79bd9aa29e1bc44f3fd555dd"
   ],
   "v15": [
     "15.10",
-    "aed79ee87b94779cc52ec13e3b74eba6ada93f05"
+    "d929b9a8b9f32f6fe5a0eac3e6e963f0e44e27e6"
   ],
   "v14": [
     "14.15",
-    "284ae56be2397fd3eaf20777fa220b2d0ad968f5"
+    "c1989c934d46e04e78b3c496c8a34bcd40ddceeb"
   ]
 }

From aa4ec11af9c982a4022074f18a05745d91633bca Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Sat, 30 Nov 2024 01:16:24 +0100
Subject: [PATCH 016/117] page_service: rewrite batching to work without a
 timeout (#9851)

# Problem

The timeout-based batching adds latency to unbatchable workloads.

We can choose a short batching timeout (e.g. 10us) but that requires
high-resolution timers, which tokio doesn't have.
I thoroughly explored options to use OS timers (see
[this](https://github.com/neondatabase/neon/pull/9822) abandoned PR).
In short, it's not an attractive option because any timer implementation
adds non-trivial overheads.

# Solution

The insight is that, in the steady state of a batchable workload, the
time we spend in `get_vectored` will be hundreds of microseconds anyway.

If we prepare the next batch concurrently to `get_vectored`, we will
have a sizeable batch ready once `get_vectored` of the current batch is
done and do not need an explicit timeout.

This can be reasonably described as **pipelining of the protocol
handler**.

# Implementation

We model the sub-protocol handler for pagestream requests
(`handle_pagrequests`) as two futures that form a pipeline:

2. Batching: read requests from the connection and fill the current
batch
3. Execution: `take` the current batch, execute it using `get_vectored`,
and send the response.

The Reading and Batching stage are connected through a new type of
channel called `spsc_fold`.

See the long comment in the `handle_pagerequests_pipelined` for details.

# Changes

- Refactor `handle_pagerequests`
    - separate functions for
- reading one protocol message; produces a `BatchedFeMessage` with just
one page request in it
- batching; tried to merge an incoming `BatchedFeMessage` into an
existing `BatchedFeMessage`; returns `None` on success and returns back
the incoming message in case merging isn't possible
        - execution of a batched message
- unify the timeline handle acquisition & request span construction; it
now happen in the function that reads the protocol message
- Implement serial and pipelined model
    - serial: what we had before any of the batching changes
      - read one protocol message
      - execute protocol messages
    - pipelined: the design described above
- optionality for execution of the pipeline: either via concurrent
futures vs tokio tasks
- Pageserver config
  - remove batching timeout field
  - add ability to configure pipelining mode
- add ability to limit max batch size for pipelined configurations
(required for the rollout, cf
https://github.com/neondatabase/cloud/issues/20620 )
  - ability to configure execution mode
- Tests
  - remove `batch_timeout` parametrization
  - rename `test_getpage_merge_smoke` to `test_throughput`
- add parametrization to test different max batch sizes and execution
moes
  - rename `test_timer_precision` to `test_latency`
  - rename the test case file to `test_page_service_batching.py`
  - better descriptions of what the tests actually do

## On the holding The `TimelineHandle` in the pending batch

While batching, we hold the `TimelineHandle` in the pending batch.
Therefore, the timeline will not finish shutting down while we're
batching.

This is not a problem in practice because the concurrently ongoing
`get_vectored` call will fail quickly with an error indicating that the
timeline is shutting down.
This results in the Execution stage returning a `QueryError::Shutdown`,
which causes the pipeline / entire page service connection to shut down.
This drops all references to the
`Arc<Mutex<Option<Box<BatchedFeMessage>>>>` object, thereby dropping the
contained `TimelineHandle`s.

- => fixes https://github.com/neondatabase/neon/issues/9850

# Performance

Local run of the benchmarks, results in [this empty
commit](https://github.com/neondatabase/neon/pull/9851/commits/1cf5b1463f69ba5066cbb0713912aec7bb5579ad)
in the PR branch.

Key take-aways:
* `concurrent-futures` and `tasks` deliver identical `batching_factor`
* tail latency impact unknown, cf
https://github.com/neondatabase/neon/issues/9837
* `concurrent-futures` has higher throughput than `tasks` in all
workloads (=lower `time` metric)
* In unbatchable workloads, `concurrent-futures` has 5% higher
`CPU-per-throughput` than that of `tasks`, and 15% higher than that of
`serial`.
* In batchable-32 workload, `concurrent-futures` has 8% lower
`CPU-per-throughput` than that of `tasks` (comparison to tput of
`serial` is irrelevant)
* in unbatchable workloads, mean and tail latencies of
`concurrent-futures` is practically identical to `serial`, whereas
`tasks` adds 20-30us of overhead

Overall, `concurrent-futures` seems like a slightly more attractive
choice.

# Rollout

This change is disabled-by-default.

Rollout plan:
- https://github.com/neondatabase/cloud/issues/20620

# Refs

- epic: https://github.com/neondatabase/neon/issues/9376
- this sub-task: https://github.com/neondatabase/neon/issues/9377
- the abandoned attempt to improve batching timeout resolution:
https://github.com/neondatabase/neon/pull/9820
- closes https://github.com/neondatabase/neon/issues/9850
- fixes https://github.com/neondatabase/neon/issues/9835
---
 Cargo.lock                                    |   10 +-
 Cargo.toml                                    |    1 +
 libs/pageserver_api/src/config.rs             |   30 +-
 libs/utils/Cargo.toml                         |    2 +
 libs/utils/src/sync.rs                        |    2 +
 libs/utils/src/sync/spsc_fold.rs              |  452 +++++++
 pageserver/src/config.rs                      |   10 +-
 pageserver/src/lib.rs                         |   19 +
 pageserver/src/page_service.rs                | 1059 ++++++++++-------
 test_runner/fixtures/neon_fixtures.py         |    3 +-
 ...merge.py => test_page_service_batching.py} |  131 +-
 11 files changed, 1262 insertions(+), 457 deletions(-)
 create mode 100644 libs/utils/src/sync/spsc_fold.rs
 rename test_runner/performance/pageserver/{test_pageserver_getpage_merge.py => test_page_service_batching.py} (69%)

diff --git a/Cargo.lock b/Cargo.lock
index abe69525c9..313222cf3c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1,6 +1,6 @@
 # This file is automatically @generated by Cargo.
 # It is not intended for manual editing.
-version = 3
+version = 4
 
 [[package]]
 name = "RustyXML"
@@ -1717,6 +1717,12 @@ dependencies = [
  "utils",
 ]
 
+[[package]]
+name = "diatomic-waker"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ab03c107fafeb3ee9f5925686dbb7a73bc76e3932abb0d2b365cb64b169cf04c"
+
 [[package]]
 name = "diesel"
 version = "2.2.3"
@@ -7045,6 +7051,7 @@ dependencies = [
  "chrono",
  "const_format",
  "criterion",
+ "diatomic-waker",
  "fail",
  "futures",
  "git-version",
@@ -7063,6 +7070,7 @@ dependencies = [
  "rand 0.8.5",
  "regex",
  "routerify",
+ "scopeguard",
  "sentry",
  "serde",
  "serde_assert",
diff --git a/Cargo.toml b/Cargo.toml
index 742201d0f5..64c384f17a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -83,6 +83,7 @@ comfy-table = "7.1"
 const_format = "0.2"
 crc32c = "0.6"
 dashmap = { version = "5.5.0", features = ["raw-api"] }
+diatomic-waker = { version = "0.2.3" }
 either = "1.8"
 enum-map = "2.4.2"
 enumset = "1.0.12"
diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index 721d97404b..e49d15ba87 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -118,9 +118,8 @@ pub struct ConfigToml {
     pub virtual_file_io_mode: Option<crate::models::virtual_file::IoMode>,
     #[serde(skip_serializing_if = "Option::is_none")]
     pub no_sync: Option<bool>,
-    #[serde(with = "humantime_serde")]
-    pub server_side_batch_timeout: Option<Duration>,
     pub wal_receiver_protocol: PostgresClientProtocol,
+    pub page_service_pipelining: PageServicePipeliningConfig,
 }
 
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -137,6 +136,28 @@ pub struct DiskUsageEvictionTaskConfig {
     pub eviction_order: EvictionOrder,
 }
 
+#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[serde(tag = "mode", rename_all = "kebab-case")]
+#[serde(deny_unknown_fields)]
+pub enum PageServicePipeliningConfig {
+    Serial,
+    Pipelined(PageServicePipeliningConfigPipelined),
+}
+#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[serde(deny_unknown_fields)]
+pub struct PageServicePipeliningConfigPipelined {
+    /// Causes runtime errors if larger than max get_vectored batch size.
+    pub max_batch_size: NonZeroUsize,
+    pub execution: PageServiceProtocolPipelinedExecutionStrategy,
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[serde(rename_all = "kebab-case")]
+pub enum PageServiceProtocolPipelinedExecutionStrategy {
+    ConcurrentFutures,
+    Tasks,
+}
+
 pub mod statvfs {
     pub mod mock {
         #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -332,8 +353,6 @@ pub mod defaults {
 
     pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512;
 
-    pub const DEFAULT_SERVER_SIDE_BATCH_TIMEOUT: Option<&str> = None;
-
     pub const DEFAULT_WAL_RECEIVER_PROTOCOL: utils::postgres_client::PostgresClientProtocol =
         utils::postgres_client::PostgresClientProtocol::Vanilla;
 }
@@ -420,11 +439,10 @@ impl Default for ConfigToml {
             ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
             l0_flush: None,
             virtual_file_io_mode: None,
-            server_side_batch_timeout: DEFAULT_SERVER_SIDE_BATCH_TIMEOUT
-                .map(|duration| humantime::parse_duration(duration).unwrap()),
             tenant_config: TenantConfigToml::default(),
             no_sync: None,
             wal_receiver_protocol: DEFAULT_WAL_RECEIVER_PROTOCOL,
+            page_service_pipelining: PageServicePipeliningConfig::Serial,
         }
     }
 }
diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml
index f440b81d8f..5648072a83 100644
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -19,6 +19,7 @@ bincode.workspace = true
 bytes.workspace = true
 camino.workspace = true
 chrono.workspace = true
+diatomic-waker.workspace = true
 git-version.workspace = true
 hex = { workspace = true, features = ["serde"] }
 humantime.workspace = true
@@ -45,6 +46,7 @@ tracing.workspace = true
 tracing-error.workspace = true
 tracing-subscriber = { workspace = true, features = ["json", "registry"] }
 rand.workspace = true
+scopeguard.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
 url.workspace = true
diff --git a/libs/utils/src/sync.rs b/libs/utils/src/sync.rs
index 2ee8f35449..7aa26e24bc 100644
--- a/libs/utils/src/sync.rs
+++ b/libs/utils/src/sync.rs
@@ -1,3 +1,5 @@
 pub mod heavier_once_cell;
 
 pub mod gate;
+
+pub mod spsc_fold;
diff --git a/libs/utils/src/sync/spsc_fold.rs b/libs/utils/src/sync/spsc_fold.rs
new file mode 100644
index 0000000000..b44f766ef0
--- /dev/null
+++ b/libs/utils/src/sync/spsc_fold.rs
@@ -0,0 +1,452 @@
+use core::{future::poll_fn, task::Poll};
+use std::sync::{Arc, Mutex};
+
+use diatomic_waker::DiatomicWaker;
+
+pub struct Sender<T> {
+    state: Arc<Inner<T>>,
+}
+
+pub struct Receiver<T> {
+    state: Arc<Inner<T>>,
+}
+
+struct Inner<T> {
+    wake_receiver: DiatomicWaker,
+    wake_sender: DiatomicWaker,
+    value: Mutex<State<T>>,
+}
+
+enum State<T> {
+    NoData,
+    HasData(T),
+    TryFoldFailed, // transient state
+    SenderWaitsForReceiverToConsume(T),
+    SenderGone(Option<T>),
+    ReceiverGone,
+    AllGone,
+    SenderDropping,   // transient state
+    ReceiverDropping, // transient state
+}
+
+pub fn channel<T: Send>() -> (Sender<T>, Receiver<T>) {
+    let inner = Inner {
+        wake_receiver: DiatomicWaker::new(),
+        wake_sender: DiatomicWaker::new(),
+        value: Mutex::new(State::NoData),
+    };
+
+    let state = Arc::new(inner);
+    (
+        Sender {
+            state: state.clone(),
+        },
+        Receiver { state },
+    )
+}
+
+#[derive(Debug, thiserror::Error)]
+pub enum SendError {
+    #[error("receiver is gone")]
+    ReceiverGone,
+}
+
+impl<T: Send> Sender<T> {
+    /// # Panics
+    ///
+    /// If `try_fold` panics,  any subsequent call to `send` panic.
+    pub async fn send<F>(&mut self, value: T, try_fold: F) -> Result<(), SendError>
+    where
+        F: Fn(&mut T, T) -> Result<(), T>,
+    {
+        let mut value = Some(value);
+        poll_fn(|cx| {
+            let mut guard = self.state.value.lock().unwrap();
+            match &mut *guard {
+                State::NoData => {
+                    *guard = State::HasData(value.take().unwrap());
+                    self.state.wake_receiver.notify();
+                    Poll::Ready(Ok(()))
+                }
+                State::HasData(_) => {
+                    let State::HasData(acc_mut) = &mut *guard else {
+                        unreachable!("this match arm guarantees that the guard is HasData");
+                    };
+                    match try_fold(acc_mut, value.take().unwrap()) {
+                        Ok(()) => {
+                            // no need to wake receiver, if it was waiting it already
+                            // got a wake-up when we transitioned from NoData to HasData
+                            Poll::Ready(Ok(()))
+                        }
+                        Err(unfoldable_value) => {
+                            value = Some(unfoldable_value);
+                            let State::HasData(acc) =
+                                std::mem::replace(&mut *guard, State::TryFoldFailed)
+                            else {
+                                unreachable!("this match arm guarantees that the guard is HasData");
+                            };
+                            *guard = State::SenderWaitsForReceiverToConsume(acc);
+                            // SAFETY: send is single threaded due to `&mut self` requirement,
+                            // therefore register is not concurrent.
+                            unsafe {
+                                self.state.wake_sender.register(cx.waker());
+                            }
+                            Poll::Pending
+                        }
+                    }
+                }
+                State::SenderWaitsForReceiverToConsume(_data) => {
+                    // Really, we shouldn't be polled until receiver has consumed and wakes us.
+                    Poll::Pending
+                }
+                State::ReceiverGone => Poll::Ready(Err(SendError::ReceiverGone)),
+                State::SenderGone(_)
+                | State::AllGone
+                | State::SenderDropping
+                | State::ReceiverDropping
+                | State::TryFoldFailed => {
+                    unreachable!();
+                }
+            }
+        })
+        .await
+    }
+}
+
+impl<T> Drop for Sender<T> {
+    fn drop(&mut self) {
+        scopeguard::defer! {
+            self.state.wake_receiver.notify()
+        };
+        let Ok(mut guard) = self.state.value.lock() else {
+            return;
+        };
+        *guard = match std::mem::replace(&mut *guard, State::SenderDropping) {
+            State::NoData => State::SenderGone(None),
+            State::HasData(data) | State::SenderWaitsForReceiverToConsume(data) => {
+                State::SenderGone(Some(data))
+            }
+            State::ReceiverGone => State::AllGone,
+            State::TryFoldFailed
+            | State::SenderGone(_)
+            | State::AllGone
+            | State::SenderDropping
+            | State::ReceiverDropping => {
+                unreachable!("unreachable state {:?}", guard.discriminant_str())
+            }
+        }
+    }
+}
+
+#[derive(Debug, thiserror::Error)]
+pub enum RecvError {
+    #[error("sender is gone")]
+    SenderGone,
+}
+
+impl<T: Send> Receiver<T> {
+    pub async fn recv(&mut self) -> Result<T, RecvError> {
+        poll_fn(|cx| {
+            let mut guard = self.state.value.lock().unwrap();
+            match &mut *guard {
+                State::NoData => {
+                    // SAFETY: recv is single threaded due to `&mut self` requirement,
+                    // therefore register is not concurrent.
+                    unsafe {
+                        self.state.wake_receiver.register(cx.waker());
+                    }
+                    Poll::Pending
+                }
+                guard @ State::HasData(_)
+                | guard @ State::SenderWaitsForReceiverToConsume(_)
+                | guard @ State::SenderGone(Some(_)) => {
+                    let data = guard
+                        .take_data()
+                        .expect("in these states, data is guaranteed to be present");
+                    self.state.wake_sender.notify();
+                    Poll::Ready(Ok(data))
+                }
+                State::SenderGone(None) => Poll::Ready(Err(RecvError::SenderGone)),
+                State::ReceiverGone
+                | State::AllGone
+                | State::SenderDropping
+                | State::ReceiverDropping
+                | State::TryFoldFailed => {
+                    unreachable!("unreachable state {:?}", guard.discriminant_str());
+                }
+            }
+        })
+        .await
+    }
+}
+
+impl<T> Drop for Receiver<T> {
+    fn drop(&mut self) {
+        scopeguard::defer! {
+            self.state.wake_sender.notify()
+        };
+        let Ok(mut guard) = self.state.value.lock() else {
+            return;
+        };
+        *guard = match std::mem::replace(&mut *guard, State::ReceiverDropping) {
+            State::NoData => State::ReceiverGone,
+            State::HasData(_) | State::SenderWaitsForReceiverToConsume(_) => State::ReceiverGone,
+            State::SenderGone(_) => State::AllGone,
+            State::TryFoldFailed
+            | State::ReceiverGone
+            | State::AllGone
+            | State::SenderDropping
+            | State::ReceiverDropping => {
+                unreachable!("unreachable state {:?}", guard.discriminant_str())
+            }
+        }
+    }
+}
+
+impl<T> State<T> {
+    fn take_data(&mut self) -> Option<T> {
+        match self {
+            State::HasData(_) => {
+                let State::HasData(data) = std::mem::replace(self, State::NoData) else {
+                    unreachable!("this match arm guarantees that the state is HasData");
+                };
+                Some(data)
+            }
+            State::SenderWaitsForReceiverToConsume(_) => {
+                let State::SenderWaitsForReceiverToConsume(data) =
+                    std::mem::replace(self, State::NoData)
+                else {
+                    unreachable!(
+                        "this match arm guarantees that the state is SenderWaitsForReceiverToConsume"
+                    );
+                };
+                Some(data)
+            }
+            State::SenderGone(data) => Some(data.take().unwrap()),
+            State::NoData
+            | State::TryFoldFailed
+            | State::ReceiverGone
+            | State::AllGone
+            | State::SenderDropping
+            | State::ReceiverDropping => None,
+        }
+    }
+    fn discriminant_str(&self) -> &'static str {
+        match self {
+            State::NoData => "NoData",
+            State::HasData(_) => "HasData",
+            State::TryFoldFailed => "TryFoldFailed",
+            State::SenderWaitsForReceiverToConsume(_) => "SenderWaitsForReceiverToConsume",
+            State::SenderGone(_) => "SenderGone",
+            State::ReceiverGone => "ReceiverGone",
+            State::AllGone => "AllGone",
+            State::SenderDropping => "SenderDropping",
+            State::ReceiverDropping => "ReceiverDropping",
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+
+    use super::*;
+
+    const FOREVER: std::time::Duration = std::time::Duration::from_secs(u64::MAX);
+
+    #[tokio::test]
+    async fn test_send_recv() {
+        let (mut sender, mut receiver) = channel();
+
+        sender
+            .send(42, |acc, val| {
+                *acc += val;
+                Ok(())
+            })
+            .await
+            .unwrap();
+
+        let received = receiver.recv().await.unwrap();
+        assert_eq!(received, 42);
+    }
+
+    #[tokio::test]
+    async fn test_send_recv_with_fold() {
+        let (mut sender, mut receiver) = channel();
+
+        sender
+            .send(1, |acc, val| {
+                *acc += val;
+                Ok(())
+            })
+            .await
+            .unwrap();
+        sender
+            .send(2, |acc, val| {
+                *acc += val;
+                Ok(())
+            })
+            .await
+            .unwrap();
+
+        let received = receiver.recv().await.unwrap();
+        assert_eq!(received, 3);
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn test_sender_waits_for_receiver_if_try_fold_fails() {
+        let (mut sender, mut receiver) = channel();
+
+        sender.send(23, |_, _| panic!("first send")).await.unwrap();
+
+        let send_fut = sender.send(42, |_, val| Err(val));
+        let mut send_fut = std::pin::pin!(send_fut);
+
+        tokio::select! {
+            _ = tokio::time::sleep(FOREVER) => {},
+            _ = &mut send_fut => {
+                panic!("send should not complete");
+            },
+        }
+
+        let val = receiver.recv().await.unwrap();
+        assert_eq!(val, 23);
+
+        tokio::select! {
+            _ = tokio::time::sleep(FOREVER) => {
+                panic!("receiver should have consumed the value");
+            },
+            _ = &mut send_fut => { },
+        }
+
+        let val = receiver.recv().await.unwrap();
+        assert_eq!(val, 42);
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn test_sender_errors_if_waits_for_receiver_and_receiver_drops() {
+        let (mut sender, receiver) = channel();
+
+        sender.send(23, |_, _| unreachable!()).await.unwrap();
+
+        let send_fut = sender.send(42, |_, val| Err(val));
+        let send_fut = std::pin::pin!(send_fut);
+
+        drop(receiver);
+
+        let result = send_fut.await;
+        assert!(matches!(result, Err(SendError::ReceiverGone)));
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn test_receiver_errors_if_waits_for_sender_and_sender_drops() {
+        let (sender, mut receiver) = channel::<()>();
+
+        let recv_fut = receiver.recv();
+        let recv_fut = std::pin::pin!(recv_fut);
+
+        drop(sender);
+
+        let result = recv_fut.await;
+        assert!(matches!(result, Err(RecvError::SenderGone)));
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn test_receiver_errors_if_waits_for_sender_and_sender_drops_with_data() {
+        let (mut sender, mut receiver) = channel();
+
+        sender.send(42, |_, _| unreachable!()).await.unwrap();
+
+        {
+            let recv_fut = receiver.recv();
+            let recv_fut = std::pin::pin!(recv_fut);
+
+            drop(sender);
+
+            let val = recv_fut.await.unwrap();
+            assert_eq!(val, 42);
+        }
+
+        let result = receiver.recv().await;
+        assert!(matches!(result, Err(RecvError::SenderGone)));
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn test_receiver_waits_for_sender_if_no_data() {
+        let (mut sender, mut receiver) = channel();
+
+        let recv_fut = receiver.recv();
+        let mut recv_fut = std::pin::pin!(recv_fut);
+
+        tokio::select! {
+            _ = tokio::time::sleep(FOREVER) => {},
+            _ = &mut recv_fut => {
+                panic!("recv should not complete");
+            },
+        }
+
+        sender.send(42, |_, _| Ok(())).await.unwrap();
+
+        let val = recv_fut.await.unwrap();
+        assert_eq!(val, 42);
+    }
+
+    #[tokio::test]
+    async fn test_receiver_gone_while_nodata() {
+        let (mut sender, receiver) = channel();
+        drop(receiver);
+
+        let result = sender.send(42, |_, _| Ok(())).await;
+        assert!(matches!(result, Err(SendError::ReceiverGone)));
+    }
+
+    #[tokio::test]
+    async fn test_sender_gone_while_nodata() {
+        let (sender, mut receiver) = super::channel::<usize>();
+        drop(sender);
+
+        let result = receiver.recv().await;
+        assert!(matches!(result, Err(RecvError::SenderGone)));
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn test_receiver_drops_after_sender_went_to_sleep() {
+        let (mut sender, receiver) = channel();
+        let state = receiver.state.clone();
+
+        sender.send(23, |_, _| unreachable!()).await.unwrap();
+
+        let send_task = tokio::spawn(async move { sender.send(42, |_, v| Err(v)).await });
+
+        tokio::time::sleep(FOREVER).await;
+
+        assert!(matches!(
+            &*state.value.lock().unwrap(),
+            &State::SenderWaitsForReceiverToConsume(_)
+        ));
+
+        drop(receiver);
+
+        let err = send_task
+            .await
+            .unwrap()
+            .expect_err("should unblock immediately");
+        assert!(matches!(err, SendError::ReceiverGone));
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn test_sender_drops_after_receiver_went_to_sleep() {
+        let (sender, mut receiver) = channel::<usize>();
+        let state = sender.state.clone();
+
+        let recv_task = tokio::spawn(async move { receiver.recv().await });
+
+        tokio::time::sleep(FOREVER).await;
+
+        assert!(matches!(&*state.value.lock().unwrap(), &State::NoData));
+
+        drop(sender);
+
+        let err = recv_task.await.unwrap().expect_err("should error");
+        assert!(matches!(err, RecvError::SenderGone));
+    }
+}
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 2cf237e72b..1651db8500 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -188,11 +188,9 @@ pub struct PageServerConf {
     /// Optionally disable disk syncs (unsafe!)
     pub no_sync: bool,
 
-    /// Maximum amount of time for which a get page request request
-    /// might be held up for request merging.
-    pub server_side_batch_timeout: Option<Duration>,
-
     pub wal_receiver_protocol: PostgresClientProtocol,
+
+    pub page_service_pipelining: pageserver_api::config::PageServicePipeliningConfig,
 }
 
 /// Token for authentication to safekeepers
@@ -350,10 +348,10 @@ impl PageServerConf {
             concurrent_tenant_warmup,
             concurrent_tenant_size_logical_size_queries,
             virtual_file_io_engine,
-            server_side_batch_timeout,
             tenant_config,
             no_sync,
             wal_receiver_protocol,
+            page_service_pipelining,
         } = config_toml;
 
         let mut conf = PageServerConf {
@@ -393,11 +391,11 @@ impl PageServerConf {
             image_compression,
             timeline_offloading,
             ephemeral_bytes_per_memory_kb,
-            server_side_batch_timeout,
             import_pgdata_upcall_api,
             import_pgdata_upcall_api_token: import_pgdata_upcall_api_token.map(SecretString::from),
             import_pgdata_aws_endpoint_url,
             wal_receiver_protocol,
+            page_service_pipelining,
 
             // ------------------------------------------------------------
             // fields that require additional validation or custom handling
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index ef6711397a..ff6af3566c 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -356,6 +356,25 @@ async fn timed<Fut: std::future::Future>(
     }
 }
 
+/// Like [`timed`], but the warning timeout only starts after `cancel` has been cancelled.
+async fn timed_after_cancellation<Fut: std::future::Future>(
+    fut: Fut,
+    name: &str,
+    warn_at: std::time::Duration,
+    cancel: &CancellationToken,
+) -> <Fut as std::future::Future>::Output {
+    let mut fut = std::pin::pin!(fut);
+
+    tokio::select! {
+        _ = cancel.cancelled() => {
+            timed(fut, name, warn_at).await
+        }
+        ret = &mut fut => {
+            ret
+        }
+    }
+}
+
 #[cfg(test)]
 mod timed_tests {
     use super::timed;
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 5fd02d8749..1917e7f5b7 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -7,6 +7,10 @@ use bytes::Buf;
 use futures::FutureExt;
 use itertools::Itertools;
 use once_cell::sync::OnceCell;
+use pageserver_api::config::{
+    PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
+    PageServiceProtocolPipelinedExecutionStrategy,
+};
 use pageserver_api::models::{self, TenantState};
 use pageserver_api::models::{
     PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
@@ -16,12 +20,15 @@ use pageserver_api::models::{
     PagestreamProtocolVersion,
 };
 use pageserver_api::shard::TenantShardId;
-use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError};
+use postgres_backend::{
+    is_expected_io_error, AuthType, PostgresBackend, PostgresBackendReader, QueryError,
+};
 use pq_proto::framed::ConnectionError;
 use pq_proto::FeStartupPacket;
 use pq_proto::{BeMessage, FeMessage, RowDescriptor};
 use std::borrow::Cow;
 use std::io;
+use std::num::NonZeroUsize;
 use std::str;
 use std::str::FromStr;
 use std::sync::Arc;
@@ -32,6 +39,7 @@ use tokio::io::{AsyncWriteExt, BufWriter};
 use tokio::task::JoinHandle;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
+use utils::sync::spsc_fold;
 use utils::{
     auth::{Claims, Scope, SwappableJwtAuth},
     id::{TenantId, TimelineId},
@@ -40,7 +48,6 @@ use utils::{
 };
 
 use crate::auth::check_permission;
-use crate::basebackup;
 use crate::basebackup::BasebackupError;
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
@@ -58,6 +65,7 @@ use crate::tenant::timeline::{self, WaitLsnError};
 use crate::tenant::GetTimelineError;
 use crate::tenant::PageReconstructError;
 use crate::tenant::Timeline;
+use crate::{basebackup, timed_after_cancellation};
 use pageserver_api::key::rel_block_to_key;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
@@ -105,7 +113,7 @@ pub fn spawn(
             pg_auth,
             tcp_listener,
             conf.pg_auth_type,
-            conf.server_side_batch_timeout,
+            conf.page_service_pipelining.clone(),
             libpq_ctx,
             cancel.clone(),
         )
@@ -154,7 +162,7 @@ pub async fn libpq_listener_main(
     auth: Option<Arc<SwappableJwtAuth>>,
     listener: tokio::net::TcpListener,
     auth_type: AuthType,
-    server_side_batch_timeout: Option<Duration>,
+    pipelining_config: PageServicePipeliningConfig,
     listener_ctx: RequestContext,
     listener_cancel: CancellationToken,
 ) -> Connections {
@@ -185,7 +193,7 @@ pub async fn libpq_listener_main(
                     local_auth,
                     socket,
                     auth_type,
-                    server_side_batch_timeout,
+                    pipelining_config.clone(),
                     connection_ctx,
                     connections_cancel.child_token(),
                 ));
@@ -213,7 +221,7 @@ async fn page_service_conn_main(
     auth: Option<Arc<SwappableJwtAuth>>,
     socket: tokio::net::TcpStream,
     auth_type: AuthType,
-    server_side_batch_timeout: Option<Duration>,
+    pipelining_config: PageServicePipeliningConfig,
     connection_ctx: RequestContext,
     cancel: CancellationToken,
 ) -> ConnectionHandlerResult {
@@ -256,7 +264,7 @@ async fn page_service_conn_main(
     // a while: we will tear down this PageServerHandler and instantiate a new one if/when
     // they reconnect.
     socket.set_timeout(Some(std::time::Duration::from_millis(socket_timeout_ms)));
-    let socket = std::pin::pin!(socket);
+    let socket = Box::pin(socket);
 
     fail::fail_point!("ps::connection-start::pre-login");
 
@@ -267,7 +275,7 @@ async fn page_service_conn_main(
     let mut conn_handler = PageServerHandler::new(
         tenant_manager,
         auth,
-        server_side_batch_timeout,
+        pipelining_config,
         connection_ctx,
         cancel.clone(),
     );
@@ -283,7 +291,7 @@ async fn page_service_conn_main(
                 info!("Postgres client disconnected ({io_error})");
                 Ok(())
             } else {
-                let tenant_id = conn_handler.timeline_handles.tenant_id();
+                let tenant_id = conn_handler.timeline_handles.as_ref().unwrap().tenant_id();
                 Err(io_error).context(format!(
                     "Postgres connection error for tenant_id={:?} client at peer_addr={}",
                     tenant_id, peer_addr
@@ -291,7 +299,7 @@ async fn page_service_conn_main(
             }
         }
         other => {
-            let tenant_id = conn_handler.timeline_handles.tenant_id();
+            let tenant_id = conn_handler.timeline_handles.as_ref().unwrap().tenant_id();
             other.context(format!(
                 "Postgres query error for tenant_id={:?} client peer_addr={}",
                 tenant_id, peer_addr
@@ -312,13 +320,10 @@ struct PageServerHandler {
 
     cancel: CancellationToken,
 
-    timeline_handles: TimelineHandles,
+    /// None only while pagestream protocol is being processed.
+    timeline_handles: Option<TimelineHandles>,
 
-    /// Messages queued up for the next processing batch
-    next_batch: Option<BatchedFeMessage>,
-
-    /// See [`PageServerConf::server_side_batch_timeout`]
-    server_side_batch_timeout: Option<Duration>,
+    pipelining_config: PageServicePipeliningConfig,
 }
 
 struct TimelineHandles {
@@ -535,10 +540,12 @@ impl From<WaitLsnError> for QueryError {
 enum BatchedFeMessage {
     Exists {
         span: Span,
+        shard: timeline::handle::Handle<TenantManagerTypes>,
         req: models::PagestreamExistsRequest,
     },
     Nblocks {
         span: Span,
+        shard: timeline::handle::Handle<TenantManagerTypes>,
         req: models::PagestreamNblocksRequest,
     },
     GetPage {
@@ -549,10 +556,12 @@ enum BatchedFeMessage {
     },
     DbSize {
         span: Span,
+        shard: timeline::handle::Handle<TenantManagerTypes>,
         req: models::PagestreamDbSizeRequest,
     },
     GetSlruSegment {
         span: Span,
+        shard: timeline::handle::Handle<TenantManagerTypes>,
         req: models::PagestreamGetSlruSegmentRequest,
     },
     RespondError {
@@ -561,18 +570,11 @@ enum BatchedFeMessage {
     },
 }
 
-enum BatchOrEof {
-    /// In the common case, this has one entry.
-    /// At most, it has two entries: the first is the leftover batch, the second is an error.
-    Batch(smallvec::SmallVec<[BatchedFeMessage; 1]>),
-    Eof,
-}
-
 impl PageServerHandler {
     pub fn new(
         tenant_manager: Arc<TenantManager>,
         auth: Option<Arc<SwappableJwtAuth>>,
-        server_side_batch_timeout: Option<Duration>,
+        pipelining_config: PageServicePipeliningConfig,
         connection_ctx: RequestContext,
         cancel: CancellationToken,
     ) -> Self {
@@ -580,10 +582,9 @@ impl PageServerHandler {
             auth,
             claims: None,
             connection_ctx,
-            timeline_handles: TimelineHandles::new(tenant_manager),
+            timeline_handles: Some(TimelineHandles::new(tenant_manager)),
             cancel,
-            next_batch: None,
-            server_side_batch_timeout,
+            pipelining_config,
         }
     }
 
@@ -611,219 +612,356 @@ impl PageServerHandler {
         )
     }
 
-    async fn read_batch_from_connection<IO>(
-        &mut self,
-        pgb: &mut PostgresBackend<IO>,
-        tenant_id: &TenantId,
-        timeline_id: &TimelineId,
+    async fn pagestream_read_message<IO>(
+        pgb: &mut PostgresBackendReader<IO>,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        timeline_handles: &mut TimelineHandles,
+        cancel: &CancellationToken,
         ctx: &RequestContext,
-    ) -> Result<Option<BatchOrEof>, QueryError>
+        parent_span: Span,
+    ) -> Result<Option<BatchedFeMessage>, QueryError>
+    where
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static,
+    {
+        let msg = tokio::select! {
+            biased;
+            _ = cancel.cancelled() => {
+                return Err(QueryError::Shutdown)
+            }
+            msg = pgb.read_message() => { msg }
+        };
+
+        let copy_data_bytes = match msg? {
+            Some(FeMessage::CopyData(bytes)) => bytes,
+            Some(FeMessage::Terminate) => {
+                return Ok(None);
+            }
+            Some(m) => {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "unexpected message: {m:?} during COPY"
+                )));
+            }
+            None => {
+                return Ok(None);
+            } // client disconnected
+        };
+        trace!("query: {copy_data_bytes:?}");
+
+        fail::fail_point!("ps::handle-pagerequest-message");
+
+        // parse request
+        let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;
+
+        let batched_msg = match neon_fe_msg {
+            PagestreamFeMessage::Exists(req) => {
+                let span = tracing::info_span!(parent: parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn);
+                let shard = timeline_handles
+                    .get(tenant_id, timeline_id, ShardSelector::Zero)
+                    .instrument(span.clone()) // sets `shard_id` field
+                    .await?;
+                BatchedFeMessage::Exists { span, shard, req }
+            }
+            PagestreamFeMessage::Nblocks(req) => {
+                let span = tracing::info_span!(parent: parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn);
+                let shard = timeline_handles
+                    .get(tenant_id, timeline_id, ShardSelector::Zero)
+                    .instrument(span.clone()) // sets `shard_id` field
+                    .await?;
+                BatchedFeMessage::Nblocks { span, shard, req }
+            }
+            PagestreamFeMessage::DbSize(req) => {
+                let span = tracing::info_span!(parent: parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn);
+                let shard = timeline_handles
+                    .get(tenant_id, timeline_id, ShardSelector::Zero)
+                    .instrument(span.clone()) // sets `shard_id` field
+                    .await?;
+                BatchedFeMessage::DbSize { span, shard, req }
+            }
+            PagestreamFeMessage::GetSlruSegment(req) => {
+                let span = tracing::info_span!(parent: parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn);
+                let shard = timeline_handles
+                    .get(tenant_id, timeline_id, ShardSelector::Zero)
+                    .instrument(span.clone()) // sets `shard_id` field
+                    .await?;
+                BatchedFeMessage::GetSlruSegment { span, shard, req }
+            }
+            PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
+                request_lsn,
+                not_modified_since,
+                rel,
+                blkno,
+            }) => {
+                let span = tracing::info_span!(parent: parent_span, "handle_get_page_at_lsn_request_batched", req_lsn = %request_lsn);
+
+                macro_rules! respond_error {
+                    ($error:expr) => {{
+                        let error = BatchedFeMessage::RespondError {
+                            span,
+                            error: $error,
+                        };
+                        Ok(Some(error))
+                    }};
+                }
+
+                let key = rel_block_to_key(rel, blkno);
+                let shard = match timeline_handles
+                    .get(tenant_id, timeline_id, ShardSelector::Page(key))
+                    .instrument(span.clone()) // sets `shard_id` field
+                    .await
+                {
+                    Ok(tl) => tl,
+                    Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
+                        // We already know this tenant exists in general, because we resolved it at
+                        // start of connection.  Getting a NotFound here indicates that the shard containing
+                        // the requested page is not present on this node: the client's knowledge of shard->pageserver
+                        // mapping is out of date.
+                        //
+                        // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via
+                        // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
+                        // and talk to a different pageserver.
+                        return respond_error!(PageStreamError::Reconnect(
+                            "getpage@lsn request routed to wrong shard".into()
+                        ));
+                    }
+                    Err(e) => {
+                        return respond_error!(e.into());
+                    }
+                };
+                let effective_request_lsn = match Self::wait_or_get_last_lsn(
+                    &shard,
+                    request_lsn,
+                    not_modified_since,
+                    &shard.get_latest_gc_cutoff_lsn(),
+                    ctx,
+                )
+                // TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait
+                .await
+                {
+                    Ok(lsn) => lsn,
+                    Err(e) => {
+                        return respond_error!(e);
+                    }
+                };
+                BatchedFeMessage::GetPage {
+                    span,
+                    shard,
+                    effective_request_lsn,
+                    pages: smallvec::smallvec![(rel, blkno)],
+                }
+            }
+        };
+        Ok(Some(batched_msg))
+    }
+
+    /// Post-condition: `batch` is Some()
+    #[instrument(skip_all, level = tracing::Level::TRACE)]
+    #[allow(clippy::boxed_local)]
+    fn pagestream_do_batch(
+        max_batch_size: NonZeroUsize,
+        batch: &mut Result<BatchedFeMessage, QueryError>,
+        this_msg: Result<BatchedFeMessage, QueryError>,
+    ) -> Result<(), Result<BatchedFeMessage, QueryError>> {
+        debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
+
+        let this_msg = match this_msg {
+            Ok(this_msg) => this_msg,
+            Err(e) => return Err(Err(e)),
+        };
+
+        match (&mut *batch, this_msg) {
+            // something batched already, let's see if we can add this message to the batch
+            (
+                Ok(BatchedFeMessage::GetPage {
+                    span: _,
+                    shard: accum_shard,
+                    pages: ref mut accum_pages,
+                    effective_request_lsn: accum_lsn,
+                }),
+                BatchedFeMessage::GetPage {
+                    span: _,
+                    shard: this_shard,
+                    pages: this_pages,
+                    effective_request_lsn: this_lsn,
+                },
+            ) if (|| {
+                assert_eq!(this_pages.len(), 1);
+                if accum_pages.len() >= max_batch_size.get() {
+                    trace!(%accum_lsn, %this_lsn, %max_batch_size, "stopping batching because of batch size");
+                    assert_eq!(accum_pages.len(), max_batch_size.get());
+                    return false;
+                }
+                if (accum_shard.tenant_shard_id, accum_shard.timeline_id)
+                    != (this_shard.tenant_shard_id, this_shard.timeline_id)
+                {
+                    trace!(%accum_lsn, %this_lsn, "stopping batching because timeline object mismatch");
+                    // TODO: we _could_ batch & execute each shard seperately (and in parallel).
+                    // But the current logic for keeping responses in order does not support that.
+                    return false;
+                }
+                // the vectored get currently only supports a single LSN, so, bounce as soon
+                // as the effective request_lsn changes
+                if *accum_lsn != this_lsn {
+                    trace!(%accum_lsn, %this_lsn, "stopping batching because LSN changed");
+                    return false;
+                }
+                true
+            })() =>
+            {
+                // ok to batch
+                accum_pages.extend(this_pages);
+                Ok(())
+            }
+            // something batched already but this message is unbatchable
+            (_, this_msg) => {
+                // by default, don't continue batching
+                Err(Ok(this_msg))
+            }
+        }
+    }
+
+    #[instrument(level = tracing::Level::DEBUG, skip_all)]
+    async fn pagesteam_handle_batched_message<IO>(
+        &mut self,
+        pgb_writer: &mut PostgresBackend<IO>,
+        batch: BatchedFeMessage,
+        cancel: &CancellationToken,
+        ctx: &RequestContext,
+    ) -> Result<(), QueryError>
     where
         IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
     {
-        let mut batch = self.next_batch.take();
-        let mut batch_started_at: Option<std::time::Instant> = None;
-
-        let next_batch: Option<BatchedFeMessage> = loop {
-            let sleep_fut = match (self.server_side_batch_timeout, batch_started_at) {
-                (Some(batch_timeout), Some(started_at)) => futures::future::Either::Left(
-                    tokio::time::sleep_until((started_at + batch_timeout).into()),
-                ),
-                _ => futures::future::Either::Right(futures::future::pending()),
-            };
-
-            let msg = tokio::select! {
-                biased;
-                _ = self.cancel.cancelled() => {
-                    return Err(QueryError::Shutdown)
-                }
-                msg = pgb.read_message() => {
-                    msg
-                }
-                _ = sleep_fut => {
-                    assert!(batch.is_some());
-                    break None;
-                }
-            };
-            let copy_data_bytes = match msg? {
-                Some(FeMessage::CopyData(bytes)) => bytes,
-                Some(FeMessage::Terminate) => {
-                    return Ok(Some(BatchOrEof::Eof));
-                }
-                Some(m) => {
-                    return Err(QueryError::Other(anyhow::anyhow!(
-                        "unexpected message: {m:?} during COPY"
-                    )));
-                }
-                None => {
-                    return Ok(Some(BatchOrEof::Eof));
-                } // client disconnected
-            };
-            trace!("query: {copy_data_bytes:?}");
-            fail::fail_point!("ps::handle-pagerequest-message");
-
-            // parse request
-            let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;
-
-            let this_msg = match neon_fe_msg {
-                PagestreamFeMessage::Exists(req) => BatchedFeMessage::Exists {
-                    span: tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn),
-                    req,
-                },
-                PagestreamFeMessage::Nblocks(req) => BatchedFeMessage::Nblocks {
-                    span: tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn),
-                    req,
-                },
-                PagestreamFeMessage::DbSize(req) => BatchedFeMessage::DbSize {
-                    span: tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn),
-                    req,
-                },
-                PagestreamFeMessage::GetSlruSegment(req) => BatchedFeMessage::GetSlruSegment {
-                    span: tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn),
-                    req,
-                },
-                PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
-                    request_lsn,
-                    not_modified_since,
-                    rel,
-                    blkno,
-                }) => {
-                    // shard_id is filled in by the handler
-                    let span = tracing::info_span!(
-                        "handle_get_page_at_lsn_request_batched",
-                        %tenant_id, %timeline_id, shard_id = tracing::field::Empty, req_lsn = %request_lsn,
-                        batch_size = tracing::field::Empty, batch_id = tracing::field::Empty
-                    );
-
-                    macro_rules! current_batch_and_error {
-                        ($error:expr) => {{
-                            let error = BatchedFeMessage::RespondError {
-                                span,
-                                error: $error,
-                            };
-                            let batch_and_error = match batch {
-                                Some(b) => smallvec::smallvec![b, error],
-                                None => smallvec::smallvec![error],
-                            };
-                            Ok(Some(BatchOrEof::Batch(batch_and_error)))
-                        }};
-                    }
-
-                    let key = rel_block_to_key(rel, blkno);
-                    let shard = match self
-                        .timeline_handles
-                        .get(*tenant_id, *timeline_id, ShardSelector::Page(key))
-                        .instrument(span.clone())
-                        .await
-                    {
-                        Ok(tl) => tl,
-                        Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
-                            // We already know this tenant exists in general, because we resolved it at
-                            // start of connection.  Getting a NotFound here indicates that the shard containing
-                            // the requested page is not present on this node: the client's knowledge of shard->pageserver
-                            // mapping is out of date.
-                            //
-                            // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via
-                            // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
-                            // and talk to a different pageserver.
-                            return current_batch_and_error!(PageStreamError::Reconnect(
-                                "getpage@lsn request routed to wrong shard".into()
-                            ));
-                        }
-                        Err(e) => {
-                            return current_batch_and_error!(e.into());
-                        }
-                    };
-                    let effective_request_lsn = match Self::wait_or_get_last_lsn(
-                        &shard,
-                        request_lsn,
-                        not_modified_since,
-                        &shard.get_latest_gc_cutoff_lsn(),
-                        ctx,
-                    )
-                    // TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait
-                    .await
-                    {
-                        Ok(lsn) => lsn,
-                        Err(e) => {
-                            return current_batch_and_error!(e);
-                        }
-                    };
-                    BatchedFeMessage::GetPage {
+        // invoke handler function
+        let (handler_results, span): (Vec<Result<PagestreamBeMessage, PageStreamError>>, _) =
+            match batch {
+                BatchedFeMessage::Exists { span, shard, req } => {
+                    fail::fail_point!("ps::handle-pagerequest-message::exists");
+                    (
+                        vec![
+                            self.handle_get_rel_exists_request(&shard, &req, ctx)
+                                .instrument(span.clone())
+                                .await,
+                        ],
                         span,
-                        shard,
-                        effective_request_lsn,
-                        pages: smallvec::smallvec![(rel, blkno)],
-                    }
+                    )
+                }
+                BatchedFeMessage::Nblocks { span, shard, req } => {
+                    fail::fail_point!("ps::handle-pagerequest-message::nblocks");
+                    (
+                        vec![
+                            self.handle_get_nblocks_request(&shard, &req, ctx)
+                                .instrument(span.clone())
+                                .await,
+                        ],
+                        span,
+                    )
+                }
+                BatchedFeMessage::GetPage {
+                    span,
+                    shard,
+                    effective_request_lsn,
+                    pages,
+                } => {
+                    fail::fail_point!("ps::handle-pagerequest-message::getpage");
+                    (
+                        {
+                            let npages = pages.len();
+                            trace!(npages, "handling getpage request");
+                            let res = self
+                                .handle_get_page_at_lsn_request_batched(
+                                    &shard,
+                                    effective_request_lsn,
+                                    pages,
+                                    ctx,
+                                )
+                                .instrument(span.clone())
+                                .await;
+                            assert_eq!(res.len(), npages);
+                            res
+                        },
+                        span,
+                    )
+                }
+                BatchedFeMessage::DbSize { span, shard, req } => {
+                    fail::fail_point!("ps::handle-pagerequest-message::dbsize");
+                    (
+                        vec![
+                            self.handle_db_size_request(&shard, &req, ctx)
+                                .instrument(span.clone())
+                                .await,
+                        ],
+                        span,
+                    )
+                }
+                BatchedFeMessage::GetSlruSegment { span, shard, req } => {
+                    fail::fail_point!("ps::handle-pagerequest-message::slrusegment");
+                    (
+                        vec![
+                            self.handle_get_slru_segment_request(&shard, &req, ctx)
+                                .instrument(span.clone())
+                                .await,
+                        ],
+                        span,
+                    )
+                }
+                BatchedFeMessage::RespondError { span, error } => {
+                    // We've already decided to respond with an error, so we don't need to
+                    // call the handler.
+                    (vec![Err(error)], span)
                 }
             };
 
-            let batch_timeout = match self.server_side_batch_timeout {
-                Some(value) => value,
-                None => {
-                    // Batching is not enabled - stop on the first message.
-                    return Ok(Some(BatchOrEof::Batch(smallvec::smallvec![this_msg])));
-                }
+        // Map handler result to protocol behavior.
+        // Some handler errors cause exit from pagestream protocol.
+        // Other handler errors are sent back as an error message and we stay in pagestream protocol.
+        for handler_result in handler_results {
+            let response_msg = match handler_result {
+                Err(e) => match &e {
+                    PageStreamError::Shutdown => {
+                        // If we fail to fulfil a request during shutdown, which may be _because_ of
+                        // shutdown, then do not send the error to the client.  Instead just drop the
+                        // connection.
+                        span.in_scope(|| info!("dropping connection due to shutdown"));
+                        return Err(QueryError::Shutdown);
+                    }
+                    PageStreamError::Reconnect(reason) => {
+                        span.in_scope(|| info!("handler requested reconnect: {reason}"));
+                        return Err(QueryError::Reconnect);
+                    }
+                    PageStreamError::Read(_)
+                    | PageStreamError::LsnTimeout(_)
+                    | PageStreamError::NotFound(_)
+                    | PageStreamError::BadRequest(_) => {
+                        // print the all details to the log with {:#}, but for the client the
+                        // error message is enough.  Do not log if shutting down, as the anyhow::Error
+                        // here includes cancellation which is not an error.
+                        let full = utils::error::report_compact_sources(&e);
+                        span.in_scope(|| {
+                            error!("error reading relation or page version: {full:#}")
+                        });
+                        PagestreamBeMessage::Error(PagestreamErrorResponse {
+                            message: e.to_string(),
+                        })
+                    }
+                },
+                Ok(response_msg) => response_msg,
             };
 
-            // check if we can batch
-            match (&mut batch, this_msg) {
-                (None, this_msg) => {
-                    batch = Some(this_msg);
-                }
-                (
-                    Some(BatchedFeMessage::GetPage {
-                        span: _,
-                        shard: accum_shard,
-                        pages: accum_pages,
-                        effective_request_lsn: accum_lsn,
-                    }),
-                    BatchedFeMessage::GetPage {
-                        span: _,
-                        shard: this_shard,
-                        pages: this_pages,
-                        effective_request_lsn: this_lsn,
-                    },
-                ) if async {
-                    assert_eq!(this_pages.len(), 1);
-                    if accum_pages.len() >= Timeline::MAX_GET_VECTORED_KEYS as usize {
-                        assert_eq!(accum_pages.len(), Timeline::MAX_GET_VECTORED_KEYS as usize);
-                        return false;
-                    }
-                    if (accum_shard.tenant_shard_id, accum_shard.timeline_id)
-                        != (this_shard.tenant_shard_id, this_shard.timeline_id)
-                    {
-                        // TODO: we _could_ batch & execute each shard seperately (and in parallel).
-                        // But the current logic for keeping responses in order does not support that.
-                        return false;
-                    }
-                    // the vectored get currently only supports a single LSN, so, bounce as soon
-                    // as the effective request_lsn changes
-                    if *accum_lsn != this_lsn {
-                        return false;
-                    }
-                    true
-                }
-                .await =>
-                {
-                    // ok to batch
-                    accum_pages.extend(this_pages);
-                }
-                (Some(_), this_msg) => {
-                    // by default, don't continue batching
-                    break Some(this_msg);
-                }
+            // marshal & transmit response message
+            pgb_writer.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?;
+        }
+        tokio::select! {
+            biased;
+            _ = cancel.cancelled() => {
+                // We were requested to shut down.
+                info!("shutdown request received in page handler");
+                return Err(QueryError::Shutdown)
             }
-
-            // batching impl piece
-            let started_at = batch_started_at.get_or_insert_with(Instant::now);
-            if started_at.elapsed() > batch_timeout {
-                break None;
+            res = pgb_writer.flush() => {
+                res?;
             }
-        };
-
-        self.next_batch = next_batch;
-        Ok(batch.map(|b| BatchOrEof::Batch(smallvec::smallvec![b])))
+        }
+        Ok(())
     }
 
     /// Pagestream sub-protocol handler.
@@ -845,7 +983,7 @@ impl PageServerHandler {
         ctx: RequestContext,
     ) -> Result<(), QueryError>
     where
-        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static,
     {
         debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
 
@@ -861,169 +999,283 @@ impl PageServerHandler {
             }
         }
 
-        // If [`PageServerHandler`] is reused for multiple pagestreams,
-        // then make sure to not process requests from the previous ones.
-        self.next_batch = None;
+        let pgb_reader = pgb
+            .split()
+            .context("implementation error: split pgb into reader and writer")?;
 
-        loop {
-            let maybe_batched = self
-                .read_batch_from_connection(pgb, &tenant_id, &timeline_id, &ctx)
-                .await?;
-            let batched = match maybe_batched {
-                Some(BatchOrEof::Batch(b)) => b,
-                Some(BatchOrEof::Eof) => {
-                    break;
-                }
+        let timeline_handles = self
+            .timeline_handles
+            .take()
+            .expect("implementation error: timeline_handles should not be locked");
+
+        let request_span = info_span!("request", shard_id = tracing::field::Empty);
+        let ((pgb_reader, timeline_handles), result) = match self.pipelining_config.clone() {
+            PageServicePipeliningConfig::Pipelined(pipelining_config) => {
+                self.handle_pagerequests_pipelined(
+                    pgb,
+                    pgb_reader,
+                    tenant_id,
+                    timeline_id,
+                    timeline_handles,
+                    request_span,
+                    pipelining_config,
+                    &ctx,
+                )
+                .await
+            }
+            PageServicePipeliningConfig::Serial => {
+                self.handle_pagerequests_serial(
+                    pgb,
+                    pgb_reader,
+                    tenant_id,
+                    timeline_id,
+                    timeline_handles,
+                    request_span,
+                    &ctx,
+                )
+                .await
+            }
+        };
+
+        debug!("pagestream subprotocol shut down cleanly");
+
+        pgb.unsplit(pgb_reader)
+            .context("implementation error: unsplit pgb")?;
+
+        let replaced = self.timeline_handles.replace(timeline_handles);
+        assert!(replaced.is_none());
+
+        result
+    }
+
+    #[allow(clippy::too_many_arguments)]
+    async fn handle_pagerequests_serial<IO>(
+        &mut self,
+        pgb_writer: &mut PostgresBackend<IO>,
+        mut pgb_reader: PostgresBackendReader<IO>,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        mut timeline_handles: TimelineHandles,
+        request_span: Span,
+        ctx: &RequestContext,
+    ) -> (
+        (PostgresBackendReader<IO>, TimelineHandles),
+        Result<(), QueryError>,
+    )
+    where
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static,
+    {
+        let cancel = self.cancel.clone();
+        let err = loop {
+            let msg = Self::pagestream_read_message(
+                &mut pgb_reader,
+                tenant_id,
+                timeline_id,
+                &mut timeline_handles,
+                &cancel,
+                ctx,
+                request_span.clone(),
+            )
+            .await;
+            let msg = match msg {
+                Ok(msg) => msg,
+                Err(e) => break e,
+            };
+            let msg = match msg {
+                Some(msg) => msg,
                 None => {
-                    continue;
+                    debug!("pagestream subprotocol end observed");
+                    return ((pgb_reader, timeline_handles), Ok(()));
                 }
             };
+            let err = self
+                .pagesteam_handle_batched_message(pgb_writer, msg, &cancel, ctx)
+                .await;
+            match err {
+                Ok(()) => {}
+                Err(e) => break e,
+            }
+        };
+        ((pgb_reader, timeline_handles), Err(err))
+    }
 
-            for batch in batched {
-                // invoke handler function
-                let (handler_results, span): (
-                    Vec<Result<PagestreamBeMessage, PageStreamError>>,
-                    _,
-                ) = match batch {
-                    BatchedFeMessage::Exists { span, req } => {
-                        fail::fail_point!("ps::handle-pagerequest-message::exists");
-                        (
-                            vec![
-                                self.handle_get_rel_exists_request(
-                                    tenant_id,
-                                    timeline_id,
-                                    &req,
-                                    &ctx,
-                                )
-                                .instrument(span.clone())
-                                .await,
-                            ],
-                            span,
-                        )
-                    }
-                    BatchedFeMessage::Nblocks { span, req } => {
-                        fail::fail_point!("ps::handle-pagerequest-message::nblocks");
-                        (
-                            vec![
-                                self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx)
-                                    .instrument(span.clone())
-                                    .await,
-                            ],
-                            span,
-                        )
-                    }
-                    BatchedFeMessage::GetPage {
-                        span,
-                        shard,
-                        effective_request_lsn,
-                        pages,
-                    } => {
-                        fail::fail_point!("ps::handle-pagerequest-message::getpage");
-                        (
-                            {
-                                let npages = pages.len();
-                                let res = self
-                                    .handle_get_page_at_lsn_request_batched(
-                                        &shard,
-                                        effective_request_lsn,
-                                        pages,
-                                        &ctx,
-                                    )
-                                    .instrument(span.clone())
-                                    .await;
-                                assert_eq!(res.len(), npages);
-                                res
-                            },
-                            span,
-                        )
-                    }
-                    BatchedFeMessage::DbSize { span, req } => {
-                        fail::fail_point!("ps::handle-pagerequest-message::dbsize");
-                        (
-                            vec![
-                                self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx)
-                                    .instrument(span.clone())
-                                    .await,
-                            ],
-                            span,
-                        )
-                    }
-                    BatchedFeMessage::GetSlruSegment { span, req } => {
-                        fail::fail_point!("ps::handle-pagerequest-message::slrusegment");
-                        (
-                            vec![
-                                self.handle_get_slru_segment_request(
-                                    tenant_id,
-                                    timeline_id,
-                                    &req,
-                                    &ctx,
-                                )
-                                .instrument(span.clone())
-                                .await,
-                            ],
-                            span,
-                        )
-                    }
-                    BatchedFeMessage::RespondError { span, error } => {
-                        // We've already decided to respond with an error, so we don't need to
-                        // call the handler.
-                        (vec![Err(error)], span)
-                    }
-                };
+    /// # Cancel-Safety
+    ///
+    /// May leak tokio tasks if not polled to completion.
+    #[allow(clippy::too_many_arguments)]
+    async fn handle_pagerequests_pipelined<IO>(
+        &mut self,
+        pgb_writer: &mut PostgresBackend<IO>,
+        pgb_reader: PostgresBackendReader<IO>,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        mut timeline_handles: TimelineHandles,
+        request_span: Span,
+        pipelining_config: PageServicePipeliningConfigPipelined,
+        ctx: &RequestContext,
+    ) -> (
+        (PostgresBackendReader<IO>, TimelineHandles),
+        Result<(), QueryError>,
+    )
+    where
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static,
+    {
+        //
+        // Pipelined pagestream handling consists of
+        // - a Batcher that reads requests off the wire and
+        //   and batches them if possible,
+        // - an Executor that processes the batched requests.
+        //
+        // The batch is built up inside an `spsc_fold` channel,
+        // shared betwen Batcher (Sender) and Executor (Receiver).
+        //
+        // The Batcher continously folds client requests into the batch,
+        // while the Executor can at any time take out what's in the batch
+        // in order to process it.
+        // This means the next batch builds up while the Executor
+        // executes the last batch.
+        //
+        // CANCELLATION
+        //
+        // We run both Batcher and Executor futures to completion before
+        // returning from this function.
+        //
+        // If Executor exits first, it signals cancellation to the Batcher
+        // via a CancellationToken that is child of `self.cancel`.
+        // If Batcher exits first, it signals cancellation to the Executor
+        // by dropping the spsc_fold channel Sender.
+        //
+        // CLEAN SHUTDOWN
+        //
+        // Clean shutdown means that the client ends the COPYBOTH session.
+        // In response to such a client message, the Batcher exits.
+        // The Executor continues to run, draining the spsc_fold channel.
+        // Once drained, the spsc_fold recv will fail with a distinct error
+        // indicating that the sender disconnected.
+        // The Executor exits with Ok(()) in response to that error.
+        //
+        // Server initiated shutdown is not clean shutdown, but instead
+        // is an error Err(QueryError::Shutdown) that is propagated through
+        // error propagation.
+        //
+        // ERROR PROPAGATION
+        //
+        // When the Batcher encounter an error, it sends it as a value
+        // through the spsc_fold channel and exits afterwards.
+        // When the Executor observes such an error in the channel,
+        // it exits returning that error value.
+        //
+        // This design ensures that the Executor stage will still process
+        // the batch that was in flight when the Batcher encountered an error,
+        // thereby beahving identical to a serial implementation.
 
-                // Map handler result to protocol behavior.
-                // Some handler errors cause exit from pagestream protocol.
-                // Other handler errors are sent back as an error message and we stay in pagestream protocol.
-                for handler_result in handler_results {
-                    let response_msg = match handler_result {
-                        Err(e) => match &e {
-                            PageStreamError::Shutdown => {
-                                // If we fail to fulfil a request during shutdown, which may be _because_ of
-                                // shutdown, then do not send the error to the client.  Instead just drop the
-                                // connection.
-                                span.in_scope(|| info!("dropping connection due to shutdown"));
-                                return Err(QueryError::Shutdown);
-                            }
-                            PageStreamError::Reconnect(reason) => {
-                                span.in_scope(|| info!("handler requested reconnect: {reason}"));
-                                return Err(QueryError::Reconnect);
-                            }
-                            PageStreamError::Read(_)
-                            | PageStreamError::LsnTimeout(_)
-                            | PageStreamError::NotFound(_)
-                            | PageStreamError::BadRequest(_) => {
-                                // print the all details to the log with {:#}, but for the client the
-                                // error message is enough.  Do not log if shutting down, as the anyhow::Error
-                                // here includes cancellation which is not an error.
-                                let full = utils::error::report_compact_sources(&e);
-                                span.in_scope(|| {
-                                    error!("error reading relation or page version: {full:#}")
-                                });
-                                PagestreamBeMessage::Error(PagestreamErrorResponse {
-                                    message: e.to_string(),
-                                })
-                            }
-                        },
-                        Ok(response_msg) => response_msg,
-                    };
+        let PageServicePipeliningConfigPipelined {
+            max_batch_size,
+            execution,
+        } = pipelining_config;
 
-                    // marshal & transmit response message
-                    pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?;
+        // Macro to _define_ a pipeline stage.
+        macro_rules! pipeline_stage {
+            ($name:literal, $cancel:expr, $make_fut:expr) => {{
+                let cancel: CancellationToken = $cancel;
+                let stage_fut = $make_fut(cancel.clone());
+                async move {
+                    scopeguard::defer! {
+                        debug!("exiting");
+                    }
+                    timed_after_cancellation(stage_fut, $name, Duration::from_millis(100), &cancel)
+                        .await
                 }
-                tokio::select! {
-                    biased;
-                    _ = self.cancel.cancelled() => {
-                        // We were requested to shut down.
-                        info!("shutdown request received in page handler");
-                        return Err(QueryError::Shutdown)
-                    }
-                    res = pgb.flush() => {
-                        res?;
-                    }
+                .instrument(tracing::info_span!($name))
+            }};
+        }
+
+        //
+        // Batcher
+        //
+
+        let cancel_batcher = self.cancel.child_token();
+        let (mut batch_tx, mut batch_rx) = spsc_fold::channel();
+        let batcher = pipeline_stage!("batcher", cancel_batcher.clone(), move |cancel_batcher| {
+            let ctx = ctx.attached_child();
+            async move {
+                let mut pgb_reader = pgb_reader;
+                let mut exit = false;
+                while !exit {
+                    let read_res = Self::pagestream_read_message(
+                        &mut pgb_reader,
+                        tenant_id,
+                        timeline_id,
+                        &mut timeline_handles,
+                        &cancel_batcher,
+                        &ctx,
+                        request_span.clone(),
+                    )
+                    .await;
+                    let Some(read_res) = read_res.transpose() else {
+                        debug!("client-initiated shutdown");
+                        break;
+                    };
+                    exit |= read_res.is_err();
+                    let could_send = batch_tx
+                        .send(read_res, |batch, res| {
+                            Self::pagestream_do_batch(max_batch_size, batch, res)
+                        })
+                        .await;
+                    exit |= could_send.is_err();
+                }
+                (pgb_reader, timeline_handles)
+            }
+        });
+
+        //
+        // Executor
+        //
+
+        let executor = pipeline_stage!("executor", self.cancel.clone(), move |cancel| {
+            let ctx = ctx.attached_child();
+            async move {
+                let _cancel_batcher = cancel_batcher.drop_guard();
+                loop {
+                    let maybe_batch = batch_rx.recv().await;
+                    let batch = match maybe_batch {
+                        Ok(batch) => batch,
+                        Err(spsc_fold::RecvError::SenderGone) => {
+                            debug!("upstream gone");
+                            return Ok(());
+                        }
+                    };
+                    let batch = match batch {
+                        Ok(batch) => batch,
+                        Err(e) => {
+                            return Err(e);
+                        }
+                    };
+                    self.pagesteam_handle_batched_message(pgb_writer, batch, &cancel, &ctx)
+                        .await?;
                 }
             }
+        });
+
+        //
+        // Execute the stages.
+        //
+
+        match execution {
+            PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures => {
+                tokio::join!(batcher, executor)
+            }
+            PageServiceProtocolPipelinedExecutionStrategy::Tasks => {
+                // These tasks are not tracked anywhere.
+                let read_messages_task = tokio::spawn(batcher);
+                let (read_messages_task_res, executor_res_) =
+                    tokio::join!(read_messages_task, executor,);
+                (
+                    read_messages_task_res.expect("propagated panic from read_messages"),
+                    executor_res_,
+                )
+            }
         }
-        Ok(())
     }
 
     /// Helper function to handle the LSN from client request.
@@ -1131,6 +1383,8 @@ impl PageServerHandler {
     {
         let timeline = self
             .timeline_handles
+            .as_mut()
+            .unwrap()
             .get(
                 tenant_shard_id.tenant_id,
                 timeline_id,
@@ -1165,22 +1419,17 @@ impl PageServerHandler {
     #[instrument(skip_all, fields(shard_id))]
     async fn handle_get_rel_exists_request(
         &mut self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
+        timeline: &Timeline,
         req: &PagestreamExistsRequest,
         ctx: &RequestContext,
     ) -> Result<PagestreamBeMessage, PageStreamError> {
-        let timeline = self
-            .timeline_handles
-            .get(tenant_id, timeline_id, ShardSelector::Zero)
-            .await?;
         let _timer = timeline
             .query_metrics
             .start_timer(metrics::SmgrQueryType::GetRelExists, ctx);
 
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
         let lsn = Self::wait_or_get_last_lsn(
-            &timeline,
+            timeline,
             req.request_lsn,
             req.not_modified_since,
             &latest_gc_cutoff_lsn,
@@ -1200,23 +1449,17 @@ impl PageServerHandler {
     #[instrument(skip_all, fields(shard_id))]
     async fn handle_get_nblocks_request(
         &mut self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
+        timeline: &Timeline,
         req: &PagestreamNblocksRequest,
         ctx: &RequestContext,
     ) -> Result<PagestreamBeMessage, PageStreamError> {
-        let timeline = self
-            .timeline_handles
-            .get(tenant_id, timeline_id, ShardSelector::Zero)
-            .await?;
-
         let _timer = timeline
             .query_metrics
             .start_timer(metrics::SmgrQueryType::GetRelSize, ctx);
 
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
         let lsn = Self::wait_or_get_last_lsn(
-            &timeline,
+            timeline,
             req.request_lsn,
             req.not_modified_since,
             &latest_gc_cutoff_lsn,
@@ -1236,23 +1479,17 @@ impl PageServerHandler {
     #[instrument(skip_all, fields(shard_id))]
     async fn handle_db_size_request(
         &mut self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
+        timeline: &Timeline,
         req: &PagestreamDbSizeRequest,
         ctx: &RequestContext,
     ) -> Result<PagestreamBeMessage, PageStreamError> {
-        let timeline = self
-            .timeline_handles
-            .get(tenant_id, timeline_id, ShardSelector::Zero)
-            .await?;
-
         let _timer = timeline
             .query_metrics
             .start_timer(metrics::SmgrQueryType::GetDbSize, ctx);
 
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
         let lsn = Self::wait_or_get_last_lsn(
-            &timeline,
+            timeline,
             req.request_lsn,
             req.not_modified_since,
             &latest_gc_cutoff_lsn,
@@ -1300,23 +1537,17 @@ impl PageServerHandler {
     #[instrument(skip_all, fields(shard_id))]
     async fn handle_get_slru_segment_request(
         &mut self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
+        timeline: &Timeline,
         req: &PagestreamGetSlruSegmentRequest,
         ctx: &RequestContext,
     ) -> Result<PagestreamBeMessage, PageStreamError> {
-        let timeline = self
-            .timeline_handles
-            .get(tenant_id, timeline_id, ShardSelector::Zero)
-            .await?;
-
         let _timer = timeline
             .query_metrics
             .start_timer(metrics::SmgrQueryType::GetSlruSegment, ctx);
 
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
         let lsn = Self::wait_or_get_last_lsn(
-            &timeline,
+            timeline,
             req.request_lsn,
             req.not_modified_since,
             &latest_gc_cutoff_lsn,
@@ -1374,6 +1605,8 @@ impl PageServerHandler {
 
         let timeline = self
             .timeline_handles
+            .as_mut()
+            .unwrap()
             .get(tenant_id, timeline_id, ShardSelector::Zero)
             .await?;
 
@@ -1716,7 +1949,7 @@ impl PageServiceCmd {
 
 impl<IO> postgres_backend::Handler<IO> for PageServerHandler
 where
-    IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+    IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static,
 {
     fn check_auth_jwt(
         &mut self,
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index e3c88e9965..9bcfffeb9c 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -3804,9 +3804,10 @@ class Endpoint(PgProtocol, LogUtils):
             # shared_buffers = 512kB to make postgres use LFC intensively
             # neon.max_file_cache_size and neon.file_cache size limit are
             # set to 1MB because small LFC is better for testing (helps to find more problems)
+            lfc_path_escaped = str(lfc_path).replace("'", "''")
             config_lines = [
                 "shared_buffers = 512kB",
-                f"neon.file_cache_path = '{self.lfc_path()}'",
+                f"neon.file_cache_path = '{lfc_path_escaped}'",
                 "neon.max_file_cache_size = 1MB",
                 "neon.file_cache_size_limit = 1MB",
             ] + config_lines
diff --git a/test_runner/performance/pageserver/test_pageserver_getpage_merge.py b/test_runner/performance/pageserver/test_page_service_batching.py
similarity index 69%
rename from test_runner/performance/pageserver/test_pageserver_getpage_merge.py
rename to test_runner/performance/pageserver/test_page_service_batching.py
index 34cce9900b..c47a849fec 100644
--- a/test_runner/performance/pageserver/test_pageserver_getpage_merge.py
+++ b/test_runner/performance/pageserver/test_page_service_batching.py
@@ -11,36 +11,95 @@ from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn
 from fixtures.utils import humantime_to_ms
 
-TARGET_RUNTIME = 60
+TARGET_RUNTIME = 30
+
+
+@dataclass
+class PageServicePipeliningConfig:
+    pass
+
+
+@dataclass
+class PageServicePipeliningConfigSerial(PageServicePipeliningConfig):
+    mode: str = "serial"
+
+
+@dataclass
+class PageServicePipeliningConfigPipelined(PageServicePipeliningConfig):
+    max_batch_size: int
+    execution: str
+    mode: str = "pipelined"
+
+
+EXECUTION = ["concurrent-futures", "tasks"]
+
+NON_BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()]
+for max_batch_size in [1, 32]:
+    for execution in EXECUTION:
+        NON_BATCHABLE.append(PageServicePipeliningConfigPipelined(max_batch_size, execution))
+
+BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()]
+for max_batch_size in [1, 2, 4, 8, 16, 32]:
+    for execution in EXECUTION:
+        BATCHABLE.append(PageServicePipeliningConfigPipelined(max_batch_size, execution))
 
 
-@pytest.mark.skip("See https://github.com/neondatabase/neon/pull/9820#issue-2675856095")
 @pytest.mark.parametrize(
-    "tablesize_mib, batch_timeout, target_runtime, effective_io_concurrency, readhead_buffer_size, name",
+    "tablesize_mib, pipelining_config, target_runtime, effective_io_concurrency, readhead_buffer_size, name",
     [
-        # the next 4 cases demonstrate how not-batchable workloads suffer from batching timeout
-        (50, None, TARGET_RUNTIME, 1, 128, "not batchable no batching"),
-        (50, "10us", TARGET_RUNTIME, 1, 128, "not batchable 10us timeout"),
-        (50, "1ms", TARGET_RUNTIME, 1, 128, "not batchable 1ms timeout"),
-        # the next 4 cases demonstrate how batchable workloads benefit from batching
-        (50, None, TARGET_RUNTIME, 100, 128, "batchable no batching"),
-        (50, "10us", TARGET_RUNTIME, 100, 128, "batchable 10us timeout"),
-        (50, "100us", TARGET_RUNTIME, 100, 128, "batchable 100us timeout"),
-        (50, "1ms", TARGET_RUNTIME, 100, 128, "batchable 1ms timeout"),
+        # non-batchable workloads
+        # (A separate benchmark will consider latency).
+        *[
+            (
+                50,
+                config,
+                TARGET_RUNTIME,
+                1,
+                128,
+                f"not batchable {dataclasses.asdict(config)}",
+            )
+            for config in NON_BATCHABLE
+        ],
+        # batchable workloads should show throughput and CPU efficiency improvements
+        *[
+            (
+                50,
+                config,
+                TARGET_RUNTIME,
+                100,
+                128,
+                f"batchable {dataclasses.asdict(config)}",
+            )
+            for config in BATCHABLE
+        ],
     ],
 )
-def test_getpage_merge_smoke(
+def test_throughput(
     neon_env_builder: NeonEnvBuilder,
     zenbenchmark: NeonBenchmarker,
     tablesize_mib: int,
-    batch_timeout: str | None,
+    pipelining_config: PageServicePipeliningConfig,
     target_runtime: int,
     effective_io_concurrency: int,
     readhead_buffer_size: int,
     name: str,
 ):
     """
-    Do a bunch of sequential scans and ensure that the pageserver does some merging.
+    Do a bunch of sequential scans with varying compute and pipelining configurations.
+    Primary performance metrics are the achieved batching factor and throughput (wall clock time).
+    Resource utilization is also interesting - we currently measure CPU time.
+
+    The test is a fixed-runtime based type of test (target_runtime).
+    Hence, the results are normalized to the number of iterations completed within target runtime.
+
+    If the compute doesn't provide pipeline depth (effective_io_concurrency=1),
+    performance should be about identical in all configurations.
+    Pipelining can still yield improvements in these scenarios because it parses the
+    next request while the current one is still being executed.
+
+    If the compute provides pipeline depth (effective_io_concurrency=100), then
+    pipelining configs, especially with max_batch_size>1 should yield dramatic improvements
+    in all performance metrics.
     """
 
     #
@@ -51,14 +110,16 @@ def test_getpage_merge_smoke(
     params.update(
         {
             "tablesize_mib": (tablesize_mib, {"unit": "MiB"}),
-            "batch_timeout": (
-                -1 if batch_timeout is None else 1e3 * humantime_to_ms(batch_timeout),
-                {"unit": "us"},
-            ),
             # target_runtime is just a polite ask to the workload to run for this long
             "effective_io_concurrency": (effective_io_concurrency, {}),
             "readhead_buffer_size": (readhead_buffer_size, {}),
-            # name is not a metric
+            # name is not a metric, we just use it to identify the test easily in the `test_...[...]`` notation
+        }
+    )
+    params.update(
+        {
+            f"pipelining_config.{k}": (v, {})
+            for k, v in dataclasses.asdict(pipelining_config).items()
         }
     )
 
@@ -170,7 +231,9 @@ def test_getpage_merge_smoke(
         after = get_metrics()
         return (after - before).normalize(iters - 1)
 
-    env.pageserver.patch_config_toml_nonrecursive({"server_side_batch_timeout": batch_timeout})
+    env.pageserver.patch_config_toml_nonrecursive(
+        {"page_service_pipelining": dataclasses.asdict(pipelining_config)}
+    )
     env.pageserver.restart()
     metrics = workload()
 
@@ -199,23 +262,30 @@ def test_getpage_merge_smoke(
     )
 
 
-@pytest.mark.skip("See https://github.com/neondatabase/neon/pull/9820#issue-2675856095")
+PRECISION_CONFIGS: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()]
+for max_batch_size in [1, 32]:
+    for execution in EXECUTION:
+        PRECISION_CONFIGS.append(PageServicePipeliningConfigPipelined(max_batch_size, execution))
+
+
 @pytest.mark.parametrize(
-    "batch_timeout", [None, "10us", "20us", "50us", "100us", "200us", "500us", "1ms"]
+    "pipelining_config,name",
+    [(config, f"{dataclasses.asdict(config)}") for config in PRECISION_CONFIGS],
 )
-def test_timer_precision(
+def test_latency(
     neon_env_builder: NeonEnvBuilder,
     zenbenchmark: NeonBenchmarker,
     pg_bin: PgBin,
-    batch_timeout: str | None,
+    pipelining_config: PageServicePipeliningConfig,
+    name: str,
 ):
     """
-    Determine the batching timeout precision (mean latency) and tail latency impact.
+    Measure the latency impact of pipelining in an un-batchable workloads.
 
-    The baseline is `None`; an ideal batching timeout implementation would increase
-    the mean latency by exactly `batch_timeout`.
+    An ideal implementation should not increase average or tail latencies for such workloads.
 
-    That is not the case with the current implementation, will be addressed in future changes.
+    We don't have support in pagebench to create queue depth yet.
+    => https://github.com/neondatabase/neon/issues/9837
     """
 
     #
@@ -223,7 +293,8 @@ def test_timer_precision(
     #
 
     def patch_ps_config(ps_config):
-        ps_config["server_side_batch_timeout"] = batch_timeout
+        if pipelining_config is not None:
+            ps_config["page_service_pipelining"] = dataclasses.asdict(pipelining_config)
 
     neon_env_builder.pageserver_config_override = patch_ps_config
 

From 4abc8e5282037c85a922ae113e0677c50841b309 Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Sat, 30 Nov 2024 11:11:37 +0100
Subject: [PATCH 017/117] Merge the consumption metric pushes (#9939)

#8564

## Problem

The main and backup consumption metric pushes are completely
independent,
resulting in different event time windows and different idempotency
keys.

## Summary of changes

* Merge the push tasks, but keep chunks the same size.
---
 Cargo.lock                          |   1 +
 libs/consumption_metrics/src/lib.rs |   5 +-
 proxy/Cargo.toml                    |   1 +
 proxy/src/bin/proxy.rs              |   4 -
 proxy/src/usage_metrics.rs          | 351 ++++++++++++++--------------
 5 files changed, 181 insertions(+), 181 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 313222cf3c..5ce27a7d45 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4501,6 +4501,7 @@ dependencies = [
  "ecdsa 0.16.9",
  "env_logger",
  "fallible-iterator",
+ "flate2",
  "framed-websockets",
  "futures",
  "hashbrown 0.14.5",
diff --git a/libs/consumption_metrics/src/lib.rs b/libs/consumption_metrics/src/lib.rs
index fbe2e6830f..448134f31a 100644
--- a/libs/consumption_metrics/src/lib.rs
+++ b/libs/consumption_metrics/src/lib.rs
@@ -103,11 +103,12 @@ impl<'a> IdempotencyKey<'a> {
     }
 }
 
+/// Split into chunks of 1000 metrics to avoid exceeding the max request size.
 pub const CHUNK_SIZE: usize = 1000;
 
 // Just a wrapper around a slice of events
 // to serialize it as `{"events" : [ ] }
-#[derive(serde::Serialize, Deserialize)]
-pub struct EventChunk<'a, T: Clone> {
+#[derive(Debug, serde::Serialize, serde::Deserialize, PartialEq)]
+pub struct EventChunk<'a, T: Clone + PartialEq> {
     pub events: std::borrow::Cow<'a, [T]>,
 }
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index 0d774d529d..f5934c8a89 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -112,6 +112,7 @@ workspace_hack.workspace = true
 [dev-dependencies]
 camino-tempfile.workspace = true
 fallible-iterator.workspace = true
+flate2.workspace = true
 tokio-tungstenite.workspace = true
 pbkdf2 = { workspace = true, features = ["simple", "std"] }
 rcgen.workspace = true
diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index a935378162..b772a987ee 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -517,10 +517,6 @@ async fn main() -> anyhow::Result<()> {
     if let Some(metrics_config) = &config.metric_collection {
         // TODO: Add gc regardles of the metric collection being enabled.
         maintenance_tasks.spawn(usage_metrics::task_main(metrics_config));
-        client_tasks.spawn(usage_metrics::task_backup(
-            &metrics_config.backup_metric_collection_config,
-            cancellation_token.clone(),
-        ));
     }
 
     if let Either::Left(auth::Backend::ControlPlane(api, _)) = &auth_backend {
diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs
index c5e8588623..65e74466f2 100644
--- a/proxy/src/usage_metrics.rs
+++ b/proxy/src/usage_metrics.rs
@@ -1,19 +1,18 @@
 //! Periodically collect proxy consumption metrics
 //! and push them to a HTTP endpoint.
+use std::borrow::Cow;
 use std::convert::Infallible;
-use std::pin::pin;
 use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
 use std::sync::Arc;
 use std::time::Duration;
 
-use anyhow::Context;
+use anyhow::{bail, Context};
 use async_compression::tokio::write::GzipEncoder;
 use bytes::Bytes;
 use chrono::{DateTime, Datelike, Timelike, Utc};
 use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
 use dashmap::mapref::entry::Entry;
 use dashmap::DashMap;
-use futures::future::select;
 use once_cell::sync::Lazy;
 use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel};
 use serde::{Deserialize, Serialize};
@@ -23,7 +22,7 @@ use tracing::{error, info, instrument, trace, warn};
 use utils::backoff;
 use uuid::{NoContext, Timestamp};
 
-use crate::config::{MetricBackupCollectionConfig, MetricCollectionConfig};
+use crate::config::MetricCollectionConfig;
 use crate::context::parquet::{FAILED_UPLOAD_MAX_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD};
 use crate::http;
 use crate::intern::{BranchIdInt, EndpointIdInt};
@@ -58,55 +57,21 @@ trait MetricCounterReporter {
     fn move_metrics(&self) -> (u64, usize);
 }
 
-#[derive(Debug)]
-struct MetricBackupCounter {
-    transmitted: AtomicU64,
-    opened_connections: AtomicUsize,
-}
-
-impl MetricCounterRecorder for MetricBackupCounter {
-    fn record_egress(&self, bytes: u64) {
-        self.transmitted.fetch_add(bytes, Ordering::AcqRel);
-    }
-
-    fn record_connection(&self, count: usize) {
-        self.opened_connections.fetch_add(count, Ordering::AcqRel);
-    }
-}
-
-impl MetricCounterReporter for MetricBackupCounter {
-    fn get_metrics(&mut self) -> (u64, usize) {
-        (
-            *self.transmitted.get_mut(),
-            *self.opened_connections.get_mut(),
-        )
-    }
-    fn move_metrics(&self) -> (u64, usize) {
-        (
-            self.transmitted.swap(0, Ordering::AcqRel),
-            self.opened_connections.swap(0, Ordering::AcqRel),
-        )
-    }
-}
-
 #[derive(Debug)]
 pub(crate) struct MetricCounter {
     transmitted: AtomicU64,
     opened_connections: AtomicUsize,
-    backup: Arc<MetricBackupCounter>,
 }
 
 impl MetricCounterRecorder for MetricCounter {
     /// Record that some bytes were sent from the proxy to the client
     fn record_egress(&self, bytes: u64) {
-        self.transmitted.fetch_add(bytes, Ordering::AcqRel);
-        self.backup.record_egress(bytes);
+        self.transmitted.fetch_add(bytes, Ordering::Relaxed);
     }
 
     /// Record that some connections were opened
     fn record_connection(&self, count: usize) {
-        self.opened_connections.fetch_add(count, Ordering::AcqRel);
-        self.backup.record_connection(count);
+        self.opened_connections.fetch_add(count, Ordering::Relaxed);
     }
 }
 
@@ -119,8 +84,8 @@ impl MetricCounterReporter for MetricCounter {
     }
     fn move_metrics(&self) -> (u64, usize) {
         (
-            self.transmitted.swap(0, Ordering::AcqRel),
-            self.opened_connections.swap(0, Ordering::AcqRel),
+            self.transmitted.swap(0, Ordering::Relaxed),
+            self.opened_connections.swap(0, Ordering::Relaxed),
         )
     }
 }
@@ -173,26 +138,11 @@ type FastHasher = std::hash::BuildHasherDefault<rustc_hash::FxHasher>;
 #[derive(Default)]
 pub(crate) struct Metrics {
     endpoints: DashMap<Ids, Arc<MetricCounter>, FastHasher>,
-    backup_endpoints: DashMap<Ids, Arc<MetricBackupCounter>, FastHasher>,
 }
 
 impl Metrics {
     /// Register a new byte metrics counter for this endpoint
     pub(crate) fn register(&self, ids: Ids) -> Arc<MetricCounter> {
-        let backup = if let Some(entry) = self.backup_endpoints.get(&ids) {
-            entry.clone()
-        } else {
-            self.backup_endpoints
-                .entry(ids.clone())
-                .or_insert_with(|| {
-                    Arc::new(MetricBackupCounter {
-                        transmitted: AtomicU64::new(0),
-                        opened_connections: AtomicUsize::new(0),
-                    })
-                })
-                .clone()
-        };
-
         let entry = if let Some(entry) = self.endpoints.get(&ids) {
             entry.clone()
         } else {
@@ -202,7 +152,6 @@ impl Metrics {
                     Arc::new(MetricCounter {
                         transmitted: AtomicU64::new(0),
                         opened_connections: AtomicUsize::new(0),
-                        backup: backup.clone(),
                     })
                 })
                 .clone()
@@ -227,6 +176,21 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<Infall
     );
     let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned();
 
+    // Even if the remote storage is not configured, we still want to clear the metrics.
+    let storage = if let Some(config) = config
+        .backup_metric_collection_config
+        .remote_storage_config
+        .as_ref()
+    {
+        Some(
+            GenericRemoteStorage::from_config(config)
+                .await
+                .context("remote storage init")?,
+        )
+    } else {
+        None
+    };
+
     let mut prev = Utc::now();
     let mut ticker = tokio::time::interval(config.interval);
     loop {
@@ -237,6 +201,8 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<Infall
             &USAGE_METRICS.endpoints,
             &http_client,
             &config.endpoint,
+            storage.as_ref(),
+            config.backup_metric_collection_config.chunk_size,
             &hostname,
             prev,
             now,
@@ -283,7 +249,6 @@ fn create_event_chunks<'a>(
     now: DateTime<Utc>,
     chunk_size: usize,
 ) -> impl Iterator<Item = EventChunk<'a, Event<Ids, &'static str>>> + 'a {
-    // Split into chunks of 1000 metrics to avoid exceeding the max request size
     metrics_to_send
         .chunks(chunk_size)
         .map(move |chunk| EventChunk {
@@ -303,11 +268,14 @@ fn create_event_chunks<'a>(
         })
 }
 
+#[expect(clippy::too_many_arguments)]
 #[instrument(skip_all)]
 async fn collect_metrics_iteration(
     endpoints: &DashMap<Ids, Arc<MetricCounter>, FastHasher>,
     client: &http::ClientWithMiddleware,
     metric_collection_endpoint: &reqwest::Url,
+    storage: Option<&GenericRemoteStorage>,
+    outer_chunk_size: usize,
     hostname: &str,
     prev: DateTime<Utc>,
     now: DateTime<Utc>,
@@ -323,17 +291,54 @@ async fn collect_metrics_iteration(
         trace!("no new metrics to send");
     }
 
+    let cancel = CancellationToken::new();
+    let path_prefix = create_remote_path_prefix(now);
+
     // Send metrics.
-    for chunk in create_event_chunks(&metrics_to_send, hostname, prev, now, CHUNK_SIZE) {
+    for chunk in create_event_chunks(&metrics_to_send, hostname, prev, now, outer_chunk_size) {
+        tokio::join!(
+            upload_main_events_chunked(client, metric_collection_endpoint, &chunk, CHUNK_SIZE),
+            async {
+                if let Err(e) = upload_backup_events(storage, &chunk, &path_prefix, &cancel).await {
+                    error!("failed to upload consumption events to remote storage: {e:?}");
+                }
+            }
+        );
+    }
+}
+
+fn create_remote_path_prefix(now: DateTime<Utc>) -> String {
+    format!(
+        "year={year:04}/month={month:02}/day={day:02}/{hour:02}:{minute:02}:{second:02}Z",
+        year = now.year(),
+        month = now.month(),
+        day = now.day(),
+        hour = now.hour(),
+        minute = now.minute(),
+        second = now.second(),
+    )
+}
+
+async fn upload_main_events_chunked(
+    client: &http::ClientWithMiddleware,
+    metric_collection_endpoint: &reqwest::Url,
+    chunk: &EventChunk<'_, Event<Ids, &str>>,
+    subchunk_size: usize,
+) {
+    // Split into smaller chunks to avoid exceeding the max request size
+    for subchunk in chunk.events.chunks(subchunk_size).map(|c| EventChunk {
+        events: Cow::Borrowed(c),
+    }) {
         let res = client
             .post(metric_collection_endpoint.clone())
-            .json(&chunk)
+            .json(&subchunk)
             .send()
             .await;
 
         let res = match res {
             Ok(x) => x,
             Err(err) => {
+                // TODO: retry?
                 error!("failed to send metrics: {:?}", err);
                 continue;
             }
@@ -341,7 +346,7 @@ async fn collect_metrics_iteration(
 
         if !res.status().is_success() {
             error!("metrics endpoint refused the sent metrics: {:?}", res);
-            for metric in chunk.events.iter().filter(|e| e.value > (1u64 << 40)) {
+            for metric in subchunk.events.iter().filter(|e| e.value > (1u64 << 40)) {
                 // Report if the metric value is suspiciously large
                 warn!("potentially abnormal metric value: {:?}", metric);
             }
@@ -349,113 +354,34 @@ async fn collect_metrics_iteration(
     }
 }
 
-pub async fn task_backup(
-    backup_config: &MetricBackupCollectionConfig,
-    cancellation_token: CancellationToken,
-) -> anyhow::Result<()> {
-    info!("metrics backup config: {backup_config:?}");
-    scopeguard::defer! {
-        info!("metrics backup has shut down");
-    }
-    // Even if the remote storage is not configured, we still want to clear the metrics.
-    let storage = if let Some(config) = backup_config.remote_storage_config.as_ref() {
-        Some(
-            GenericRemoteStorage::from_config(config)
-                .await
-                .context("remote storage init")?,
-        )
-    } else {
-        None
-    };
-    let mut ticker = tokio::time::interval(backup_config.interval);
-    let mut prev = Utc::now();
-    let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned();
-    loop {
-        select(pin!(ticker.tick()), pin!(cancellation_token.cancelled())).await;
-        let now = Utc::now();
-        collect_metrics_backup_iteration(
-            &USAGE_METRICS.backup_endpoints,
-            storage.as_ref(),
-            &hostname,
-            prev,
-            now,
-            backup_config.chunk_size,
-        )
-        .await;
-
-        prev = now;
-        if cancellation_token.is_cancelled() {
-            info!("metrics backup has been cancelled");
-            break;
-        }
-    }
-    Ok(())
-}
-
-#[instrument(skip_all)]
-async fn collect_metrics_backup_iteration(
-    endpoints: &DashMap<Ids, Arc<MetricBackupCounter>, FastHasher>,
+async fn upload_backup_events(
     storage: Option<&GenericRemoteStorage>,
-    hostname: &str,
-    prev: DateTime<Utc>,
-    now: DateTime<Utc>,
-    chunk_size: usize,
-) {
-    let year = now.year();
-    let month = now.month();
-    let day = now.day();
-    let hour = now.hour();
-    let minute = now.minute();
-    let second = now.second();
-    let cancel = CancellationToken::new();
-
-    info!("starting collect_metrics_backup_iteration");
-
-    let metrics_to_send = collect_and_clear_metrics(endpoints);
-
-    if metrics_to_send.is_empty() {
-        trace!("no new metrics to send");
-    }
-
-    // Send metrics.
-    for chunk in create_event_chunks(&metrics_to_send, hostname, prev, now, chunk_size) {
-        let real_now = Utc::now();
-        let id = uuid::Uuid::new_v7(Timestamp::from_unix(
-            NoContext,
-            real_now.second().into(),
-            real_now.nanosecond(),
-        ));
-        let path = format!("year={year:04}/month={month:02}/day={day:02}/{hour:02}:{minute:02}:{second:02}Z_{id}.json.gz");
-        let remote_path = match RemotePath::from_string(&path) {
-            Ok(remote_path) => remote_path,
-            Err(e) => {
-                error!("failed to create remote path from str {path}: {:?}", e);
-                continue;
-            }
-        };
-
-        let res = upload_events_chunk(storage, chunk, &remote_path, &cancel).await;
-
-        if let Err(e) = res {
-            error!(
-                "failed to upload consumption events to remote storage: {:?}",
-                e
-            );
-        }
-    }
-}
-
-async fn upload_events_chunk(
-    storage: Option<&GenericRemoteStorage>,
-    chunk: EventChunk<'_, Event<Ids, &'static str>>,
-    remote_path: &RemotePath,
+    chunk: &EventChunk<'_, Event<Ids, &'static str>>,
+    path_prefix: &str,
     cancel: &CancellationToken,
 ) -> anyhow::Result<()> {
     let Some(storage) = storage else {
-        error!("no remote storage configured");
+        warn!("no remote storage configured");
         return Ok(());
     };
-    let data = serde_json::to_vec(&chunk).context("serialize metrics")?;
+
+    let real_now = Utc::now();
+    let id = uuid::Uuid::new_v7(Timestamp::from_unix(
+        NoContext,
+        real_now.second().into(),
+        real_now.nanosecond(),
+    ));
+    let path = format!("{path_prefix}_{id}.json.gz");
+    let remote_path = match RemotePath::from_string(&path) {
+        Ok(remote_path) => remote_path,
+        Err(e) => {
+            bail!("failed to create remote path from str {path}: {:?}", e);
+        }
+    };
+
+    // TODO: This is async compression from Vec to Vec. Rewrite as byte stream.
+    //       Use sync compression in blocking threadpool.
+    let data = serde_json::to_vec(chunk).context("serialize metrics")?;
     let mut encoder = GzipEncoder::new(Vec::new());
     encoder.write_all(&data).await.context("compress metrics")?;
     encoder.shutdown().await.context("compress metrics")?;
@@ -464,7 +390,7 @@ async fn upload_events_chunk(
         || async {
             let stream = futures::stream::once(futures::future::ready(Ok(compressed_data.clone())));
             storage
-                .upload(stream, compressed_data.len(), remote_path, None, cancel)
+                .upload(stream, compressed_data.len(), &remote_path, None, cancel)
                 .await
         },
         TimeoutOrCancel::caused_by_cancel,
@@ -482,9 +408,12 @@ async fn upload_events_chunk(
 
 #[cfg(test)]
 mod tests {
+    use std::fs;
+    use std::io::BufReader;
     use std::sync::{Arc, Mutex};
 
     use anyhow::Error;
+    use camino_tempfile::tempdir;
     use chrono::Utc;
     use consumption_metrics::{Event, EventChunk};
     use http_body_util::BodyExt;
@@ -493,6 +422,7 @@ mod tests {
     use hyper::service::service_fn;
     use hyper::{Request, Response};
     use hyper_util::rt::TokioIo;
+    use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
     use tokio::net::TcpListener;
     use url::Url;
 
@@ -538,8 +468,34 @@ mod tests {
         let endpoint = Url::parse(&format!("http://{addr}")).unwrap();
         let now = Utc::now();
 
+        let storage_test_dir = tempdir().unwrap();
+        let local_fs_path = storage_test_dir.path().join("usage_metrics");
+        fs::create_dir_all(&local_fs_path).unwrap();
+        let storage = GenericRemoteStorage::from_config(&RemoteStorageConfig {
+            storage: RemoteStorageKind::LocalFs {
+                local_path: local_fs_path.clone(),
+            },
+            timeout: Duration::from_secs(10),
+            small_timeout: Duration::from_secs(1),
+        })
+        .await
+        .unwrap();
+
+        let mut pushed_chunks: Vec<Report> = Vec::new();
+        let mut stored_chunks: Vec<Report> = Vec::new();
+
         // no counters have been registered
-        collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await;
+        collect_metrics_iteration(
+            &metrics.endpoints,
+            &client,
+            &endpoint,
+            Some(&storage),
+            1000,
+            "foo",
+            now,
+            now,
+        )
+        .await;
         let r = std::mem::take(&mut *reports.lock().unwrap());
         assert!(r.is_empty());
 
@@ -551,39 +507,84 @@ mod tests {
         });
 
         // the counter should be observed despite 0 egress
-        collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await;
+        collect_metrics_iteration(
+            &metrics.endpoints,
+            &client,
+            &endpoint,
+            Some(&storage),
+            1000,
+            "foo",
+            now,
+            now,
+        )
+        .await;
         let r = std::mem::take(&mut *reports.lock().unwrap());
         assert_eq!(r.len(), 1);
         assert_eq!(r[0].events.len(), 1);
         assert_eq!(r[0].events[0].value, 0);
+        pushed_chunks.extend(r);
 
         // record egress
         counter.record_egress(1);
 
         // egress should be observered
-        collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await;
+        collect_metrics_iteration(
+            &metrics.endpoints,
+            &client,
+            &endpoint,
+            Some(&storage),
+            1000,
+            "foo",
+            now,
+            now,
+        )
+        .await;
         let r = std::mem::take(&mut *reports.lock().unwrap());
         assert_eq!(r.len(), 1);
         assert_eq!(r[0].events.len(), 1);
         assert_eq!(r[0].events[0].value, 1);
+        pushed_chunks.extend(r);
 
         // release counter
         drop(counter);
 
         // we do not observe the counter
-        collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await;
+        collect_metrics_iteration(
+            &metrics.endpoints,
+            &client,
+            &endpoint,
+            Some(&storage),
+            1000,
+            "foo",
+            now,
+            now,
+        )
+        .await;
         let r = std::mem::take(&mut *reports.lock().unwrap());
         assert!(r.is_empty());
 
         // counter is unregistered
         assert!(metrics.endpoints.is_empty());
 
-        collect_metrics_backup_iteration(&metrics.backup_endpoints, None, "foo", now, now, 1000)
-            .await;
-        assert!(!metrics.backup_endpoints.is_empty());
-        collect_metrics_backup_iteration(&metrics.backup_endpoints, None, "foo", now, now, 1000)
-            .await;
-        // backup counter is unregistered after the second iteration
-        assert!(metrics.backup_endpoints.is_empty());
+        let path_prefix = create_remote_path_prefix(now);
+        for entry in walkdir::WalkDir::new(&local_fs_path)
+            .into_iter()
+            .filter_map(|e| e.ok())
+        {
+            let path = local_fs_path.join(&path_prefix).to_string();
+            if entry.path().to_str().unwrap().starts_with(&path) {
+                let chunk = serde_json::from_reader(flate2::bufread::GzDecoder::new(
+                    BufReader::new(fs::File::open(entry.into_path()).unwrap()),
+                ))
+                .unwrap();
+                stored_chunks.push(chunk);
+            }
+        }
+        storage_test_dir.close().ok();
+
+        // sort by first event's idempotency key because the order of files is nondeterministic
+        pushed_chunks.sort_by_cached_key(|c| c.events[0].idempotency_key.clone());
+        stored_chunks.sort_by_cached_key(|c| c.events[0].idempotency_key.clone());
+        assert_eq!(pushed_chunks, stored_chunks);
     }
 }

From 97a9abd18131708d0daadfd9c43b95048b538910 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Sun, 1 Dec 2024 14:23:10 +0200
Subject: [PATCH 018/117] Add GUC controlling whether to pause recovery if some
 critical GUCs at replica have smaller value than on primary (#9057)

## Problem

See https://github.com/neondatabase/neon/issues/9023

## Summary of changes

Ass GUC `recovery_pause_on_misconfig` allowing not to pause in case of
replica and primary configuration mismatch

See https://github.com/neondatabase/postgres/pull/501
See https://github.com/neondatabase/postgres/pull/502
See https://github.com/neondatabase/postgres/pull/503
See https://github.com/neondatabase/postgres/pull/504


## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
---
 pgxn/neon/neon.c                              |  13 ++
 .../regress/test_physical_replication.py      | 221 +++++++++++++++++-
 vendor/postgres-v14                           |   2 +-
 vendor/postgres-v15                           |   2 +-
 vendor/postgres-v16                           |   2 +-
 vendor/postgres-v17                           |   2 +-
 vendor/revisions.json                         |   8 +-
 7 files changed, 241 insertions(+), 9 deletions(-)

diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c
index 51b9f58bbc..ff08f9164d 100644
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -15,6 +15,9 @@
 #include "access/subtrans.h"
 #include "access/twophase.h"
 #include "access/xlog.h"
+#if PG_MAJORVERSION_NUM >= 15
+#include "access/xlogrecovery.h"
+#endif
 #include "replication/logical.h"
 #include "replication/slot.h"
 #include "replication/walsender.h"
@@ -432,6 +435,16 @@ _PG_init(void)
 	restore_running_xacts_callback = RestoreRunningXactsFromClog;
 
 
+	DefineCustomBoolVariable(
+							"neon.allow_replica_misconfig",
+							"Allow replica startup when some critical GUCs have smaller value than on primary node",
+							NULL,
+							&allowReplicaMisconfig,
+							true,
+							PGC_POSTMASTER,
+							0,
+							NULL, NULL, NULL);
+
 	DefineCustomEnumVariable(
 							"neon.running_xacts_overflow_policy",
 							"Action performed on snapshot overflow when restoring runnings xacts from CLOG",
diff --git a/test_runner/regress/test_physical_replication.py b/test_runner/regress/test_physical_replication.py
index 043aff686b..6cb11b825d 100644
--- a/test_runner/regress/test_physical_replication.py
+++ b/test_runner/regress/test_physical_replication.py
@@ -4,6 +4,10 @@ import random
 import time
 from typing import TYPE_CHECKING
 
+import pytest
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import wait_replica_caughtup
+
 if TYPE_CHECKING:
     from fixtures.neon_fixtures import NeonEnv
 
@@ -19,8 +23,8 @@ def test_physical_replication(neon_simple_env: NeonEnv):
                 p_cur.execute(
                     "CREATE TABLE t(pk bigint primary key, payload text default repeat('?',200))"
                 )
-        time.sleep(1)
         with env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") as secondary:
+            wait_replica_caughtup(primary, secondary)
             with primary.connect() as p_con:
                 with p_con.cursor() as p_cur:
                     with secondary.connect() as s_con:
@@ -42,3 +46,218 @@ def test_physical_replication(neon_simple_env: NeonEnv):
                                 s_cur.execute(
                                     "select * from t where pk=%s", (random.randrange(1, 2 * pk),)
                                 )
+
+
+def test_physical_replication_config_mismatch_max_connections(neon_simple_env: NeonEnv):
+    """
+    Test for primary and replica with different configuration settings (max_connections).
+    PostgreSQL enforces that settings that affect how many transactions can be open at the same time
+    have values equal to or higher in a hot standby replica than in the primary. If they don't, the replica refuses
+    to start up. If the settings are changed in the primary, it emits a WAL record with the new settings, and
+    when the replica sees that record it pauses the replay.
+
+    PostgreSQL enforces this to ensure that the replica can hold all the XIDs in the so-called
+    "known-assigned XIDs" array, which is a fixed size array that needs to be allocated
+    upfront and server startup. That's pretty pessimistic, though; usually you can get
+    away with smaller settings, because we allocate space for 64 subtransactions per
+    transaction too. If you get unlucky and you run out of space, WAL redo dies with
+    "ERROR: too many KnownAssignedXids". It's better to take the chances than refuse
+    to start up, especially in Neon: if the WAL redo dies, the server is restarted, which is
+    no worse than refusing to start up in the first place. Furthermore, the control plane
+    tries to ensure that on restart, the settings are set high enough, so most likely it will
+    work after restart. Because of that, we have patched Postgres to disable to checks when
+    the `recovery_pause_on_misconfig` setting is set to `false` (which is the default on neon).
+
+    This test tests all those cases of running out of space in known-assigned XIDs array that
+    we can hit with `recovery_pause_on_misconfig=false`, which are unreachable in unpatched
+    Postgres.
+    There's a similar check for `max_locks_per_transactions` too, which is related to running out
+    of space in the lock manager rather than known-assigned XIDs. Similar story with that, although
+    running out of space in the lock manager is possible in unmodified Postgres too. Enforcing the
+    check for `max_locks_per_transactions` ensures  that you don't run out of space in the lock manager
+    when there are no read-only queries holding locks in the replica, but you can still run out if you have
+    those.
+    """
+    env = neon_simple_env
+    with env.endpoints.create_start(
+        branch_name="main",
+        endpoint_id="primary",
+    ) as primary:
+        with primary.connect() as p_con:
+            with p_con.cursor() as p_cur:
+                p_cur.execute(
+                    "CREATE TABLE t(pk bigint primary key, payload text default repeat('?',200))"
+                )
+        with env.endpoints.new_replica_start(
+            origin=primary,
+            endpoint_id="secondary",
+            config_lines=["max_connections=5"],
+        ) as secondary:
+            wait_replica_caughtup(primary, secondary)
+            with secondary.connect() as s_con:
+                with s_con.cursor() as s_cur:
+                    cursors = []
+                    for i in range(10):
+                        p_con = primary.connect()
+                        p_cur = p_con.cursor()
+                        p_cur.execute("begin")
+                        p_cur.execute("insert into t (pk) values (%s)", (i,))
+                        cursors.append(p_cur)
+
+                    for p_cur in cursors:
+                        p_cur.execute("commit")
+
+                    wait_replica_caughtup(primary, secondary)
+                    s_cur.execute("select count(*) from t")
+                    assert s_cur.fetchall()[0][0] == 10
+
+
+def test_physical_replication_config_mismatch_max_prepared(neon_simple_env: NeonEnv):
+    """
+    Test for primary and replica with different configuration settings (max_prepared_transactions).
+    If number of transactions at primary exceeds its limit at replica then WAL replay is terminated.
+    """
+    env = neon_simple_env
+    primary = env.endpoints.create_start(
+        branch_name="main",
+        endpoint_id="primary",
+        config_lines=["max_prepared_transactions=10"],
+    )
+    p_con = primary.connect()
+    p_cur = p_con.cursor()
+    p_cur.execute("CREATE TABLE t(pk bigint primary key, payload text default repeat('?',200))")
+
+    secondary = env.endpoints.new_replica_start(
+        origin=primary,
+        endpoint_id="secondary",
+        config_lines=["max_prepared_transactions=5"],
+    )
+    wait_replica_caughtup(primary, secondary)
+
+    s_con = secondary.connect()
+    s_cur = s_con.cursor()
+    cursors = []
+    for i in range(10):
+        p_con = primary.connect()
+        p_cur = p_con.cursor()
+        p_cur.execute("begin")
+        p_cur.execute("insert into t (pk) values (%s)", (i,))
+        p_cur.execute(f"prepare transaction 't{i}'")
+        cursors.append(p_cur)
+
+    for i in range(10):
+        cursors[i].execute(f"commit prepared 't{i}'")
+
+    time.sleep(5)
+    with pytest.raises(Exception) as e:
+        s_cur.execute("select count(*) from t")
+        assert s_cur.fetchall()[0][0] == 10
+        secondary.stop()
+
+    log.info(f"Replica crashed with {e}")
+    assert secondary.log_contains("maximum number of prepared transactions reached")
+
+
+def connect(ep):
+    max_reconnect_attempts = 10
+    for _ in range(max_reconnect_attempts):
+        try:
+            return ep.connect()
+        except Exception as e:
+            log.info(f"Failed to connect with primary: {e}")
+            time.sleep(1)
+
+
+def test_physical_replication_config_mismatch_too_many_known_xids(neon_simple_env: NeonEnv):
+    """
+    Test for primary and replica with different configuration settings (max_connections).
+    In this case large difference in this setting and larger number of concurrent transactions at primary
+    # cause too many known xids error  at replica.
+    """
+    env = neon_simple_env
+    primary = env.endpoints.create_start(
+        branch_name="main",
+        endpoint_id="primary",
+        config_lines=[
+            "max_connections=1000",
+            "shared_buffers=128MB",  # prevent "no unpinned buffers available" error
+        ],
+    )
+    secondary = env.endpoints.new_replica_start(
+        origin=primary,
+        endpoint_id="secondary",
+        config_lines=[
+            "max_connections=2",
+            "autovacuum_max_workers=1",
+            "max_worker_processes=5",
+            "max_wal_senders=1",
+            "superuser_reserved_connections=0",
+        ],
+    )
+
+    p_con = primary.connect()
+    p_cur = p_con.cursor()
+    p_cur.execute("CREATE TABLE t(x integer)")
+
+    n_connections = 990
+    cursors = []
+    for i in range(n_connections):
+        p_con = connect(primary)
+        p_cur = p_con.cursor()
+        p_cur.execute("begin")
+        p_cur.execute(f"insert into t values({i})")
+        cursors.append(p_cur)
+
+    for cur in cursors:
+        cur.execute("commit")
+
+    time.sleep(5)
+    with pytest.raises(Exception) as e:
+        s_con = secondary.connect()
+        s_cur = s_con.cursor()
+        s_cur.execute("select count(*) from t")
+        assert s_cur.fetchall()[0][0] == n_connections
+        secondary.stop()
+
+    log.info(f"Replica crashed with {e}")
+    assert secondary.log_contains("too many KnownAssignedXids")
+
+
+def test_physical_replication_config_mismatch_max_locks_per_transaction(neon_simple_env: NeonEnv):
+    """
+    Test for primary and replica with different configuration settings (max_locks_per_transaction).
+    In  conjunction with different number of max_connections at primary and standby it can cause "out of shared memory"
+    error if the primary obtains more AccessExclusiveLocks than the standby can hold.
+    """
+    env = neon_simple_env
+    primary = env.endpoints.create_start(
+        branch_name="main",
+        endpoint_id="primary",
+        config_lines=[
+            "max_locks_per_transaction = 100",
+        ],
+    )
+    secondary = env.endpoints.new_replica_start(
+        origin=primary,
+        endpoint_id="secondary",
+        config_lines=[
+            "max_connections=10",
+            "max_locks_per_transaction = 10",
+        ],
+    )
+
+    n_tables = 1000
+
+    p_con = primary.connect()
+    p_cur = p_con.cursor()
+    p_cur.execute("begin")
+    for i in range(n_tables):
+        p_cur.execute(f"CREATE TABLE t_{i}(x integer)")
+    p_cur.execute("commit")
+
+    with pytest.raises(Exception) as e:
+        wait_replica_caughtup(primary, secondary)
+        secondary.stop()
+
+    log.info(f"Replica crashed with {e}")
+    assert secondary.log_contains("You might need to increase")
diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index c1989c934d..373f9decad 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit c1989c934d46e04e78b3c496c8a34bcd40ddceeb
+Subproject commit 373f9decad933d2d46f321231032ae8b0da81acd
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index d929b9a8b9..972e325e62 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit d929b9a8b9f32f6fe5a0eac3e6e963f0e44e27e6
+Subproject commit 972e325e62b455957adbbdd8580e31275bb5b8c9
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 13e9e35394..dff6615a8e 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 13e9e3539419003e79bd9aa29e1bc44f3fd555dd
+Subproject commit dff6615a8e48a10bb17a03fa3c00635f1ace7a92
diff --git a/vendor/postgres-v17 b/vendor/postgres-v17
index faebe5e5af..a10d95be67 160000
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
@@ -1 +1 @@
-Subproject commit faebe5e5aff5687908504453623778f8515529db
+Subproject commit a10d95be67265e0f10a422ba0457f5a7af01de71
diff --git a/vendor/revisions.json b/vendor/revisions.json
index abeddcadf7..8a73e14dcf 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,18 +1,18 @@
 {
   "v17": [
     "17.2",
-    "faebe5e5aff5687908504453623778f8515529db"
+    "a10d95be67265e0f10a422ba0457f5a7af01de71"
   ],
   "v16": [
     "16.6",
-    "13e9e3539419003e79bd9aa29e1bc44f3fd555dd"
+    "dff6615a8e48a10bb17a03fa3c00635f1ace7a92"
   ],
   "v15": [
     "15.10",
-    "d929b9a8b9f32f6fe5a0eac3e6e963f0e44e27e6"
+    "972e325e62b455957adbbdd8580e31275bb5b8c9"
   ],
   "v14": [
     "14.15",
-    "c1989c934d46e04e78b3c496c8a34bcd40ddceeb"
+    "373f9decad933d2d46f321231032ae8b0da81acd"
   ]
 }

From fae8e7ba76b134a06f7175eadb71c87038a1b399 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Sun, 1 Dec 2024 13:04:37 +0000
Subject: [PATCH 019/117] Compute image: prepare Postgres v14-v16 for Debian 12
 (#9954)

## Problem

Current compute images for Postgres 14-16 don't build on Debian 12
because of issues with extensions.
This PR fixes that, but for the current setup, it is mostly a no-op
change.

## Summary of changes
- Use `/bin/bash -euo pipefail` as SHELL to fail earlier
- Fix `plv8` build: backport a trivial patch for v8
- Fix `postgis` build: depend `sfgal` version on Debian version instead
of Postgres version


Tested in: https://github.com/neondatabase/neon/pull/9849
---
 compute/compute-node.Dockerfile   | 17 ++++++++-----
 compute/patches/plv8-3.1.10.patch | 42 +++++++++++++++++++++++++++++++
 2 files changed, 53 insertions(+), 6 deletions(-)
 create mode 100644 compute/patches/plv8-3.1.10.patch

diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile
index 2fcd9985bc..9567018053 100644
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -14,6 +14,9 @@ ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim
 FROM debian:$DEBIAN_FLAVOR AS build-deps
 ARG DEBIAN_VERSION
 
+# Use strict mode for bash to catch errors early
+SHELL ["/bin/bash", "-euo", "pipefail", "-c"]
+
 RUN case $DEBIAN_VERSION in \
       # Version-specific installs for Bullseye (PG14-PG16):
       # The h3_pg extension needs a cmake 3.20+, but Debian bullseye has 3.18.
@@ -106,6 +109,7 @@ RUN cd postgres && \
 #
 #########################################################################################
 FROM build-deps AS postgis-build
+ARG DEBIAN_VERSION
 ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 RUN apt update && \
@@ -122,12 +126,12 @@ RUN apt update && \
 # and also we must check backward compatibility with older versions of PostGIS.
 #
 # Use new version only for v17
-RUN case "${PG_VERSION}" in \
-    "v17") \
+RUN case "${DEBIAN_VERSION}" in \
+    "bookworm") \
         export SFCGAL_VERSION=1.4.1 \
         export SFCGAL_CHECKSUM=1800c8a26241588f11cddcf433049e9b9aea902e923414d2ecef33a3295626c3 \
     ;; \
-    "v14" | "v15" | "v16") \
+    "bullseye") \
         export SFCGAL_VERSION=1.3.10 \
         export SFCGAL_CHECKSUM=4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 \
     ;; \
@@ -228,6 +232,8 @@ FROM build-deps AS plv8-build
 ARG PG_VERSION
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
+COPY compute/patches/plv8-3.1.10.patch /plv8-3.1.10.patch
+
 RUN apt update && \
     apt install --no-install-recommends -y ninja-build python3-dev libncurses5 binutils clang
 
@@ -239,8 +245,6 @@ RUN apt update && \
 #
 # Use new version only for v17
 # because since v3.2, plv8 doesn't include plcoffee and plls extensions
-ENV PLV8_TAG=v3.2.3
-
 RUN case "${PG_VERSION}" in \
     "v17") \
         export PLV8_TAG=v3.2.3 \
@@ -255,8 +259,9 @@ RUN case "${PG_VERSION}" in \
     git clone --recurse-submodules --depth 1 --branch ${PLV8_TAG} https://github.com/plv8/plv8.git plv8-src && \
     tar -czf plv8.tar.gz --exclude .git plv8-src && \
     cd plv8-src && \
+    if [[ "${PG_VERSION}" < "v17" ]]; then patch -p1 < /plv8-3.1.10.patch; fi && \
     # generate and copy upgrade scripts
-    mkdir -p upgrade && ./generate_upgrade.sh 3.1.10 && \
+    mkdir -p upgrade && ./generate_upgrade.sh ${PLV8_TAG#v} && \
     cp upgrade/* /usr/local/pgsql/share/extension/ && \
     export PATH="/usr/local/pgsql/bin:$PATH" && \
     make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
diff --git a/compute/patches/plv8-3.1.10.patch b/compute/patches/plv8-3.1.10.patch
new file mode 100644
index 0000000000..43cdb479f7
--- /dev/null
+++ b/compute/patches/plv8-3.1.10.patch
@@ -0,0 +1,42 @@
+commit 46b38d3e46f9cd6c70d9b189dd6ff4abaa17cf5e
+Author: Alexander Bayandin <alexander@neon.tech>
+Date:   Sat Nov 30 18:29:32 2024 +0000
+
+    Fix v8 9.7.37 compilation on Debian 12
+
+diff --git a/patches/code/84cf3230a9680aac3b73c410c2b758760b6d3066.patch b/patches/code/84cf3230a9680aac3b73c410c2b758760b6d3066.patch
+new file mode 100644
+index 0000000..f0a5dc7
+--- /dev/null
++++ b/patches/code/84cf3230a9680aac3b73c410c2b758760b6d3066.patch
+@@ -0,0 +1,30 @@
++From 84cf3230a9680aac3b73c410c2b758760b6d3066 Mon Sep 17 00:00:00 2001
++From: Michael Lippautz <mlippautz@chromium.org>
++Date: Thu, 27 Jan 2022 14:14:11 +0100
++Subject: [PATCH] cppgc: Fix include
++
++Add <utility> to cover for std::exchange.
++
++Bug: v8:12585
++Change-Id: Ida65144e93e466be8914527d0e646f348c136bcb
++Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3420309
++Auto-Submit: Michael Lippautz <mlippautz@chromium.org>
++Reviewed-by: Omer Katz <omerkatz@chromium.org>
++Commit-Queue: Michael Lippautz <mlippautz@chromium.org>
++Cr-Commit-Position: refs/heads/main@{#78820}
++---
++ src/heap/cppgc/prefinalizer-handler.h | 1 +
++ 1 file changed, 1 insertion(+)
++
++diff --git a/src/heap/cppgc/prefinalizer-handler.h b/src/heap/cppgc/prefinalizer-handler.h
++index bc17c99b1838..c82c91ff5a45 100644
++--- a/src/heap/cppgc/prefinalizer-handler.h
+++++ b/src/heap/cppgc/prefinalizer-handler.h
++@@ -5,6 +5,7 @@
++ #ifndef V8_HEAP_CPPGC_PREFINALIZER_HANDLER_H_
++ #define V8_HEAP_CPPGC_PREFINALIZER_HANDLER_H_
++ 
+++#include <utility>
++ #include <vector>
++ 
++ #include "include/cppgc/prefinalizer.h"

From aad809b048afbc86c2ffe48461e75f9e6d6fe3fb Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Sun, 1 Dec 2024 17:47:28 +0200
Subject: [PATCH 020/117] Fix issues with prefetch ring buffer resize (#9847)

## Problem

See https://neondb.slack.com/archives/C04DGM6SMTM/p1732110190129479


We observe the following error in the logs
```
[XX000] ERROR: [NEON_SMGR] [shard 3] Incorrect prefetch read: status=1 response=0x7fafef335138 my=128 receive=128
```
most likely caused by changing `neon.readahead_buffer_size`

## Summary of changes

1. Copy shard state
2. Do not use prefetch_set_unused in readahead_buffer_resize
3. Change prefetch buffer overflow criteria

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 pgxn/neon/pagestore_smgr.c                    | 13 +++++-
 .../regress/test_prefetch_buffer_resize.py    | 40 +++++++++++++++++++
 2 files changed, 51 insertions(+), 2 deletions(-)
 create mode 100644 test_runner/regress/test_prefetch_buffer_resize.py

diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index cbb0e2ae6d..a5e0c402fb 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -439,6 +439,8 @@ readahead_buffer_resize(int newsize, void *extra)
 	newPState->ring_unused = newsize;
 	newPState->ring_receive = newsize;
 	newPState->ring_flush = newsize;
+	newPState->max_shard_no = MyPState->max_shard_no;
+	memcpy(newPState->shard_bitmap, MyPState->shard_bitmap, sizeof(MyPState->shard_bitmap));
 
 	/*
 	 * Copy over the prefetches.
@@ -495,7 +497,11 @@ readahead_buffer_resize(int newsize, void *extra)
 
 	for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1)
 	{
-		prefetch_set_unused(end);
+		PrefetchRequest *slot = GetPrfSlot(end);
+		if (slot->status == PRFS_RECEIVED)
+		{
+			pfree(slot->response);
+		}
 	}
 
 	prfh_destroy(MyPState->prf_hash);
@@ -944,6 +950,9 @@ Retry:
 		Assert(entry == NULL);
 		Assert(slot == NULL);
 
+		/* There should be no buffer overflow */
+		Assert(MyPState->ring_last + readahead_buffer_size >= MyPState->ring_unused);
+
 		/*
 		 * If the prefetch queue is full, we need to make room by clearing the
 		 * oldest slot. If the oldest slot holds a buffer that was already
@@ -958,7 +967,7 @@ Retry:
 		 * a prefetch request kind of goes against the principles of
 		 * prefetching)
 		 */
-		if (MyPState->ring_last + readahead_buffer_size - 1 == MyPState->ring_unused)
+		if (MyPState->ring_last + readahead_buffer_size == MyPState->ring_unused)
 		{
 			uint64		cleanup_index = MyPState->ring_last;
 
diff --git a/test_runner/regress/test_prefetch_buffer_resize.py b/test_runner/regress/test_prefetch_buffer_resize.py
new file mode 100644
index 0000000000..7676b78b0e
--- /dev/null
+++ b/test_runner/regress/test_prefetch_buffer_resize.py
@@ -0,0 +1,40 @@
+from __future__ import annotations
+
+import random
+
+import pytest
+from fixtures.neon_fixtures import NeonEnvBuilder
+
+
+@pytest.mark.parametrize("shard_count", [None, 4])
+@pytest.mark.timeout(600)
+def test_prefetch(neon_env_builder: NeonEnvBuilder, shard_count: int | None):
+    if shard_count is not None:
+        neon_env_builder.num_pageservers = shard_count
+    env = neon_env_builder.init_start(
+        initial_tenant_shard_count=shard_count,
+    )
+    n_iter = 10
+    n_rec = 100000
+
+    endpoint = env.endpoints.create_start(
+        "main",
+        config_lines=[
+            "shared_buffers=10MB",
+        ],
+    )
+
+    cur = endpoint.connect().cursor()
+
+    cur.execute("CREATE TABLE t(pk integer, filler text default repeat('?', 200))")
+    cur.execute(f"insert into t (pk) values (generate_series(1,{n_rec}))")
+
+    cur.execute("set statement_timeout=0")
+    cur.execute("set effective_io_concurrency=20")
+    cur.execute("set max_parallel_workers_per_gather=0")
+
+    for _ in range(n_iter):
+        buf_size = random.randrange(16, 32)
+        cur.execute(f"set neon.readahead_buffer_size={buf_size}")
+        limit = random.randrange(1, n_rec)
+        cur.execute(f"select sum(pk) from (select pk from t limit {limit}) s")

From 14853a32846fdf0c571f0f13c9d83315b2974dd6 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Sun, 1 Dec 2024 18:09:58 +0000
Subject: [PATCH 021/117] storcon: don't take any Service locks in /status and
 /ready (#9944)

## Problem

We saw unexpected container terminations when running in k8s with with
small CPU resource requests.

The /status and /ready handlers called `maybe_forward`, which always
takes the lock on Service::inner.

If there is a lot of writer lock contention, and the container is
starved of CPU, this increases the likelihood that we will get killed by
the kubelet.

It isn't certain that this was a cause of issues, but it is a potential
source that we can eliminate.

## Summary of changes

- Revise logic to return immediately if the URL is in the non-forwarded
list, rather than calling maybe_forward
---
 storage_controller/src/http.rs | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index 9b5d4caf31..39e078ba7c 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -1452,10 +1452,15 @@ async fn maybe_forward(req: Request<Body>) -> ForwardOutcome {
     let uri = req.uri().to_string();
     let uri_for_forward = !NOT_FOR_FORWARD.contains(&uri.as_str());
 
+    // Fast return before trying to take any Service locks, if we will never forward anyway
+    if !uri_for_forward {
+        return ForwardOutcome::NotForwarded(req);
+    }
+
     let state = get_state(&req);
     let leadership_status = state.service.get_leadership_status();
 
-    if leadership_status != LeadershipStatus::SteppedDown || !uri_for_forward {
+    if leadership_status != LeadershipStatus::SteppedDown {
         return ForwardOutcome::NotForwarded(req);
     }
 

From 45658ccccba609053f6a27948e066a5de3fd8ab5 Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova <anastasia@neon.tech>
Date: Mon, 2 Dec 2024 10:10:51 +0000
Subject: [PATCH 022/117] Update pgvector to 0.8.0 (#9733)

---
 compute/compute-node.Dockerfile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile
index 9567018053..222a0cb88b 100644
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -358,10 +358,10 @@ COPY compute/patches/pgvector.patch /pgvector.patch
 # because we build the images on different machines than where we run them.
 # Pass OPTFLAGS="" to remove it.
 #
-# vector 0.7.4 supports v17
-# last release v0.7.4 - Aug 5, 2024
-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.4.tar.gz -O pgvector.tar.gz && \
-    echo "0341edf89b1924ae0d552f617e14fb7f8867c0194ed775bcc44fa40288642583 pgvector.tar.gz" | sha256sum --check && \
+# vector >0.7.4 supports v17
+# last release v0.8.0 - Oct 30, 2024
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O pgvector.tar.gz && \
+    echo "867a2c328d4928a5a9d6f052cd3bc78c7d60228a9b914ad32aa3db88e9de27b0 pgvector.tar.gz" | sha256sum --check && \
     mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \
     patch -p1 < /pgvector.patch && \
     make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \

From 533012204913edf56fbcfc723e878b92fea5d93c Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Mon, 2 Dec 2024 12:26:15 +0200
Subject: [PATCH 023/117] test_runner: improve `wait_until` (#9936)

Improves `wait_until` by:

* Use `timeout` instead of `iterations`. This allows changing the
timeout/interval parameters independently.
* Make `timeout` and `interval` optional (default 20s and 0.5s). Most
callers don't care.
* Only output status every 1s by default, and add optional
`status_interval` parameter.
* Remove `show_intermediate_error`, this was always emitted anyway.

Most callers have been updated to use the defaults, except where they
had good reason otherwise.
---
 test_runner/fixtures/neon_fixtures.py         | 14 ++--
 test_runner/fixtures/pageserver/utils.py      | 17 +----
 test_runner/fixtures/safekeeper/http.py       |  2 +-
 test_runner/fixtures/safekeeper/utils.py      |  2 +-
 test_runner/fixtures/utils.py                 | 35 +++++-----
 test_runner/logical_repl/test_clickhouse.py   |  6 +-
 test_runner/logical_repl/test_debezium.py     | 12 +---
 .../performance/test_branch_creation.py       |  8 +--
 .../regress/test_attach_tenant_config.py      |  2 -
 test_runner/regress/test_compaction.py        |  2 +-
 .../regress/test_disk_usage_eviction.py       | 10 +--
 test_runner/regress/test_hot_standby.py       |  6 +-
 .../regress/test_layers_from_future.py        |  2 +-
 test_runner/regress/test_logging.py           |  4 +-
 .../regress/test_logical_replication.py       |  6 +-
 test_runner/regress/test_lsn_mapping.py       |  4 +-
 test_runner/regress/test_neon_superuser.py    |  2 +-
 test_runner/regress/test_ondemand_download.py | 16 ++---
 test_runner/regress/test_pageserver_api.py    | 12 +---
 .../regress/test_pageserver_generations.py    | 14 ++--
 .../test_pageserver_getpage_throttle.py       |  7 +-
 .../regress/test_pageserver_layer_rolling.py  | 16 ++---
 .../regress/test_pageserver_restart.py        |  2 +-
 .../regress/test_pageserver_secondary.py      |  6 +-
 test_runner/regress/test_readonly_node.py     |  2 -
 test_runner/regress/test_remote_storage.py    | 42 ++++++-----
 test_runner/regress/test_replica_start.py     |  2 +-
 test_runner/regress/test_sharding.py          | 18 ++---
 .../regress/test_storage_controller.py        | 70 +++++++++----------
 test_runner/regress/test_storage_scrubber.py  |  2 -
 .../regress/test_subscriber_restart.py        |  2 +-
 test_runner/regress/test_tenant_conf.py       |  6 +-
 test_runner/regress/test_tenant_delete.py     | 14 ++--
 test_runner/regress/test_tenant_detach.py     |  8 +--
 test_runner/regress/test_tenant_relocation.py |  6 +-
 test_runner/regress/test_tenant_size.py       |  4 +-
 test_runner/regress/test_tenant_tasks.py      |  2 +-
 test_runner/regress/test_tenants.py           |  2 +-
 .../test_tenants_with_remote_storage.py       | 12 +---
 test_runner/regress/test_timeline_archive.py  |  6 +-
 test_runner/regress/test_timeline_delete.py   | 61 +++++-----------
 .../regress/test_timeline_detach_ancestor.py  | 30 ++++----
 .../regress/test_timeline_gc_blocking.py      |  2 +-
 test_runner/regress/test_timeline_size.py     | 36 +++++-----
 test_runner/regress/test_wal_acceptor.py      | 22 +++---
 test_runner/regress/test_wal_receiver.py      |  4 +-
 46 files changed, 234 insertions(+), 326 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 9bcfffeb9c..5709a3b82b 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1736,7 +1736,7 @@ class NeonStorageController(MetricsGetter, LogUtils):
         def storage_controller_ready():
             assert self.ready() is True
 
-        wait_until(30, 1, storage_controller_ready)
+        wait_until(storage_controller_ready)
         return time.time() - t1
 
     def attach_hook_issue(
@@ -2574,7 +2574,7 @@ class NeonPageserver(PgProtocol, LogUtils):
             log.info(f"any_unstable={any_unstable}")
             assert not any_unstable
 
-        wait_until(20, 0.5, complete)
+        wait_until(complete)
 
     def __enter__(self) -> Self:
         return self
@@ -3973,7 +3973,7 @@ class Endpoint(PgProtocol, LogUtils):
                 migration_id: int = cur.fetchall()[0][0]
                 assert migration_id >= num_migrations
 
-            wait_until(20, 0.5, check_migrations_done)
+            wait_until(check_migrations_done)
 
     # Mock the extension part of spec passed from control plane for local testing
     # endpooint.rs adds content of this file as a part of the spec.json
@@ -4489,12 +4489,10 @@ class Safekeeper(LogUtils):
             )
             assert stat.remote_consistent_lsn >= lsn and stat.backup_lsn >= lsn.segment_lsn()
 
-        # xxx: max wait is long because we might be waiting for reconnection from
-        # pageserver to this safekeeper
-        wait_until(30, 1, are_lsns_advanced)
+        wait_until(are_lsns_advanced)
         client.checkpoint(tenant_id, timeline_id)
         if wait_wal_removal:
-            wait_until(30, 1, are_segments_removed)
+            wait_until(are_segments_removed)
 
     def wait_until_paused(self, failpoint: str):
         msg = f"at failpoint {failpoint}"
@@ -4503,7 +4501,7 @@ class Safekeeper(LogUtils):
             log.info(f"waiting for hitting failpoint {failpoint}")
             self.assert_log_contains(msg)
 
-        wait_until(20, 0.5, paused)
+        wait_until(paused)
 
 
 class NeonBroker(LogUtils):
diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
index 46700e3fe3..7c10edc5fc 100644
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -13,7 +13,7 @@ from mypy_boto3_s3.type_defs import (
 from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
 from fixtures.log_helper import log
 from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
-from fixtures.remote_storage import RemoteStorage, RemoteStorageKind, S3Storage
+from fixtures.remote_storage import RemoteStorage, S3Storage
 from fixtures.utils import wait_until
 
 if TYPE_CHECKING:
@@ -269,12 +269,7 @@ def wait_timeline_detail_404(
     pageserver_http: PageserverHttpClient,
     tenant_id: TenantId | TenantShardId,
     timeline_id: TimelineId,
-    iterations: int,
-    interval: float | None = None,
 ):
-    if interval is None:
-        interval = 0.25
-
     def timeline_is_missing():
         data = {}
         try:
@@ -287,19 +282,17 @@ def wait_timeline_detail_404(
 
         raise RuntimeError(f"Timeline exists state {data.get('state')}")
 
-    wait_until(iterations, interval, func=timeline_is_missing)
+    wait_until(timeline_is_missing)
 
 
 def timeline_delete_wait_completed(
     pageserver_http: PageserverHttpClient,
     tenant_id: TenantId | TenantShardId,
     timeline_id: TimelineId,
-    iterations: int = 20,
-    interval: float | None = None,
     **delete_args,
 ) -> None:
     pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args)
-    wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations, interval)
+    wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id)
 
 
 # remote_storage must not be None, but that's easier for callers to make mypy happy
@@ -453,7 +446,3 @@ def many_small_layers_tenant_config() -> dict[str, Any]:
         "checkpoint_distance": 1024**2,
         "image_creation_threshold": 100,
     }
-
-
-def poll_for_remote_storage_iterations(remote_storage_kind: RemoteStorageKind) -> int:
-    return 40 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 15
diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py
index 094188c0b5..286f80ba69 100644
--- a/test_runner/fixtures/safekeeper/http.py
+++ b/test_runner/fixtures/safekeeper/http.py
@@ -175,7 +175,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
             assert s > Lsn(0)
             return s
 
-        return wait_until(30, 1, timeline_start_lsn_non_zero)
+        return wait_until(timeline_start_lsn_non_zero)
 
     def get_commit_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn:
         return self.timeline_status(tenant_id, timeline_id).commit_lsn
diff --git a/test_runner/fixtures/safekeeper/utils.py b/test_runner/fixtures/safekeeper/utils.py
index 0246916470..922cdedccc 100644
--- a/test_runner/fixtures/safekeeper/utils.py
+++ b/test_runner/fixtures/safekeeper/utils.py
@@ -19,4 +19,4 @@ def wait_walreceivers_absent(
         log.info(f"waiting for walreceivers to be gone, currently {status.walreceivers}")
         assert len(status.walreceivers) == 0
 
-    wait_until(30, 0.5, walreceivers_absent)
+    wait_until(walreceivers_absent)
diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py
index 04e98fe494..c34ac298d1 100644
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -9,6 +9,7 @@ import tarfile
 import threading
 import time
 from collections.abc import Callable, Iterable
+from datetime import datetime, timedelta
 from hashlib import sha256
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, TypeVar
@@ -380,15 +381,10 @@ def start_in_background(
             if return_code is not None:
                 error = f"expected subprocess to run but it exited with code {return_code}"
             else:
-                attempts = 10
                 try:
-                    wait_until(
-                        number_of_iterations=attempts,
-                        interval=1,
-                        func=is_started,
-                    )
+                    wait_until(is_started, timeout=10)
                 except Exception:
-                    error = f"Failed to get correct status from subprocess in {attempts} attempts"
+                    error = "Failed to get correct status from subprocess"
         except Exception as e:
             error = f"expected subprocess to start but it failed with exception: {e}"
 
@@ -402,28 +398,31 @@ def start_in_background(
 
 
 def wait_until(
-    number_of_iterations: int,
-    interval: float,
     func: Callable[[], WaitUntilRet],
-    show_intermediate_error: bool = False,
+    name: str | None = None,
+    timeout: float = 20.0,  # seconds
+    interval: float = 0.5,  # seconds
+    status_interval: float = 1.0,  # seconds
 ) -> WaitUntilRet:
     """
     Wait until 'func' returns successfully, without exception. Returns the
     last return value from the function.
     """
+    if name is None:
+        name = getattr(func, "__name__", repr(func))
+    deadline = datetime.now() + timedelta(seconds=timeout)
+    next_status = datetime.now()
     last_exception = None
-    for i in range(number_of_iterations):
+    while datetime.now() <= deadline:
         try:
-            res = func()
+            return func()
         except Exception as e:
-            log.info("waiting for %s iteration %s failed: %s", func, i + 1, e)
+            if datetime.now() >= next_status:
+                log.info("waiting for %s: %s", name, e)
+                next_status = datetime.now() + timedelta(seconds=status_interval)
             last_exception = e
-            if show_intermediate_error:
-                log.info(e)
             time.sleep(interval)
-            continue
-        return res
-    raise Exception(f"timed out while waiting for {func}") from last_exception
+    raise Exception(f"timed out while waiting for {name}") from last_exception
 
 
 def assert_eq(a, b) -> None:
diff --git a/test_runner/logical_repl/test_clickhouse.py b/test_runner/logical_repl/test_clickhouse.py
index 8e03bbe5d4..6b522fa46d 100644
--- a/test_runner/logical_repl/test_clickhouse.py
+++ b/test_runner/logical_repl/test_clickhouse.py
@@ -60,24 +60,22 @@ def test_clickhouse(remote_pg: RemotePostgres):
         "SETTINGS materialized_postgresql_tables_list = 'table1';"
     )
     wait_until(
-        120,
-        0.5,
         lambda: query_clickhouse(
             client,
             "select * from db1_postgres.table1 order by 1",
             "ee600d8f7cd05bd0b169fa81f44300a9dd10085a",
         ),
+        timeout=60,
     )
     cur.execute("INSERT INTO table1 (id, column1) VALUES (3, 'ghi'), (4, 'jkl');")
     conn.commit()
     wait_until(
-        120,
-        0.5,
         lambda: query_clickhouse(
             client,
             "select * from db1_postgres.table1 order by 1",
             "9eba2daaf7e4d7d27ac849525f68b562ab53947d",
         ),
+        timeout=60,
     )
     log.debug("Sleeping before final checking if Neon is still alive")
     time.sleep(3)
diff --git a/test_runner/logical_repl/test_debezium.py b/test_runner/logical_repl/test_debezium.py
index d2cb087c92..8023d64d3d 100644
--- a/test_runner/logical_repl/test_debezium.py
+++ b/test_runner/logical_repl/test_debezium.py
@@ -148,14 +148,12 @@ def test_debezium(debezium):
     )
     conn.commit()
     wait_until(
-        100,
-        0.5,
         lambda: get_kafka_msg(
             consumer,
             ts_ms,
             after={"first_name": "John", "last_name": "Dow", "email": "johndow@example.com"},
         ),
-        show_intermediate_error=True,
+        timeout=60,
     )
     ts_ms = time.time() * 1000
     log.info("Insert 2 ts_ms: %s", ts_ms)
@@ -165,28 +163,24 @@ def test_debezium(debezium):
     )
     conn.commit()
     wait_until(
-        100,
-        0.5,
         lambda: get_kafka_msg(
             consumer,
             ts_ms,
             after={"first_name": "Alex", "last_name": "Row", "email": "alexrow@example.com"},
         ),
-        show_intermediate_error=True,
+        timeout=60,
     )
     ts_ms = time.time() * 1000
     log.info("Update ts_ms: %s", ts_ms)
     cur.execute("update inventory.customers set first_name = 'Alexander' where id = 2")
     conn.commit()
     wait_until(
-        100,
-        0.5,
         lambda: get_kafka_msg(
             consumer,
             ts_ms,
             after={"first_name": "Alexander"},
         ),
-        show_intermediate_error=True,
+        timeout=60,
     )
     time.sleep(3)
     cur.execute("select 1")
diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py
index c50c4ad432..3ce27d6cd3 100644
--- a/test_runner/performance/test_branch_creation.py
+++ b/test_runner/performance/test_branch_creation.py
@@ -137,7 +137,7 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape:
     startup_line = "INFO version: git(-env)?:"
 
     # find the first line of the log file so we can find the next start later
-    _, first_start = wait_until(5, 1, lambda: env.pageserver.assert_log_contains(startup_line))
+    _, first_start = wait_until(lambda: env.pageserver.assert_log_contains(startup_line))
 
     # start without gc so we can time compaction with less noise; use shorter
     # period for compaction so it starts earlier
@@ -156,7 +156,7 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape:
     )
 
     _, second_start = wait_until(
-        5, 1, lambda: env.pageserver.assert_log_contains(startup_line, first_start)
+        lambda: env.pageserver.assert_log_contains(startup_line, first_start),
     )
     env.pageserver.quiesce_tenants()
 
@@ -164,8 +164,6 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape:
 
     # wait for compaction to complete, which most likely has already done so multiple times
     msg, _ = wait_until(
-        30,
-        1,
         lambda: env.pageserver.assert_log_contains(
             f".*tenant_id={env.initial_tenant}.*: compaction iteration complete.*", second_start
         ),
@@ -205,7 +203,7 @@ def wait_and_record_startup_metrics(
         assert len(matching) == len(expected_labels)
         return matching
 
-    samples = wait_until(10, 1, metrics_are_filled)
+    samples = wait_until(metrics_are_filled)
 
     for sample in samples:
         phase = sample.labels["phase"]
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index 670c2698f5..45112fd67e 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -64,8 +64,6 @@ def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, N
     )
 
     wait_until(
-        50,
-        0.1,
         lambda: env.pageserver.assert_log_contains(".*Error processing HTTP request: Bad request"),
     )
 
diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index 302a8fd0d1..b6741aed68 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -385,7 +385,7 @@ def test_pageserver_compaction_circuit_breaker(neon_env_builder: NeonEnvBuilder)
 
     # Wait for enough failures to break the circuit breaker
     # This wait is fairly long because we back off on compaction failures, so 5 retries takes ~30s
-    wait_until(60, 1, assert_broken)
+    wait_until(assert_broken, timeout=60)
 
     # Sleep for a while, during which time we expect that compaction will _not_ be retried
     time.sleep(10)
diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py
index 1807511008..05956b5b93 100644
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -211,7 +211,7 @@ class EvictionEnv:
             pageserver.assert_log_contains(".*running mocked statvfs.*")
 
         # we most likely have already completed multiple runs
-        wait_until(10, 1, statvfs_called)
+        wait_until(statvfs_called)
 
 
 def count_layers_per_tenant(
@@ -772,14 +772,14 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv):
     )
 
     wait_until(
-        10, 1, lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved")
+        lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved")
     )
 
     def less_than_max_usage_pct():
         post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)
         assert post_eviction_total_size < 0.33 * total_size, "we requested max 33% usage"
 
-    wait_until(2, 2, less_than_max_usage_pct)
+    wait_until(less_than_max_usage_pct, timeout=5)
 
     # Disk usage candidate collection only takes into account active tenants.
     # However, the statvfs call takes into account the entire tenants directory,
@@ -825,7 +825,7 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv):
     )
 
     wait_until(
-        10, 1, lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved")
+        lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved"),
     )
 
     def more_than_min_avail_bytes_freed():
@@ -834,7 +834,7 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv):
             total_size - post_eviction_total_size >= min_avail_bytes
         ), f"we requested at least {min_avail_bytes} worth of free space"
 
-    wait_until(2, 2, more_than_min_avail_bytes_freed)
+    wait_until(more_than_min_avail_bytes_freed, timeout=5)
 
 
 def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv):
diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py
index 0b1ac11c16..4044f25b37 100644
--- a/test_runner/regress/test_hot_standby.py
+++ b/test_runner/regress/test_hot_standby.py
@@ -257,7 +257,7 @@ def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
             # Wait until we see that the pgbench_accounts is created + filled on replica *and*
             # index is created. Otherwise index creation would conflict with
             # read queries and hs feedback won't save us.
-            wait_until(60, 1.0, partial(pgbench_accounts_initialized, secondary))
+            wait_until(partial(pgbench_accounts_initialized, secondary), timeout=60)
 
             # Test should fail if hs feedback is disabled anyway, but cross
             # check that walproposer sets some xmin.
@@ -269,7 +269,7 @@ def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
                 log.info(f"xmin is {slot_xmin}")
                 assert int(slot_xmin) > 0
 
-            wait_until(10, 1.0, xmin_is_not_null)
+            wait_until(xmin_is_not_null)
             for _ in range(1, 5):
                 # in debug mode takes about 5-7s
                 balance = secondary.safe_psql_scalar("select sum(abalance) from pgbench_accounts")
@@ -286,7 +286,7 @@ def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
             log.info(f"xmin is {slot_xmin}")
             assert slot_xmin is None
 
-        wait_until(10, 1.0, xmin_is_null)
+        wait_until(xmin_is_null)
 
 
 # Test race condition between WAL replay and backends performing queries
diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py
index 761ec7568f..8818b40712 100644
--- a/test_runner/regress/test_layers_from_future.py
+++ b/test_runner/regress/test_layers_from_future.py
@@ -206,7 +206,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder, attach_mode: str):
         future_layers = set(get_future_layers())
         assert future_layer not in future_layers
 
-    wait_until(10, 0.5, future_layer_is_gone_from_index_part)
+    wait_until(future_layer_is_gone_from_index_part)
 
     # We already make deletion stuck here, but we don't necessarily hit the failpoint
     # because deletions are batched.
diff --git a/test_runner/regress/test_logging.py b/test_runner/regress/test_logging.py
index f6fbdcabfd..d94c786f49 100644
--- a/test_runner/regress/test_logging.py
+++ b/test_runner/regress/test_logging.py
@@ -37,7 +37,7 @@ def test_logging_event_count(neon_env_builder: NeonEnvBuilder, level: str):
             return
         env.pageserver.assert_log_contains(f".*{msg_id}.*")
 
-    wait_until(10, 0.5, assert_logged)
+    wait_until(assert_logged)
 
     # make sure it's counted
     def assert_metric_value():
@@ -49,4 +49,4 @@ def test_logging_event_count(neon_env_builder: NeonEnvBuilder, level: str):
         log.info("libmetrics_tracing_event_count: %s", val)
         assert val > (before or 0.0)
 
-    wait_until(10, 1, assert_metric_value)
+    wait_until(assert_metric_value)
diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py
index ba471b7147..db18e1758c 100644
--- a/test_runner/regress/test_logical_replication.py
+++ b/test_runner/regress/test_logical_replication.py
@@ -207,7 +207,7 @@ def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgre
     log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}")
     vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1")
 
-    wait_until(number_of_iterations=10, interval=2, func=partial(slot_removed, endpoint))
+    wait_until(partial(slot_removed, endpoint))
 
 
 def test_ondemand_wal_download_in_replication_slot_funcs(neon_env_builder: NeonEnvBuilder):
@@ -519,7 +519,7 @@ def test_replication_shutdown(neon_simple_env: NeonEnv):
             assert len(res) == 4
             assert [r[0] for r in res] == [10, 20, 30, 40]
 
-        wait_until(10, 0.5, check_that_changes_propagated)
+        wait_until(check_that_changes_propagated)
 
 
 def logical_replication_wait_flush_lsn_sync(publisher: PgProtocol) -> Lsn:
@@ -549,7 +549,7 @@ select sent_lsn, flush_lsn, pg_current_wal_flush_lsn() from pg_stat_replication
         )
         assert flush_lsn >= publisher_flush_lsn
 
-    wait_until(30, 0.5, check_caughtup)
+    wait_until(check_caughtup)
     return publisher_flush_lsn
 
 
diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py
index 7f0b541128..e42e71646d 100644
--- a/test_runner/regress/test_lsn_mapping.py
+++ b/test_runner/regress/test_lsn_mapping.py
@@ -169,7 +169,7 @@ def test_get_lsn_by_timestamp_cancelled(neon_env_builder: NeonEnvBuilder):
         )
 
         _, offset = wait_until(
-            20, 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}")
+            lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}")
         )
 
         with pytest.raises(ReadTimeout):
@@ -178,8 +178,6 @@ def test_get_lsn_by_timestamp_cancelled(neon_env_builder: NeonEnvBuilder):
         client.configure_failpoints((failpoint, "off"))
 
         _, offset = wait_until(
-            20,
-            0.5,
             lambda: env.pageserver.assert_log_contains(
                 "Cancelled request finished with an error: Cancelled$", offset
             ),
diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py
index 7118127a1f..49cd91906f 100644
--- a/test_runner/regress/test_neon_superuser.py
+++ b/test_runner/regress/test_neon_superuser.py
@@ -77,7 +77,7 @@ def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion):
             assert len(res) == 4
             assert [r[0] for r in res] == [10, 20, 30, 40]
 
-        wait_until(10, 0.5, check_that_changes_propagated)
+        wait_until(check_that_changes_propagated)
 
         # Test that pg_monitor is working for neon_superuser role
         cur.execute("SELECT query from pg_stat_activity LIMIT 1")
diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py
index e1caaeb6c1..028d1c2e49 100644
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -256,7 +256,7 @@ def test_ondemand_download_timetravel(neon_env_builder: NeonEnvBuilder):
     ##### Second start, restore the data and ensure it's the same
     env.pageserver.start()
 
-    wait_until(10, 0.2, lambda: assert_tenant_state(client, tenant_id, "Active"))
+    wait_until(lambda: assert_tenant_state(client, tenant_id, "Active"))
 
     # The current_physical_size reports the sum of layers loaded in the layer
     # map, regardless of where the layer files are located. So even though we
@@ -413,7 +413,7 @@ def test_download_remote_layers_api(
         ]
     )
 
-    wait_until(10, 0.2, lambda: assert_tenant_state(client, tenant_id, "Active"))
+    wait_until(lambda: assert_tenant_state(client, tenant_id, "Active"))
 
     ###### Phase 1: exercise download error code path
 
@@ -705,7 +705,7 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu
         )
 
         _, offset = wait_until(
-            20, 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}")
+            lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}")
         )
 
         location_conf = {"mode": "Detached", "tenant_conf": {}}
@@ -713,8 +713,6 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu
         detach = exec.submit(client.tenant_location_conf, env.initial_tenant, location_conf)
 
         _, offset = wait_until(
-            20,
-            0.5,
             lambda: env.pageserver.assert_log_contains(
                 "closing is taking longer than expected", offset
             ),
@@ -734,8 +732,6 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu
         client.configure_failpoints((failpoint, "pause"))
 
         _, offset = wait_until(
-            20,
-            0.5,
             lambda: env.pageserver.assert_log_contains(f"cfg failpoint: {failpoint} pause", offset),
         )
 
@@ -750,8 +746,6 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu
         warmup = exec.submit(client.tenant_secondary_download, env.initial_tenant, wait_ms=30000)
 
         _, offset = wait_until(
-            20,
-            0.5,
             lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}", offset),
         )
 
@@ -805,7 +799,7 @@ def test_layer_download_timeouted(neon_env_builder: NeonEnvBuilder):
         )
 
         _, offset = wait_until(
-            20, 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}")
+            lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}")
         )
         # ensure enough time while paused to trip the timeout
         time.sleep(2)
@@ -824,8 +818,6 @@ def test_layer_download_timeouted(neon_env_builder: NeonEnvBuilder):
 
         # capture the next offset for a new synchronization with the failpoint
         _, offset = wait_until(
-            20,
-            0.5,
             lambda: env.pageserver.assert_log_contains(f"cfg failpoint: {failpoint} pause", offset),
         )
 
diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py
index 05e81b82e0..55fd7a8608 100644
--- a/test_runner/regress/test_pageserver_api.py
+++ b/test_runner/regress/test_pageserver_api.py
@@ -117,19 +117,11 @@ def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv):
         # We need to wait here because it's possible that we don't have access to
         # the latest WAL yet, when the `timeline_detail` API is first called.
         # See: https://github.com/neondatabase/neon/issues/1768.
-        lsn = wait_until(
-            number_of_iterations=5,
-            interval=1,
-            func=lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, None),
-        )
+        lsn = wait_until(lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, None))
 
         # Make a DB modification then expect getting a new WAL receiver's data.
         endpoint.safe_psql("INSERT INTO t VALUES (1, 'hey')")
-        wait_until(
-            number_of_iterations=5,
-            interval=1,
-            func=lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, lsn),
-        )
+        wait_until(lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, lsn))
 
 
 def test_pageserver_http_api_client(neon_simple_env: NeonEnv):
diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py
index 6ba5753420..7e5bb45242 100644
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -352,7 +352,7 @@ def test_deletion_queue_recovery(
         def assert_some_validations():
             assert get_deletion_queue_validated(ps_http) > 0
 
-        wait_until(20, 1, assert_some_validations)
+        wait_until(assert_some_validations)
 
         # The validatated keys statistic advances before the header is written, so we
         # also wait to see the header hit the disk: this seems paranoid but the race
@@ -360,7 +360,7 @@ def test_deletion_queue_recovery(
         def assert_header_written():
             assert (main_pageserver.workdir / "deletion" / "header-01").exists()
 
-        wait_until(20, 1, assert_header_written)
+        wait_until(assert_header_written)
 
         # If we will lose attachment, then our expectation on restart is that only the ones
         # we already validated will execute.  Act like only those were present in the queue.
@@ -382,11 +382,11 @@ def test_deletion_queue_recovery(
     # After restart, issue a flush to kick the deletion frontend to do recovery.
     # It should recover all the operations we submitted before the restart.
     ps_http.deletion_queue_flush(execute=False)
-    wait_until(20, 0.25, lambda: assert_deletions_submitted(before_restart_depth))
+    wait_until(lambda: assert_deletions_submitted(before_restart_depth))
 
     # The queue should drain through completely if we flush it
     ps_http.deletion_queue_flush(execute=True)
-    wait_until(10, 1, lambda: assert_deletion_queue(ps_http, lambda n: n == 0))
+    wait_until(lambda: assert_deletion_queue(ps_http, lambda n: n == 0))
 
     if keep_attachment == KeepAttachment.KEEP:
         # - If we kept the attachment, then our pre-restart deletions should execute
@@ -564,7 +564,7 @@ def test_multi_attach(
     )
 
     # Initially, the tenant will be attached to the first pageserver (first is default in our test harness)
-    wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[0], tenant_id, "Active"))
+    wait_until(lambda: assert_tenant_state(http_clients[0], tenant_id, "Active"))
     _detail = http_clients[0].timeline_detail(tenant_id, timeline_id)
     with pytest.raises(PageserverApiException):
         http_clients[1].timeline_detail(tenant_id, timeline_id)
@@ -579,8 +579,8 @@ def test_multi_attach(
     pageservers[1].tenant_attach(env.initial_tenant)
     pageservers[2].tenant_attach(env.initial_tenant)
 
-    wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[1], tenant_id, "Active"))
-    wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[2], tenant_id, "Active"))
+    wait_until(lambda: assert_tenant_state(http_clients[1], tenant_id, "Active"))
+    wait_until(lambda: assert_tenant_state(http_clients[2], tenant_id, "Active"))
 
     # Now they all have it attached
     _details = list([c.timeline_detail(tenant_id, timeline_id) for c in http_clients])
diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py
index f1aad85fe9..ba6a1d9045 100644
--- a/test_runner/regress/test_pageserver_getpage_throttle.py
+++ b/test_runner/regress/test_pageserver_getpage_throttle.py
@@ -81,9 +81,7 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
 
     marker = uuid.uuid4().hex
     ps_http.post_tracing_event("info", marker)
-    _, marker_offset = wait_until(
-        10, 0.5, lambda: env.pageserver.assert_log_contains(marker, offset=None)
-    )
+    _, marker_offset = wait_until(lambda: env.pageserver.assert_log_contains(marker, offset=None))
 
     log.info("run pagebench")
     duration_secs = 10
@@ -103,12 +101,11 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
     log.info("validate that we logged the throttling")
 
     wait_until(
-        10,
-        compaction_period / 10,
         lambda: env.pageserver.assert_log_contains(
             f".*{tenant_id}.*shard was throttled in the last n_seconds.*",
             offset=marker_offset,
         ),
+        timeout=compaction_period,
     )
 
     log.info("validate that the metric doesn't include throttle wait time")
diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py
index f6a7bfa1ad..706da1e35e 100644
--- a/test_runner/regress/test_pageserver_layer_rolling.py
+++ b/test_runner/regress/test_pageserver_layer_rolling.py
@@ -84,7 +84,7 @@ def wait_for_wal_ingest_metric(pageserver_http: PageserverHttpClient) -> float:
     # The metric gets initialised on the first update.
     # Retry a few times, but return 0 if it's stable.
     try:
-        return float(wait_until(3, 0.5, query))
+        return float(wait_until(query, timeout=2, interval=0.5))
     except Exception:
         return 0
 
@@ -131,7 +131,7 @@ def test_pageserver_small_inmemory_layers(
     wait_until_pageserver_is_caught_up(env, last_flush_lsns)
 
     # We didn't write enough data to trigger a size-based checkpoint: we should see dirty data.
-    wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env))
+    wait_until(lambda: assert_dirty_bytes_nonzero(env))
 
     ps_http_client = env.pageserver.http_client()
     total_wal_ingested_before_restart = wait_for_wal_ingest_metric(ps_http_client)
@@ -139,7 +139,7 @@ def test_pageserver_small_inmemory_layers(
     # Within ~ the checkpoint interval, all the ephemeral layers should be frozen and flushed,
     # such that there are zero bytes of ephemeral layer left on the pageserver
     log.info("Waiting for background checkpoints...")
-    wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0))
+    wait_until(lambda: assert_dirty_bytes(env, 0), timeout=2 * CHECKPOINT_TIMEOUT_SECONDS)
 
     # Zero ephemeral layer bytes does not imply that all the frozen layers were uploaded: they
     # must be uploaded to remain visible to the pageserver after restart.
@@ -180,7 +180,7 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder):
     wait_until_pageserver_is_caught_up(env, last_flush_lsns)
 
     # We didn't write enough data to trigger a size-based checkpoint: we should see dirty data.
-    wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env))
+    wait_until(lambda: assert_dirty_bytes_nonzero(env))
 
     # Stop the safekeepers, so that we cannot have any more WAL receiver connections
     for sk in env.safekeepers:
@@ -193,7 +193,7 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder):
     # Within ~ the checkpoint interval, all the ephemeral layers should be frozen and flushed,
     # such that there are zero bytes of ephemeral layer left on the pageserver
     log.info("Waiting for background checkpoints...")
-    wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0))
+    wait_until(lambda: assert_dirty_bytes(env, 0), timeout=2 * CHECKPOINT_TIMEOUT_SECONDS)
 
     # The code below verifies that we do not flush on the first write
     # after an idle period longer than the checkpoint timeout.
@@ -210,7 +210,7 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder):
         run_worker_for_tenant(env, 5, tenant_with_extra_writes, offset=ENTRIES_PER_TIMELINE)
     )
 
-    dirty_after_write = wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env))
+    dirty_after_write = wait_until(lambda: assert_dirty_bytes_nonzero(env))
 
     # We shouldn't flush since we've just opened a new layer
     waited_for = 0
@@ -305,11 +305,11 @@ def test_total_size_limit(neon_env_builder: NeonEnvBuilder):
     # Wait until enough layers have rolled that the amount of dirty data is under the threshold.
     # We do this indirectly via layer maps, rather than the dirty bytes metric, to avoid false-passing
     # if that metric isn't updated quickly enough to reflect the dirty bytes exceeding the limit.
-    wait_until(compaction_period_s * 2, 1, assert_bytes_rolled)
+    wait_until(assert_bytes_rolled, timeout=2 * compaction_period_s)
 
     # The end state should also have the reported metric under the limit
     def assert_dirty_data_limited():
         dirty_bytes = get_dirty_bytes(env)
         assert dirty_bytes < max_dirty_data
 
-    wait_until(compaction_period_s * 2, 1, lambda: assert_dirty_data_limited())
+    wait_until(lambda: assert_dirty_data_limited(), timeout=2 * compaction_period_s)
diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py
index 4bf5705517..835ccbd5d4 100644
--- a/test_runner/regress/test_pageserver_restart.py
+++ b/test_runner/regress/test_pageserver_restart.py
@@ -103,7 +103,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
 
         raise AssertionError("No 'complete' metric yet")
 
-    wait_until(30, 1.0, assert_complete)
+    wait_until(assert_complete)
 
     # Expectation callbacks: arg t is sample value, arg p is the previous phase's sample value
     expectations = [
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index a264f4d3c9..1292682f9e 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -356,7 +356,7 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder):
         )
         assert destination_lsn >= origin_lsn
 
-    wait_until(100, 0.1, caught_up)
+    wait_until(caught_up)
 
     # The destination should accept writes
     workload.churn_rows(64, pageserver_b.id)
@@ -411,7 +411,7 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder):
         assert submitted is not None
         assert submitted > 0
 
-    wait_until(10, 0.1, blocked_deletions_drained)
+    wait_until(blocked_deletions_drained)
 
     workload.churn_rows(64, pageserver_b.id)
     workload.validate(pageserver_b.id)
@@ -702,7 +702,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
         else:
             timeout = int(deadline - now) + 1
             try:
-                wait_until(timeout, 1, lambda: pageserver.assert_log_contains(expression))
+                wait_until(lambda: pageserver.assert_log_contains(expression), timeout=timeout)
             except:
                 log.error(f"Timed out waiting for '{expression}'")
                 raise
diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py
index 70d558ac5a..c13bea7ee1 100644
--- a/test_runner/regress/test_readonly_node.py
+++ b/test_runner/regress/test_readonly_node.py
@@ -215,8 +215,6 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
 
         # wait for lease renewal before running query.
         _, offset = wait_until(
-            20,
-            0.5,
             lambda: ep_static.assert_log_contains(
                 "lsn_lease_bg_task.*Request succeeded", offset=offset
             ),
diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index 137e75f784..76a42ef4a2 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -300,9 +300,9 @@ def test_remote_storage_upload_queue_retries(
     print_gc_result(gc_result)
     assert gc_result["layers_removed"] > 0
 
-    wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0))
-    wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0))
-    wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0))
+    wait_until(lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0))
+    wait_until(lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0))
+    wait_until(lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0))
 
     # let all future operations queue up
     configure_storage_sync_failpoints("return")
@@ -333,16 +333,28 @@ def test_remote_storage_upload_queue_retries(
     # wait for churn thread's data to get stuck in the upload queue
     # Exponential back-off in upload queue, so, gracious timeouts.
 
-    wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0))
-    wait_until(30, 1, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 1))
-    wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0))
+    wait_until(
+        lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0), timeout=30
+    )
+    wait_until(
+        lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 1), timeout=30
+    )
+    wait_until(
+        lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0), timeout=30
+    )
 
     # unblock churn operations
     configure_storage_sync_failpoints("off")
 
-    wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0))
-    wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0))
-    wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0))
+    wait_until(
+        lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0), timeout=30
+    )
+    wait_until(
+        lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0), timeout=30
+    )
+    wait_until(
+        lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0), timeout=30
+    )
 
     # The churn thread doesn't make progress once it blocks on the first wait_completion() call,
     # so, give it some time to wrap up.
@@ -580,7 +592,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
             > 0
         )
 
-    wait_until(200, 0.1, assert_compacted_and_uploads_queued)
+    wait_until(assert_compacted_and_uploads_queued)
 
     # Regardless, give checkpoint some time to block for good.
     # Not strictly necessary, but might help uncover failure modes in the future.
@@ -598,9 +610,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
         ]
     )
 
-    # Generous timeout, because currently deletions can get blocked waiting for compaction
-    # This can be reduced when https://github.com/neondatabase/neon/issues/4998 is fixed.
-    timeline_delete_wait_completed(client, tenant_id, timeline_id, iterations=30, interval=1)
+    timeline_delete_wait_completed(client, tenant_id, timeline_id)
 
     assert not timeline_path.exists()
 
@@ -826,22 +836,16 @@ def wait_upload_queue_empty(
     client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId
 ):
     wait_until(
-        2,
-        1,
         lambda: assert_eq(
             get_queued_count(client, tenant_id, timeline_id, file_kind="layer", op_kind="upload"), 0
         ),
     )
     wait_until(
-        2,
-        1,
         lambda: assert_eq(
             get_queued_count(client, tenant_id, timeline_id, file_kind="index", op_kind="upload"), 0
         ),
     )
     wait_until(
-        2,
-        1,
         lambda: assert_eq(
             get_queued_count(client, tenant_id, timeline_id, file_kind="layer", op_kind="delete"), 0
         ),
diff --git a/test_runner/regress/test_replica_start.py b/test_runner/regress/test_replica_start.py
index 8e7c01f950..e2a22cc769 100644
--- a/test_runner/regress/test_replica_start.py
+++ b/test_runner/regress/test_replica_start.py
@@ -378,7 +378,7 @@ def test_replica_too_many_known_assigned_xids(neon_simple_env: NeonEnv):
             return None
         raise RuntimeError("connection succeeded")
 
-    wait_until(20, 0.5, check_replica_crashed)
+    wait_until(check_replica_crashed)
     assert secondary.log_contains("too many KnownAssignedXids")
 
     # Replica is crashed, so ignore stop result
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 411574bd86..c86ba0d4ea 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -836,7 +836,7 @@ def test_sharding_split_stripe_size(
         assert len(notifications) == 3
         assert notifications[2] == expect_after
 
-    wait_until(10, 1, assert_restart_notification)
+    wait_until(assert_restart_notification)
 
 
 # The quantity of data isn't huge, but debug can be _very_ slow, and the things we're
@@ -1025,7 +1025,7 @@ def test_sharding_ingest_gaps(
             assert Lsn(timeline_detail["disk_consistent_lsn"]) >= expect_lsn
 
     # We set a short checkpoint timeout: expect things to get frozen+flushed within that
-    wait_until(checkpoint_interval_secs * 3, 1, assert_all_disk_consistent)
+    wait_until(assert_all_disk_consistent, timeout=3 * checkpoint_interval_secs)
 
     def assert_all_remote_consistent():
         """
@@ -1037,7 +1037,7 @@ def test_sharding_ingest_gaps(
             assert Lsn(timeline_detail["remote_consistent_lsn"]) >= expect_lsn
 
     # We set a short checkpoint timeout: expect things to get frozen+flushed within that
-    wait_until(checkpoint_interval_secs * 3, 1, assert_all_remote_consistent)
+    wait_until(assert_all_remote_consistent, timeout=3 * checkpoint_interval_secs)
 
     workload.validate()
 
@@ -1405,14 +1405,14 @@ def test_sharding_split_failures(
         #   e.g. while waiting for a storage controller to re-attach a parent shard if we failed
         #   inside the pageserver and the storage controller responds by detaching children and attaching
         #   parents concurrently (https://github.com/neondatabase/neon/issues/7148)
-        wait_until(10, 1, lambda: workload.churn_rows(10, upload=False, ingest=False))
+        wait_until(lambda: workload.churn_rows(10, upload=False, ingest=False))
 
         workload.validate()
 
     if failure.fails_forward(env):
         log.info("Fail-forward failure, checking split eventually completes...")
         # A failure type which results in eventual completion of the split
-        wait_until(30, 1, assert_split_done)
+        wait_until(assert_split_done)
     elif failure.can_mitigate():
         log.info("Mitigating failure...")
         # Mitigation phase: we expect to be able to proceed with a successful shard split
@@ -1420,21 +1420,21 @@ def test_sharding_split_failures(
 
         # The split should appear to be rolled back from the point of view of all pageservers
         # apart from the one that is offline
-        wait_until(30, 1, lambda: assert_rolled_back(exclude_ps_id=failure.pageserver_id))
+        wait_until(lambda: assert_rolled_back(exclude_ps_id=failure.pageserver_id))
 
         finish_split()
-        wait_until(30, 1, lambda: assert_split_done(exclude_ps_id=failure.pageserver_id))
+        wait_until(lambda: assert_split_done(exclude_ps_id=failure.pageserver_id))
 
         # Having cleared the failure, everything should converge to a pristine state
         failure.clear(env)
-        wait_until(30, 1, assert_split_done)
+        wait_until(assert_split_done)
     else:
         # Once we restore the faulty pageserver's API to good health, rollback should
         # eventually complete.
         log.info("Clearing failure...")
         failure.clear(env)
 
-        wait_until(30, 1, assert_rolled_back)
+        wait_until(assert_rolled_back)
 
         # Having rolled back, the tenant should be working
         workload.churn_rows(10)
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 13bc54a114..e93e251b4f 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -154,7 +154,7 @@ def test_storage_controller_smoke(neon_env_builder: NeonEnvBuilder, combination)
         counts = get_node_shard_counts(env, tenant_ids)
         assert counts[node_id] == 0
 
-    wait_until(10, 1, lambda: node_evacuated(env.pageservers[0].id))
+    wait_until(lambda: node_evacuated(env.pageservers[0].id))
 
     # Let all the reconciliations after marking the node offline complete
     env.storage_controller.reconcile_until_idle()
@@ -222,7 +222,7 @@ def test_node_status_after_restart(
     def is_ready():
         assert env.storage_controller.ready() is True
 
-    wait_until(30, 1, is_ready)
+    wait_until(is_ready)
 
     # We loaded nodes from database on restart
     nodes = env.storage_controller.node_list()
@@ -606,7 +606,7 @@ def test_storage_controller_compute_hook(
         counts = get_node_shard_counts(env, [env.initial_tenant])
         assert counts[node_id] == 0
 
-    wait_until(10, 1, lambda: node_evacuated(env.pageservers[0].id))
+    wait_until(lambda: node_evacuated(env.pageservers[0].id))
 
     # Additional notification from migration
     log.info(f"notifications: {notifications}")
@@ -620,7 +620,7 @@ def test_storage_controller_compute_hook(
         assert len(notifications) == 2
         assert notifications[1] == expect
 
-    wait_until(20, 0.25, received_migration_notification)
+    wait_until(received_migration_notification)
 
     # When we restart, we should re-emit notifications for all tenants
     env.storage_controller.stop()
@@ -630,7 +630,7 @@ def test_storage_controller_compute_hook(
         assert len(notifications) == 3
         assert notifications[2] == expect
 
-    wait_until(10, 1, received_restart_notification)
+    wait_until(received_restart_notification)
 
     # Splitting a tenant should cause its stripe size to become visible in the compute notification
     env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=2)
@@ -647,7 +647,7 @@ def test_storage_controller_compute_hook(
         assert len(notifications) == 4
         assert notifications[3] == expect
 
-    wait_until(10, 1, received_split_notification)
+    wait_until(received_split_notification)
 
     # If the compute hook is unavailable, that should not block creating a tenant and
     # creating a timeline.  This simulates a control plane refusing to accept notifications
@@ -736,7 +736,7 @@ def test_storage_controller_stuck_compute_hook(
         def logged_stuck():
             env.storage_controller.assert_log_contains(NOTIFY_BLOCKED_LOG)
 
-        wait_until(10, 0.25, logged_stuck)
+        wait_until(logged_stuck)
         contains_r = env.storage_controller.log_contains(NOTIFY_BLOCKED_LOG)
         assert contains_r is not None  # Appease mypy
         (_, log_cursor) = contains_r
@@ -764,7 +764,7 @@ def test_storage_controller_stuck_compute_hook(
         def logged_stuck_again():
             env.storage_controller.assert_log_contains(NOTIFY_BLOCKED_LOG, offset=log_cursor)
 
-        wait_until(10, 0.25, logged_stuck_again)
+        wait_until(logged_stuck_again)
         assert migrate_fut.running()
 
         # This time, the compute hook remains stuck, but we mark the origin node offline: this should
@@ -865,7 +865,7 @@ def test_storage_controller_compute_hook_revert(
         assert latest["shards"] is not None
         assert latest["shards"][0]["node_id"] == ps_id
 
-    wait_until(30, 1, lambda: notified_ps(pageserver_a.id))
+    wait_until(lambda: notified_ps(pageserver_a.id))
 
     env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG)
     env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS)
@@ -880,7 +880,7 @@ def test_storage_controller_compute_hook_revert(
 
     # Although the migration API failed, the hook should still see pageserver B (it remembers what
     # was posted even when returning an error code)
-    wait_until(30, 1, lambda: notified_ps(pageserver_b.id))
+    wait_until(lambda: notified_ps(pageserver_b.id))
 
     # Although the migration API failed, the tenant should still have moved to the right pageserver
     assert len(pageserver_b.http_client().tenant_list()) == 1
@@ -898,7 +898,7 @@ def test_storage_controller_compute_hook_revert(
     def logged_giving_up():
         env.storage_controller.assert_log_contains(".*Giving up on compute notification.*")
 
-    wait_until(30, 1, logged_giving_up)
+    wait_until(logged_giving_up)
 
     pageserver_a.start()
 
@@ -919,7 +919,7 @@ def test_storage_controller_compute_hook_revert(
     handle_params["status"] = 200
     env.storage_controller.tenant_shard_migrate(tenant_shard_id, pageserver_a.id)
 
-    wait_until(30, 1, lambda: notified_ps(pageserver_a.id))
+    wait_until(lambda: notified_ps(pageserver_a.id))
 
 
 def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder):
@@ -1453,7 +1453,7 @@ def test_storage_controller_heartbeats(
         # Check that each node got one tenant
         assert all(len(ts) == 1 for ts in node_to_tenants.values())
 
-    wait_until(10, 1, tenants_placed)
+    wait_until(tenants_placed)
 
     # ... then we apply the failure
     offline_node_ids = set(failure.nodes())
@@ -1476,7 +1476,7 @@ def test_storage_controller_heartbeats(
                 assert node["availability"] == "Offline"
 
     start = time.time()
-    wait_until(failure.offline_timeout, 1, nodes_offline)
+    wait_until(nodes_offline, timeout=failure.offline_timeout)
     detected_after = time.time() - start
     log.info(f"Detected node failures after {detected_after}s")
 
@@ -1497,7 +1497,7 @@ def test_storage_controller_heartbeats(
 
         assert observed_tenants == set(tenant_ids)
 
-    wait_until(10, 1, tenant_migrated)
+    wait_until(tenant_migrated)
 
     # ... then we clear the failure
     failure.clear(env)
@@ -1509,7 +1509,7 @@ def test_storage_controller_heartbeats(
             if node["id"] in online_node_ids:
                 assert node["availability"] == "Active"
 
-    wait_until(10, 1, nodes_online)
+    wait_until(nodes_online)
 
     time.sleep(5)
 
@@ -1562,7 +1562,7 @@ def test_storage_controller_re_attach(neon_env_builder: NeonEnvBuilder):
     # We could pre-empty this by configuring the node to Offline, but it's preferable to test
     # the realistic path we would take when a node restarts uncleanly.
     # The delay here will be ~NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL in neon_local
-    wait_until(30, 1, failed_over)
+    wait_until(failed_over)
 
     reconciles_before_restart = env.storage_controller.get_metric_value(
         "storage_controller_reconcile_complete_total", filter={"status": "ok"}
@@ -1640,12 +1640,12 @@ def test_storage_controller_shard_scheduling_policy(neon_env_builder: NeonEnvBui
         assert e > n
         return e
 
-    errs = wait_until(10, 1, lambda: assert_errors_gt(0))
+    errs = wait_until(lambda: assert_errors_gt(0))
 
     # Try reconciling again, it should fail again
     with pytest.raises(StorageControllerApiException):
         env.storage_controller.reconcile_all()
-    errs = wait_until(10, 1, lambda: assert_errors_gt(errs))
+    errs = wait_until(lambda: assert_errors_gt(errs))
 
     # Configure the tenant to disable reconciles
     env.storage_controller.tenant_policy_update(
@@ -1674,7 +1674,7 @@ def test_storage_controller_shard_scheduling_policy(neon_env_builder: NeonEnvBui
         return o
 
     # We should see a successful reconciliation
-    wait_until(10, 1, lambda: assert_ok_gt(0))
+    wait_until(lambda: assert_ok_gt(0))
 
     # And indeed the tenant should be attached
     assert len(env.pageserver.http_client().tenant_list_locations()["tenant_shards"]) == 1
@@ -2073,7 +2073,7 @@ def test_skip_drain_on_secondary_lag(neon_env_builder: NeonEnvBuilder, pg_bin: P
             raise Exception(f"Secondary lag not big enough: {lag}")
 
     log.info(f"Looking for lag to develop on the secondary {secondary}")
-    wait_until(10, 1, secondary_is_lagging)
+    wait_until(secondary_is_lagging)
 
     log.info(f"Starting drain of primary {primary} with laggy secondary {secondary}")
     env.storage_controller.retryable_node_operation(
@@ -2107,7 +2107,7 @@ def test_skip_drain_on_secondary_lag(neon_env_builder: NeonEnvBuilder, pg_bin: P
         if lag > 1 * 1024 * 1024:
             raise Exception(f"Secondary lag not big enough: {lag}")
 
-    wait_until(10, 1, lag_is_acceptable)
+    wait_until(lag_is_acceptable)
 
     env.storage_controller.node_configure(primary, {"scheduling": "Active"})
 
@@ -2227,7 +2227,7 @@ def test_storage_controller_node_deletion(
             log.info(f"Shards on nodes other than on victim: {elsewhere}")
             assert elsewhere == tenant_count * shard_count_per_tenant
 
-        wait_until(30, 1, assert_shards_migrated)
+        wait_until(assert_shards_migrated)
 
     log.info(f"Deleting pageserver {victim.id}")
     env.storage_controller.node_delete(victim.id)
@@ -2240,7 +2240,7 @@ def test_storage_controller_node_deletion(
             log.info(f"Shards on node {victim.id}: {count}")
             assert count == 0
 
-        wait_until(30, 1, assert_victim_evacuated)
+        wait_until(assert_victim_evacuated)
 
     # The node should be gone from the list API
     assert victim.id not in [n["id"] for n in env.storage_controller.node_list()]
@@ -2569,7 +2569,7 @@ def test_storage_controller_leadership_transfer(
                 == StorageControllerLeadershipStatus.STEPPED_DOWN
             )
 
-        wait_until(5, 1, previous_stepped_down)
+        wait_until(previous_stepped_down)
 
     storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_2_port}")
 
@@ -2579,7 +2579,7 @@ def test_storage_controller_leadership_transfer(
             == StorageControllerLeadershipStatus.LEADER
         )
 
-    wait_until(15, 1, new_becomes_leader)
+    wait_until(new_becomes_leader)
     leader = env.storage_controller.get_leader()
     assert leader["address"] == f"http://127.0.0.1:{storage_controller_2_port}/"
 
@@ -2624,7 +2624,7 @@ def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvB
     env.storage_controller.configure_failpoints(("sleepy-drain-loop", "return(10000)"))
     env.storage_controller.node_drain(attached.id)
 
-    wait_until(10, 0.5, attached_is_draining)
+    wait_until(attached_is_draining)
 
     attached.restart()
 
@@ -2646,7 +2646,7 @@ def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvB
         env.storage_controller.node_configure(attached.id, {"scheduling": "Pause"})
 
     # allow for small delay between actually having cancelled and being able reconfigure again
-    wait_until(4, 0.5, reconfigure_node_again)
+    wait_until(reconfigure_node_again)
 
 
 def test_storage_controller_timeline_crud_race(neon_env_builder: NeonEnvBuilder):
@@ -2691,7 +2691,7 @@ def test_storage_controller_timeline_crud_race(neon_env_builder: NeonEnvBuilder)
                 ps.log_contains(f"at failpoint {failpoint}") is not None for ps in env.pageservers
             )
 
-        wait_until(10, 1, has_hit_failpoint)
+        wait_until(has_hit_failpoint)
 
         # Migrate the tenant while the timeline creation is in progress: this migration will complete once it
         # can detach from the old pageserver, which will happen once the failpoint completes.
@@ -2775,7 +2775,7 @@ def test_storage_controller_validate_during_migration(neon_env_builder: NeonEnvB
             def has_hit_compaction_failpoint():
                 assert origin_pageserver.log_contains(f"at failpoint {compaction_failpoint}")
 
-            wait_until(10, 1, has_hit_compaction_failpoint)
+            wait_until(has_hit_compaction_failpoint)
 
             # While the compaction is running, start a live migration which will pause long enough for the compaction to sleep,
             # after incrementing generation and attaching the new location
@@ -2794,7 +2794,7 @@ def test_storage_controller_validate_during_migration(neon_env_builder: NeonEnvB
             # before it reaches this point.  The timeout is because the AttachedStale transition includes
             # a flush of remote storage, and if the compaction already enqueued an index upload this cannot
             # make progress.
-            wait_until(60, 1, has_hit_migration_failpoint)
+            wait_until(has_hit_migration_failpoint, timeout=60)
 
             # Origin pageserver has succeeded with compaction before the migration completed. It has done all the writes it wanted to do in its own (stale) generation
             origin_pageserver.http_client().configure_failpoints((compaction_failpoint, "off"))
@@ -2917,7 +2917,7 @@ def test_storage_controller_proxy_during_migration(
                 log.info(expr)
                 assert env.storage_controller.log_contains(expr)
 
-            wait_until(10, 1, has_hit_migration_failpoint)
+            wait_until(has_hit_migration_failpoint)
 
             # This request should be routed to whichever pageserver holds the highest generation
             tenant_info = env.storage_controller.pageserver_api().tenant_status(
@@ -2934,7 +2934,7 @@ def test_storage_controller_proxy_during_migration(
                 # We expect request to land on the origin
                 assert tenant_info["generation"] == 1
 
-            wait_until(10, 1, long_migration_metric_published)
+            wait_until(long_migration_metric_published)
 
             # Eventually migration completes
             env.storage_controller.configure_failpoints((migration_failpoint.value, "off"))
@@ -3113,7 +3113,7 @@ def test_timeline_delete_mid_live_migration(neon_env_builder: NeonEnvBuilder, mi
                 log.info(expr)
                 assert env.storage_controller.log_contains(expr)
 
-            wait_until(10, 1, has_hit_migration_failpoint)
+            wait_until(has_hit_migration_failpoint)
 
             env.storage_controller.pageserver_api().timeline_delete(
                 tenant_id=tenant_id, timeline_id=timeline_id
@@ -3182,7 +3182,7 @@ def test_multi_attached_timeline_creation(neon_env_builder: NeonEnvBuilder, migr
                 log.info(expr)
                 assert env.storage_controller.log_contains(expr)
 
-            wait_until(10, 1, has_hit_migration_failpoint)
+            wait_until(has_hit_migration_failpoint)
 
             timeline_id = TimelineId.generate()
             env.storage_controller.pageserver_api().timeline_create(
diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py
index 3991bd7061..b16dc54c24 100644
--- a/test_runner/regress/test_storage_scrubber.py
+++ b/test_runner/regress/test_storage_scrubber.py
@@ -431,8 +431,6 @@ def test_scrubber_physical_gc_ancestors_split(neon_env_builder: NeonEnvBuilder):
 
         # Let the controller reach the failpoint
         wait_until(
-            10,
-            1,
             lambda: env.storage_controller.assert_log_contains(
                 'failpoint "shard-split-post-remote-sleep": sleeping'
             ),
diff --git a/test_runner/regress/test_subscriber_restart.py b/test_runner/regress/test_subscriber_restart.py
index d37eeb1e6e..7d4f66d044 100644
--- a/test_runner/regress/test_subscriber_restart.py
+++ b/test_runner/regress/test_subscriber_restart.py
@@ -56,4 +56,4 @@ def test_subscriber_restart(neon_simple_env: NeonEnv):
         pcur.execute(f"INSERT into t values ({n_records}, 0)")
         n_records += 1
         with sub.cursor() as scur:
-            wait_until(60, 0.5, check_that_changes_propagated)
+            wait_until(check_that_changes_propagated)
diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py
index 1dd46ec3d1..f8f240cfdc 100644
--- a/test_runner/regress/test_tenant_conf.py
+++ b/test_runner/regress/test_tenant_conf.py
@@ -234,11 +234,7 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder):
     assert not config_path.exists(), "detach did not remove config file"
 
     env.pageserver.tenant_attach(tenant_id)
-    wait_until(
-        number_of_iterations=5,
-        interval=1,
-        func=lambda: assert_tenant_state(http_client, tenant_id, "Active"),
-    )
+    wait_until(lambda: assert_tenant_state(http_client, tenant_id, "Active"))
 
     env.config_tenant(tenant_id, {"gc_horizon": "1000000"})
     contents_first = config_path.read_text()
diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py
index 47df3ead70..48e55c1ab1 100644
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -185,21 +185,21 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE
     deletion = None
 
     try:
-        wait_until(10, 1, has_hit_failpoint)
+        wait_until(has_hit_failpoint)
 
         # it should start ok, sync up with the stuck creation, then hang waiting for the timeline
         # to shut down.
         deletion = Thread(target=start_deletion)
         deletion.start()
 
-        wait_until(10, 1, deletion_has_started_waiting_for_timelines)
+        wait_until(deletion_has_started_waiting_for_timelines)
 
         pageserver_http.configure_failpoints((failpoint, "off"))
 
         creation.join()
         deletion.join()
 
-        wait_until(10, 1, tenant_is_deleted)
+        wait_until(tenant_is_deleted)
     finally:
         creation.join()
         if deletion is not None:
@@ -264,7 +264,7 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder)
     def hit_initdb_upload_failpoint():
         env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}")
 
-    wait_until(100, 0.1, hit_initdb_upload_failpoint)
+    wait_until(hit_initdb_upload_failpoint)
 
     def creation_connection_timed_out():
         env.pageserver.assert_log_contains(
@@ -273,7 +273,7 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder)
 
     # Wait so that we hit the timeout and the connection is dropped
     # (But timeline creation still continues)
-    wait_until(100, 0.1, creation_connection_timed_out)
+    wait_until(creation_connection_timed_out)
 
     ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause"))
 
@@ -281,7 +281,7 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder)
         def tenant_delete_inner():
             ps_http.tenant_delete(tenant_id)
 
-        wait_until(100, 0.5, tenant_delete_inner)
+        wait_until(tenant_delete_inner)
 
     Thread(target=tenant_delete).start()
 
@@ -290,7 +290,7 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder)
             f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause"
         )
 
-    wait_until(100, 0.1, deletion_arrived)
+    wait_until(deletion_arrived)
 
     ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "off"))
 
diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py
index 8d7ca7bc4e..3f21dc895a 100644
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -212,7 +212,7 @@ def test_tenant_reattach_while_busy(
         nonlocal updates_started, updates_finished, updates_to_perform
 
         # Wait until we have performed some updates
-        wait_until(20, 0.5, lambda: updates_finished > 500)
+        wait_until(lambda: updates_finished > 500)
 
         log.info("Detaching tenant")
         pageserver_http.tenant_detach(tenant_id)
@@ -512,7 +512,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading(
         )
         assert only_int(active) == 0 and only_int(broken) == 1 and only_int(broken_set) == 1
 
-    wait_until(10, 0.5, found_broken)
+    wait_until(found_broken)
 
     client.tenant_detach(env.initial_tenant)
 
@@ -524,7 +524,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading(
         )
         assert only_int(broken) == 0 and len(broken_set) == 0
 
-    wait_until(10, 0.5, found_cleaned_up)
+    wait_until(found_cleaned_up)
 
     env.pageserver.tenant_attach(env.initial_tenant)
 
@@ -536,4 +536,4 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading(
         )
         assert only_int(active) == 1 and len(broken_set) == 0
 
-    wait_until(10, 0.5, found_active)
+    wait_until(found_active)
diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py
index bf6120aa0a..df53a98e92 100644
--- a/test_runner/regress/test_tenant_relocation.py
+++ b/test_runner/regress/test_tenant_relocation.py
@@ -298,11 +298,7 @@ def test_tenant_relocation(
         destination_ps.tenant_attach(tenant_id)
 
         # wait for tenant to finish attaching
-        wait_until(
-            number_of_iterations=10,
-            interval=1,
-            func=lambda: assert_tenant_state(destination_http, tenant_id, "Active"),
-        )
+        wait_until(lambda: assert_tenant_state(destination_http, tenant_id, "Active"))
 
         check_timeline_attached(
             destination_http,
diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py
index 8b733da0c6..713f89c60f 100644
--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -638,7 +638,7 @@ def test_synthetic_size_while_deleting(neon_env_builder: NeonEnvBuilder):
     with ThreadPoolExecutor(max_workers=1) as exec:
         completion = exec.submit(client.tenant_size, env.initial_tenant)
         _, last_offset = wait_until(
-            10, 1.0, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}")
+            lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}")
         )
 
         timeline_delete_wait_completed(client, env.initial_tenant, branch_id)
@@ -656,8 +656,6 @@ def test_synthetic_size_while_deleting(neon_env_builder: NeonEnvBuilder):
     with ThreadPoolExecutor(max_workers=1) as exec:
         completion = exec.submit(client.tenant_size, env.initial_tenant)
         wait_until(
-            10,
-            1.0,
             lambda: env.pageserver.assert_log_contains(
                 f"at failpoint {failpoint}", offset=last_offset
             ),
diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py
index 72183f5778..4c26b64d22 100644
--- a/test_runner/regress/test_tenant_tasks.py
+++ b/test_runner/regress/test_tenant_tasks.py
@@ -77,4 +77,4 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder):
         assert tasks_started == tasks_ended
         assert tasks_panicked is None or int(tasks_panicked) == 0
 
-    wait_until(10, 0.2, assert_tasks_finish)
+    wait_until(assert_tasks_finish)
diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py
index 158c3fddb0..d31901b384 100644
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -330,7 +330,7 @@ def test_pageserver_with_empty_tenants(neon_env_builder: NeonEnvBuilder):
         assert len(tenants) == 1
         assert all(t["state"]["slug"] != "Attaching" for t in tenants)
 
-    wait_until(10, 0.2, not_attaching)
+    wait_until(not_attaching)
 
     tenants = client.tenant_list()
 
diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py
index 8d3ddf7e54..6b27c41d1c 100644
--- a/test_runner/regress/test_tenants_with_remote_storage.py
+++ b/test_runner/regress/test_tenants_with_remote_storage.py
@@ -178,11 +178,7 @@ def test_tenants_attached_after_download(neon_env_builder: NeonEnvBuilder):
     env.pageserver.start()
     client = env.pageserver.http_client()
 
-    wait_until(
-        number_of_iterations=5,
-        interval=1,
-        func=lambda: assert_tenant_state(client, tenant_id, "Active"),
-    )
+    wait_until(lambda: assert_tenant_state(client, tenant_id, "Active"))
 
     restored_timelines = client.timeline_list(tenant_id)
     assert (
@@ -257,11 +253,7 @@ def test_tenant_redownloads_truncated_file_on_startup(
     env.pageserver.start()
     client = env.pageserver.http_client()
 
-    wait_until(
-        number_of_iterations=5,
-        interval=1,
-        func=lambda: assert_tenant_state(client, tenant_id, "Active"),
-    )
+    wait_until(lambda: assert_tenant_state(client, tenant_id, "Active"))
 
     restored_timelines = client.timeline_list(tenant_id)
     assert (
diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py
index bc2e048f69..5a1e493bbe 100644
--- a/test_runner/regress/test_timeline_archive.py
+++ b/test_runner/regress/test_timeline_archive.py
@@ -227,8 +227,8 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b
             ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=leaf_timeline_id)
         assert timeline_offloaded_logged(leaf_timeline_id)
 
-    wait_until(30, 1, leaf_offloaded)
-    wait_until(30, 1, parent_offloaded)
+    wait_until(leaf_offloaded)
+    wait_until(parent_offloaded)
 
     # Offloaded child timelines should still prevent deletion
     with pytest.raises(
@@ -331,7 +331,7 @@ def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder, delete_timel
         ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=child_timeline_id)
         assert timeline_offloaded_api(child_timeline_id)
 
-    wait_until(30, 1, child_offloaded)
+    wait_until(child_offloaded)
 
     assert timeline_offloaded_api(child_timeline_id)
     assert not timeline_offloaded_api(root_timeline_id)
diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py
index 155709e106..fbece68367 100644
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -21,7 +21,6 @@ from fixtures.pageserver.utils import (
     assert_prefix_empty,
     assert_prefix_not_empty,
     many_small_layers_tenant_config,
-    poll_for_remote_storage_iterations,
     timeline_delete_wait_completed,
     wait_for_last_record_lsn,
     wait_for_upload,
@@ -94,12 +93,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
     assert timeline_path.exists()
 
     # retry deletes when compaction or gc is running in pageserver
-    # TODO: review whether this wait_until is actually necessary, we do an await() internally
-    wait_until(
-        number_of_iterations=3,
-        interval=0.2,
-        func=lambda: timeline_delete_wait_completed(ps_http, env.initial_tenant, leaf_timeline_id),
-    )
+    timeline_delete_wait_completed(ps_http, env.initial_tenant, leaf_timeline_id)
 
     assert not timeline_path.exists()
 
@@ -111,13 +105,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
         ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id)
     assert exc.value.status_code == 404
 
-    wait_until(
-        number_of_iterations=3,
-        interval=0.2,
-        func=lambda: timeline_delete_wait_completed(
-            ps_http, env.initial_tenant, parent_timeline_id
-        ),
-    )
+    timeline_delete_wait_completed(ps_http, env.initial_tenant, parent_timeline_id)
 
     # Check that we didn't pick up the timeline again after restart.
     # See https://github.com/neondatabase/neon/issues/3560
@@ -226,8 +214,6 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
 
     ps_http.configure_failpoints((failpoint, "return"))
 
-    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
-
     # These failpoints are earlier than background task is spawned.
     # so they result in api request failure.
     if failpoint in (
@@ -244,7 +230,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
             tenant_id=env.initial_tenant,
             timeline_id=timeline_id,
             expected_state="Broken",
-            iterations=iterations,
+            iterations=40,
         )
 
         reason = timeline_info["state"]["Broken"]["reason"]
@@ -257,25 +243,21 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
         env.pageserver.stop()
         env.pageserver.start()
 
-        wait_until_tenant_active(ps_http, env.initial_tenant, iterations=iterations)
+        wait_until_tenant_active(ps_http, env.initial_tenant)
 
         if failpoint == "timeline-delete-before-index-deleted-at":
             # We crashed before persisting this to remote storage, need to retry delete request
             timeline_delete_wait_completed(ps_http, env.initial_tenant, timeline_id)
         else:
             # Pageserver should've resumed deletion after restart.
-            wait_timeline_detail_404(
-                ps_http, env.initial_tenant, timeline_id, iterations=iterations
-            )
+            wait_timeline_detail_404(ps_http, env.initial_tenant, timeline_id)
 
     elif check is Check.RETRY_WITHOUT_RESTART:
         # this should succeed
         # this also checks that delete can be retried even when timeline is in Broken state
         ps_http.configure_failpoints((failpoint, "off"))
 
-        timeline_delete_wait_completed(
-            ps_http, env.initial_tenant, timeline_id, iterations=iterations
-        )
+        timeline_delete_wait_completed(ps_http, env.initial_tenant, timeline_id)
 
     # Check remote is empty
     if remote_storage_kind is RemoteStorageKind.MOCK_S3:
@@ -378,7 +360,7 @@ def test_timeline_resurrection_on_attach(
 
     env.pageserver.tenant_attach(tenant_id=tenant_id)
 
-    wait_until_tenant_active(ps_http, tenant_id=tenant_id, iterations=10, period=0.5)
+    wait_until_tenant_active(ps_http, tenant_id=tenant_id)
 
     timelines = ps_http.timeline_list(tenant_id=tenant_id)
     assert {TimelineId(tl["timeline_id"]) for tl in timelines} == {
@@ -439,7 +421,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
     # Wait for tenant to finish loading.
     wait_until_tenant_active(ps_http, tenant_id=env.initial_tenant, iterations=10, period=1)
 
-    wait_timeline_detail_404(ps_http, env.initial_tenant, leaf_timeline_id, iterations=4)
+    wait_timeline_detail_404(ps_http, env.initial_tenant, leaf_timeline_id)
 
     assert (
         not leaf_timeline_path.exists()
@@ -481,11 +463,10 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
         )
 
     # for some reason the check above doesnt immediately take effect for the below.
-    # Assume it is mock server incosistency and check twice.
+    # Assume it is mock server incosistency and check a few times.
     wait_until(
-        2,
-        0.5,
         lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage),
+        timeout=2,
     )
 
     # We deleted our only tenant, and the scrubber fails if it detects nothing
@@ -544,7 +525,7 @@ def test_concurrent_timeline_delete_stuck_on(
                 f".*{child_timeline_id}.*at failpoint {stuck_failpoint}"
             )
 
-        wait_until(50, 0.1, first_call_hit_failpoint)
+        wait_until(first_call_hit_failpoint, interval=0.1, status_interval=1.0)
 
         # make the second call and assert behavior
         log.info("second call start")
@@ -613,7 +594,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
     def hit_failpoint():
         env.pageserver.assert_log_contains(at_failpoint_log_message)
 
-    wait_until(50, 0.1, hit_failpoint)
+    wait_until(hit_failpoint, interval=0.1)
 
     # we log this error if a client hangs up
     # might as well use it as another indicator that the test works
@@ -623,7 +604,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
     def got_hangup_log_message():
         env.pageserver.assert_log_contains(hangup_log_message)
 
-    wait_until(50, 0.1, got_hangup_log_message)
+    wait_until(got_hangup_log_message, interval=0.1)
 
     # check that the timeline is still present
     ps_http.timeline_detail(env.initial_tenant, child_timeline_id)
@@ -635,10 +616,10 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
         message = f".*DELETE.*{child_timeline_id}.*Cancelled request finished"
         env.pageserver.assert_log_contains(message)
 
-    wait_until(50, 0.1, first_request_finished)
+    wait_until(first_request_finished, interval=0.1)
 
     # check that the timeline is gone
-    wait_timeline_detail_404(ps_http, env.initial_tenant, child_timeline_id, iterations=10)
+    wait_timeline_detail_404(ps_http, env.initial_tenant, child_timeline_id)
 
 
 def test_timeline_delete_works_for_remote_smoke(
@@ -707,7 +688,7 @@ def test_timeline_delete_works_for_remote_smoke(
 
     # for some reason the check above doesnt immediately take effect for the below.
     # Assume it is mock server inconsistency and check twice.
-    wait_until(2, 0.5, lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage))
+    wait_until(lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage))
 
     # We deleted our only tenant, and the scrubber fails if it detects nothing
     neon_env_builder.disable_scrub_on_exit()
@@ -753,15 +734,13 @@ def test_delete_orphaned_objects(
 
     env.pageserver.allowed_errors.append(f".*failpoint: {failpoint}")
 
-    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
-
     ps_http.timeline_delete(env.initial_tenant, timeline_id)
     timeline_info = wait_until_timeline_state(
         pageserver_http=ps_http,
         tenant_id=env.initial_tenant,
         timeline_id=timeline_id,
         expected_state="Broken",
-        iterations=iterations,
+        iterations=40,
     )
 
     reason = timeline_info["state"]["Broken"]["reason"]
@@ -827,8 +806,6 @@ def test_timeline_delete_resumed_on_attach(
         )
     )
 
-    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
-
     ps_http.timeline_delete(tenant_id, timeline_id)
 
     timeline_info = wait_until_timeline_state(
@@ -836,7 +813,7 @@ def test_timeline_delete_resumed_on_attach(
         tenant_id=env.initial_tenant,
         timeline_id=timeline_id,
         expected_state="Broken",
-        iterations=iterations,
+        iterations=40,
     )
 
     reason = timeline_info["state"]["Broken"]["reason"]
@@ -871,7 +848,7 @@ def test_timeline_delete_resumed_on_attach(
     env.pageserver.tenant_attach(tenant_id=tenant_id)
 
     # delete should be resumed
-    wait_timeline_detail_404(ps_http, env.initial_tenant, timeline_id, iterations=iterations)
+    wait_timeline_detail_404(ps_http, env.initial_tenant, timeline_id)
 
     tenant_path = env.pageserver.timeline_dir(tenant_id, timeline_id)
     assert not tenant_path.exists()
diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py
index 9c7e851ba8..2c3ee38bae 100644
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -203,7 +203,7 @@ def test_ancestor_detach_branched_from(
     )
 
     client.timeline_delete(env.initial_tenant, env.initial_timeline)
-    wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0)
+    wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline)
 
     # because we do the fullbackup from ancestor at the branch_lsn, the zenith.signal is always different
     # as there is always "PREV_LSN: invalid" for "before"
@@ -336,10 +336,10 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder):
 
     # delete the timelines to confirm detach actually worked
     client.timeline_delete(env.initial_tenant, after)
-    wait_timeline_detail_404(client, env.initial_tenant, after, 10, 1.0)
+    wait_timeline_detail_404(client, env.initial_tenant, after)
 
     client.timeline_delete(env.initial_tenant, env.initial_timeline)
-    wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0)
+    wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline)
 
 
 def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEnvBuilder):
@@ -973,17 +973,17 @@ def test_timeline_detach_ancestor_interrupted_by_deletion(
     with ThreadPoolExecutor(max_workers=2) as pool:
         try:
             fut = pool.submit(detach_ancestor)
-            offset = wait_until(10, 1.0, at_failpoint)
+            offset = wait_until(at_failpoint)
 
             delete = pool.submit(start_delete)
 
-            offset = wait_until(10, 1.0, lambda: at_waiting_on_gate_close(offset))
+            offset = wait_until(lambda: at_waiting_on_gate_close(offset))
 
             victim_http.configure_failpoints((pausepoint, "off"))
 
             delete.result()
 
-            assert wait_until(10, 1.0, is_deleted), f"unimplemented mode {mode}"
+            assert wait_until(is_deleted), f"unimplemented mode {mode}"
 
             # TODO: match the error
             with pytest.raises(PageserverApiException) as exc:
@@ -1115,11 +1115,11 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv
     with ThreadPoolExecutor(max_workers=1) as pool:
         try:
             fut = pool.submit(detach_timeline)
-            wait_until(10, 1.0, paused_at_failpoint)
+            wait_until(paused_at_failpoint)
 
             # let stuck complete
             stuck_http.configure_failpoints((pausepoint, "off"))
-            wait_until(10, 1.0, first_completed)
+            wait_until(first_completed)
 
             if mode == "delete_reparentable_timeline":
                 assert first_branch is not None
@@ -1127,7 +1127,7 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv
                     env.initial_tenant, first_branch
                 )
                 victim_http.configure_failpoints((pausepoint, "off"))
-                wait_until(10, 1.0, first_branch_gone)
+                wait_until(first_branch_gone)
             elif mode == "create_reparentable_timeline":
                 first_branch = create_reparentable_timeline()
                 victim_http.configure_failpoints((pausepoint, "off"))
@@ -1271,11 +1271,11 @@ def test_retryable_500_hit_through_storcon_during_timeline_detach_ancestor(
     with ThreadPoolExecutor(max_workers=1) as pool:
         try:
             fut = pool.submit(detach_timeline)
-            wait_until(10, 1.0, paused_at_failpoint)
+            wait_until(paused_at_failpoint)
 
             # let stuck complete
             stuck_http.configure_failpoints((pausepoint, "off"))
-            wait_until(10, 1.0, first_completed)
+            wait_until(first_completed)
 
             victim_http.configure_failpoints((pausepoint, "off"))
 
@@ -1456,7 +1456,7 @@ def test_retried_detach_ancestor_after_failed_reparenting(neon_env_builder: Neon
     # other tests take the "detach? reparent complete", but this only hits
     # "complete".
     http.timeline_delete(env.initial_tenant, env.initial_timeline)
-    wait_timeline_detail_404(http, env.initial_tenant, env.initial_timeline, 20)
+    wait_timeline_detail_404(http, env.initial_tenant, env.initial_timeline)
 
     http.configure_failpoints(("timeline-detach-ancestor::complete_before_uploading", "off"))
 
@@ -1518,7 +1518,7 @@ def test_timeline_is_deleted_before_timeline_detach_ancestor_completes(
         with ThreadPoolExecutor(max_workers=1) as pool:
             detach = pool.submit(detach_and_get_stuck)
 
-            offset = wait_until(10, 1.0, request_processing_noted_in_log)
+            offset = wait_until(request_processing_noted_in_log)
 
             # make this named fn tor more clear failure test output logging
             def pausepoint_hit_with_gc_paused() -> LogCursor:
@@ -1529,11 +1529,11 @@ def test_timeline_is_deleted_before_timeline_detach_ancestor_completes(
                 )
                 return at
 
-            offset = wait_until(10, 1.0, pausepoint_hit_with_gc_paused)
+            offset = wait_until(pausepoint_hit_with_gc_paused)
 
             delete_detached()
 
-            wait_timeline_detail_404(http, env.initial_tenant, detached, 10, 1.0)
+            wait_timeline_detail_404(http, env.initial_tenant, detached)
 
             http.configure_failpoints((failpoint, "off"))
 
diff --git a/test_runner/regress/test_timeline_gc_blocking.py b/test_runner/regress/test_timeline_gc_blocking.py
index 5a5ca3290a..7605e1f758 100644
--- a/test_runner/regress/test_timeline_gc_blocking.py
+++ b/test_runner/regress/test_timeline_gc_blocking.py
@@ -61,7 +61,7 @@ def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder, sharded: bool
 
     # deletion unblocks gc
     http.timeline_delete(env.initial_tenant, foo_branch)
-    wait_timeline_detail_404(http, env.initial_tenant, foo_branch, 10, 1.0)
+    wait_timeline_detail_404(http, env.initial_tenant, foo_branch)
 
     wait_for_another_gc_round()
     pss.assert_log_contains(gc_active_line)
diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py
index 4528bc6180..95bf9106cd 100644
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -396,11 +396,7 @@ def test_timeline_physical_size_init(neon_env_builder: NeonEnvBuilder):
 
     # Wait for the tenant to be loaded
     client = env.pageserver.http_client()
-    wait_until(
-        number_of_iterations=5,
-        interval=1,
-        func=lambda: assert_tenant_state(client, env.initial_tenant, "Active"),
-    )
+    wait_until(lambda: assert_tenant_state(client, env.initial_tenant, "Active"))
 
     assert_physical_size_invariants(
         get_physical_size_values(env, env.initial_tenant, new_timeline_id),
@@ -433,7 +429,7 @@ def test_timeline_physical_size_post_checkpoint(neon_env_builder: NeonEnvBuilder
             get_physical_size_values(env, env.initial_tenant, new_timeline_id),
         )
 
-    wait_until(10, 1, check)
+    wait_until(check)
 
 
 def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder):
@@ -721,7 +717,7 @@ def wait_for_tenant_startup_completions(client: PageserverHttpClient, count: int
     def condition():
         assert client.get_metric_value("pageserver_tenant_startup_complete_total") == count
 
-    wait_until(5, 1.0, condition)
+    wait_until(condition)
 
 
 def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
@@ -768,7 +764,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
         assert "Active" in set(get_tenant_states().values())
 
     # One tenant should activate, then get stuck in their logical size calculation
-    wait_until(10, 1, at_least_one_active)
+    wait_until(at_least_one_active)
 
     # Wait some walltime to gain confidence that other tenants really are stuck and not proceeding to activate
     time.sleep(5)
@@ -836,13 +832,13 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
     def all_active():
         assert all(s == "Active" for s in get_tenant_states().values())
 
-    wait_until(10, 1, all_active)
+    wait_until(all_active)
 
     # Final control check: restarting with no failpoints at all results in all tenants coming active
     # without being prompted by client I/O
     env.pageserver.stop()
     env.pageserver.start()
-    wait_until(10, 1, all_active)
+    wait_until(all_active)
 
     assert (
         pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants
@@ -856,7 +852,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
         extra_env_vars={"FAILPOINTS": "timeline-calculate-logical-size-pause=pause"}
     )
 
-    wait_until(10, 1, at_least_one_active)
+    wait_until(at_least_one_active)
 
     detach_tenant_id = list(
         [(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"]
@@ -881,7 +877,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
 
     # Check that all the stuck tenants proceed to active (apart from the one that deletes, and the one
     # we detached)
-    wait_until(10, 1, all_active)
+    wait_until(all_active)
     assert len(get_tenant_states()) == n_tenants - 2
 
 
@@ -908,7 +904,7 @@ def delete_lazy_activating(
         try:
             # Deletion will get to the point in shutdown where it's waiting for timeline shutdown, then
             # hang because of our failpoint blocking activation.
-            wait_until(10, 1, shutting_down)
+            wait_until(shutting_down)
         finally:
             log.info("Clearing failpoint")
             pageserver_http.configure_failpoints(("timeline-calculate-logical-size-pause", "off"))
@@ -1030,13 +1026,13 @@ def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder):
         log.info(f"{states}")
         assert len(states["Active"]) == 1
 
-    wait_until(10, 1, one_is_active)
+    wait_until(one_is_active)
 
     def other_is_attaching():
         states = get_tenant_states()
         assert len(states["Attaching"]) == 1
 
-    wait_until(10, 1, other_is_attaching)
+    wait_until(other_is_attaching)
 
     def eager_tenant_is_active():
         resp = client.tenant_status(eager_tenant)
@@ -1053,7 +1049,7 @@ def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder):
         },
         lazy=False,
     )
-    wait_until(10, 1, eager_tenant_is_active)
+    wait_until(eager_tenant_is_active)
 
     other_is_attaching()
 
@@ -1096,7 +1092,7 @@ def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_met
         resp = client.tenant_status(env.initial_tenant)
         assert resp["state"]["slug"] == "Active"
 
-    wait_until(10, 1, initial_tenant_is_active)
+    wait_until(initial_tenant_is_active)
 
     # even though the initial tenant is now active, because it was startup time
     # attach, it will consume the only permit because logical size calculation
@@ -1119,7 +1115,7 @@ def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_met
         assert resp["state"]["slug"] == "Attaching"
 
     # paused logical size calculation of env.initial_tenant is keeping it attaching
-    wait_until(10, 1, lazy_tenant_is_attaching)
+    wait_until(lazy_tenant_is_attaching)
 
     for _ in range(5):
         lazy_tenant_is_attaching()
@@ -1132,10 +1128,10 @@ def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_met
     if activation_method == "endpoint":
         with env.endpoints.create_start("main", tenant_id=lazy_tenant):
             # starting up the endpoint should make it jump the queue
-            wait_until(10, 1, lazy_tenant_is_active)
+            wait_until(lazy_tenant_is_active)
     elif activation_method == "branch":
         env.create_timeline("second_branch", lazy_tenant)
-        wait_until(10, 1, lazy_tenant_is_active)
+        wait_until(lazy_tenant_is_active)
     elif activation_method == "delete":
         delete_lazy_activating(lazy_tenant, env.pageserver, expect_attaching=True)
     else:
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 8fa33b81a9..23d4f23cdb 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -2136,7 +2136,7 @@ def test_pull_timeline_while_evicted(neon_env_builder: NeonEnvBuilder):
         # Check that on source no segment files are present
         assert src_sk.list_segments(tenant_id, timeline_id) == []
 
-    wait_until(60, 1, evicted_on_source)
+    wait_until(evicted_on_source, timeout=60)
 
     # Invoke pull_timeline: source should serve snapshot request without promoting anything to local disk,
     # destination should import the control file only & go into evicted mode immediately
@@ -2155,7 +2155,7 @@ def test_pull_timeline_while_evicted(neon_env_builder: NeonEnvBuilder):
 
     # This should be fast, it is a wait_until because eviction state is updated
     # in the background wrt pull_timeline.
-    wait_until(10, 0.1, evicted_on_destination)
+    wait_until(evicted_on_destination, timeout=1.0, interval=0.1)
 
     # Delete the timeline on the source, to prove that deletion works on an
     # evicted timeline _and_ that the final compute test is really not using
@@ -2178,7 +2178,7 @@ def test_pull_timeline_while_evicted(neon_env_builder: NeonEnvBuilder):
         n_evicted = dst_sk.http_client().get_metric_value("safekeeper_evicted_timelines")
         assert n_evicted == 0
 
-    wait_until(10, 1, unevicted_on_dest)
+    wait_until(unevicted_on_dest, interval=0.1, timeout=1.0)
 
 
 # In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries
@@ -2606,10 +2606,10 @@ def test_s3_eviction(
         assert n_evicted  # make mypy happy
         assert int(n_evicted) == n_timelines
 
-    wait_until(60, 0.5, all_evicted)
+    wait_until(all_evicted, timeout=30)
     # restart should preserve the metric value
     sk.stop().start()
-    wait_until(60, 0.5, all_evicted)
+    wait_until(all_evicted)
     # and endpoint start should reduce is
     endpoints[0].start()
 
@@ -2618,7 +2618,7 @@ def test_s3_eviction(
         assert n_evicted  # make mypy happy
         assert int(n_evicted) < n_timelines
 
-    wait_until(60, 0.5, one_unevicted)
+    wait_until(one_unevicted)
 
 
 # Test resetting uploaded partial segment state.
@@ -2666,7 +2666,7 @@ def test_backup_partial_reset(neon_env_builder: NeonEnvBuilder):
         if isinstance(eviction_state, str) and eviction_state == "Present":
             raise Exception("eviction didn't happen yet")
 
-    wait_until(30, 1, evicted)
+    wait_until(evicted)
     # it must have uploaded something
     uploaded_segs = sk.list_uploaded_segments(tenant_id, timeline_id)
     log.info(f"uploaded segments before reset: {uploaded_segs}")
@@ -2763,7 +2763,7 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde
 
         raise Exception("Partial segment not uploaded yet")
 
-    source_partial_segment = wait_until(15, 1, source_partial_segment_uploaded)
+    source_partial_segment = wait_until(source_partial_segment_uploaded)
     log.info(
         f"Uploaded segments before pull are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}"
     )
@@ -2787,7 +2787,7 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde
         if evictions is None or evictions == 0:
             raise Exception("Eviction did not happen on source safekeeper yet")
 
-    wait_until(30, 1, evicted)
+    wait_until(evicted)
 
     endpoint.start(safekeepers=[2, 3])
 
@@ -2804,7 +2804,7 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde
     )
 
     endpoint.safe_psql("insert into t select generate_series(1, 1000), 'pear'")
-    wait_until(15, 1, new_partial_segment_uploaded)
+    wait_until(new_partial_segment_uploaded)
 
     log.info(
         f"Uploaded segments after post-pull ingest are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}"
@@ -2833,4 +2833,4 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde
         if unevictions is None or unevictions == 0:
             raise Exception("Uneviction did not happen on source safekeeper yet")
 
-    wait_until(10, 1, unevicted)
+    wait_until(unevicted)
diff --git a/test_runner/regress/test_wal_receiver.py b/test_runner/regress/test_wal_receiver.py
index 294f86ffa7..d22a900c59 100644
--- a/test_runner/regress/test_wal_receiver.py
+++ b/test_runner/regress/test_wal_receiver.py
@@ -97,7 +97,7 @@ def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuil
                     str(safekeeper.id) in exception_string
                 ), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after WAL wait timeout"
 
-    wait_until(60, 0.5, all_sks_in_wareceiver_state)
+    wait_until(all_sks_in_wareceiver_state, timeout=30)
 
     stopped_safekeeper = env.safekeepers[-1]
     stopped_safekeeper_id = stopped_safekeeper.id
@@ -124,7 +124,7 @@ def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuil
                         str(safekeeper.id) in exception_string
                     ), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout"
 
-    wait_until(60, 0.5, all_but_stopped_sks_in_wareceiver_state)
+    wait_until(all_but_stopped_sks_in_wareceiver_state, timeout=30)
 
 
 def insert_test_elements(env: NeonEnv, tenant_id: TenantId, start: int, count: int):

From bd0936919885f130e3d6aedf42ba3ca7047c56e4 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Mon, 2 Dec 2024 11:50:22 +0000
Subject: [PATCH 024/117] storcon: add metric for AZ scheduling violations
 (#9949)

## Problem

We can't easily tell how far the state of shards is from their AZ
preferences. This can be a cause of performance issues, so it's
important for diagnosability that we can tell easily if there are
significant numbers of shards that aren't running in their preferred AZ.

Related: https://github.com/neondatabase/cloud/issues/15413

## Summary of changes

- In reconcile_all, count shards that are scheduled into the wrong AZ
(if they have a preference), and publish it as a prometheus gauge.
- Also calculate a statistic for how many shards wanted to reconcile but
couldn't.

This is clearly a lazy calculation: reconcile all only runs
periodically. But that's okay: shards in the wrong AZ is something that
only matters if it stays that way for some period of time.
---
 storage_controller/src/metrics.rs |  6 ++++++
 storage_controller/src/service.rs | 32 +++++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+)

diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs
index a1f7bc2457..6d5885eba6 100644
--- a/storage_controller/src/metrics.rs
+++ b/storage_controller/src/metrics.rs
@@ -50,6 +50,12 @@ pub(crate) struct StorageControllerMetricGroup {
     /// Count of how many times we make an optimization change to a tenant's scheduling
     pub(crate) storage_controller_schedule_optimization: measured::Counter,
 
+    /// How many shards are not scheduled into their preferred AZ
+    pub(crate) storage_controller_schedule_az_violation: measured::Gauge,
+
+    /// How many shards would like to reconcile but were blocked by concurrency limits
+    pub(crate) storage_controller_pending_reconciles: measured::Gauge,
+
     /// HTTP request status counters for handled requests
     pub(crate) storage_controller_http_request_status:
         measured::CounterVec<HttpRequestStatusLabelGroupSet>,
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 636ccf11a1..631fdb4923 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -6016,14 +6016,33 @@ impl Service {
         let (nodes, tenants, _scheduler) = locked.parts_mut();
         let pageservers = nodes.clone();
 
+        // This function is an efficient place to update lazy statistics, since we are walking
+        // all tenants.
+        let mut pending_reconciles = 0;
+        let mut az_violations = 0;
+
         let mut reconciles_spawned = 0;
         for shard in tenants.values_mut() {
+            // Accumulate scheduling statistics
+            if let (Some(attached), Some(preferred)) =
+                (shard.intent.get_attached(), shard.preferred_az())
+            {
+                let node_az = nodes
+                    .get(attached)
+                    .expect("Nodes exist if referenced")
+                    .get_availability_zone_id();
+                if node_az != preferred {
+                    az_violations += 1;
+                }
+            }
+
             // Skip checking if this shard is already enqueued for reconciliation
             if shard.delayed_reconcile && self.reconciler_concurrency.available_permits() == 0 {
                 // If there is something delayed, then return a nonzero count so that
                 // callers like reconcile_all_now do not incorrectly get the impression
                 // that the system is in a quiescent state.
                 reconciles_spawned = std::cmp::max(1, reconciles_spawned);
+                pending_reconciles += 1;
                 continue;
             }
 
@@ -6031,9 +6050,22 @@ impl Service {
             // dirty, spawn another rone
             if self.maybe_reconcile_shard(shard, &pageservers).is_some() {
                 reconciles_spawned += 1;
+            } else if shard.delayed_reconcile {
+                // Shard wanted to reconcile but for some reason couldn't.
+                pending_reconciles += 1;
             }
         }
 
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_schedule_az_violation
+            .set(az_violations as i64);
+
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_pending_reconciles
+            .set(pending_reconciles as i64);
+
         reconciles_spawned
     }
 

From cd1d2d19968b197d60f122454e83cfe485af1f7d Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Mon, 2 Dec 2024 12:29:57 +0000
Subject: [PATCH 025/117] fix(proxy): forward notifications from authentication
 (#9948)

Fixes https://github.com/neondatabase/cloud/issues/20973.

This refactors `connect_raw` in order to return direct access to the
delayed notices.

I cannot find a way to test this with psycopg2 unfortunately, although
testing it with psql does return the expected results.
---
 libs/pq_proto/src/lib.rs                      |  6 +++
 .../postgres-protocol2/src/message/backend.rs |  4 ++
 .../proxy/tokio-postgres2/src/cancel_token.rs |  8 ++--
 libs/proxy/tokio-postgres2/src/client.rs      | 13 +++---
 libs/proxy/tokio-postgres2/src/config.rs      |  6 +--
 libs/proxy/tokio-postgres2/src/connect.rs     | 42 +++++++++++++++----
 libs/proxy/tokio-postgres2/src/connect_raw.rs | 37 +++++++++-------
 libs/proxy/tokio-postgres2/src/lib.rs         |  5 ++-
 proxy/src/compute.rs                          | 38 ++++++++++++-----
 proxy/src/proxy/mod.rs                        |  8 ++--
 proxy/src/proxy/tests/mod.rs                  |  8 ++--
 11 files changed, 117 insertions(+), 58 deletions(-)

diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs
index 4b0331999d..43dfbc22a4 100644
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -565,6 +565,8 @@ pub enum BeMessage<'a> {
     /// Batch of interpreted, shard filtered WAL records,
     /// ready for the pageserver to ingest
     InterpretedWalRecords(InterpretedWalRecordsBody<'a>),
+
+    Raw(u8, &'a [u8]),
 }
 
 /// Common shorthands.
@@ -754,6 +756,10 @@ impl BeMessage<'_> {
     /// one more buffer.
     pub fn write(buf: &mut BytesMut, message: &BeMessage) -> Result<(), ProtocolError> {
         match message {
+            BeMessage::Raw(code, data) => {
+                buf.put_u8(*code);
+                write_body(buf, |b| b.put_slice(data))
+            }
             BeMessage::AuthenticationOk => {
                 buf.put_u8(b'R');
                 write_body(buf, |buf| {
diff --git a/libs/proxy/postgres-protocol2/src/message/backend.rs b/libs/proxy/postgres-protocol2/src/message/backend.rs
index 356d142f3f..33d77fc252 100644
--- a/libs/proxy/postgres-protocol2/src/message/backend.rs
+++ b/libs/proxy/postgres-protocol2/src/message/backend.rs
@@ -541,6 +541,10 @@ impl NoticeResponseBody {
     pub fn fields(&self) -> ErrorFields<'_> {
         ErrorFields { buf: &self.storage }
     }
+
+    pub fn as_bytes(&self) -> &[u8] {
+        &self.storage
+    }
 }
 
 pub struct NotificationResponseBody {
diff --git a/libs/proxy/tokio-postgres2/src/cancel_token.rs b/libs/proxy/tokio-postgres2/src/cancel_token.rs
index b949bf358f..a10e8bf5c3 100644
--- a/libs/proxy/tokio-postgres2/src/cancel_token.rs
+++ b/libs/proxy/tokio-postgres2/src/cancel_token.rs
@@ -10,10 +10,10 @@ use tokio::net::TcpStream;
 /// connection.
 #[derive(Clone)]
 pub struct CancelToken {
-    pub(crate) socket_config: Option<SocketConfig>,
-    pub(crate) ssl_mode: SslMode,
-    pub(crate) process_id: i32,
-    pub(crate) secret_key: i32,
+    pub socket_config: Option<SocketConfig>,
+    pub ssl_mode: SslMode,
+    pub process_id: i32,
+    pub secret_key: i32,
 }
 
 impl CancelToken {
diff --git a/libs/proxy/tokio-postgres2/src/client.rs b/libs/proxy/tokio-postgres2/src/client.rs
index 96200b71e7..a7cd53afc3 100644
--- a/libs/proxy/tokio-postgres2/src/client.rs
+++ b/libs/proxy/tokio-postgres2/src/client.rs
@@ -138,7 +138,7 @@ impl InnerClient {
 }
 
 #[derive(Clone)]
-pub(crate) struct SocketConfig {
+pub struct SocketConfig {
     pub host: Host,
     pub port: u16,
     pub connect_timeout: Option<Duration>,
@@ -152,7 +152,7 @@ pub(crate) struct SocketConfig {
 pub struct Client {
     inner: Arc<InnerClient>,
 
-    socket_config: Option<SocketConfig>,
+    socket_config: SocketConfig,
     ssl_mode: SslMode,
     process_id: i32,
     secret_key: i32,
@@ -161,6 +161,7 @@ pub struct Client {
 impl Client {
     pub(crate) fn new(
         sender: mpsc::UnboundedSender<Request>,
+        socket_config: SocketConfig,
         ssl_mode: SslMode,
         process_id: i32,
         secret_key: i32,
@@ -172,7 +173,7 @@ impl Client {
                 buffer: Default::default(),
             }),
 
-            socket_config: None,
+            socket_config,
             ssl_mode,
             process_id,
             secret_key,
@@ -188,10 +189,6 @@ impl Client {
         &self.inner
     }
 
-    pub(crate) fn set_socket_config(&mut self, socket_config: SocketConfig) {
-        self.socket_config = Some(socket_config);
-    }
-
     /// Creates a new prepared statement.
     ///
     /// Prepared statements can be executed repeatedly, and may contain query parameters (indicated by `$1`, `$2`, etc),
@@ -412,7 +409,7 @@ impl Client {
     /// connection associated with this client.
     pub fn cancel_token(&self) -> CancelToken {
         CancelToken {
-            socket_config: self.socket_config.clone(),
+            socket_config: Some(self.socket_config.clone()),
             ssl_mode: self.ssl_mode,
             process_id: self.process_id,
             secret_key: self.secret_key,
diff --git a/libs/proxy/tokio-postgres2/src/config.rs b/libs/proxy/tokio-postgres2/src/config.rs
index 969c20ba47..26124b38ef 100644
--- a/libs/proxy/tokio-postgres2/src/config.rs
+++ b/libs/proxy/tokio-postgres2/src/config.rs
@@ -2,6 +2,7 @@
 
 use crate::connect::connect;
 use crate::connect_raw::connect_raw;
+use crate::connect_raw::RawConnection;
 use crate::tls::MakeTlsConnect;
 use crate::tls::TlsConnect;
 use crate::{Client, Connection, Error};
@@ -485,14 +486,11 @@ impl Config {
         connect(tls, self).await
     }
 
-    /// Connects to a PostgreSQL database over an arbitrary stream.
-    ///
-    /// All of the settings other than `user`, `password`, `dbname`, `options`, and `application_name` name are ignored.
     pub async fn connect_raw<S, T>(
         &self,
         stream: S,
         tls: T,
-    ) -> Result<(Client, Connection<S, T::Stream>), Error>
+    ) -> Result<RawConnection<S, T::Stream>, Error>
     where
         S: AsyncRead + AsyncWrite + Unpin,
         T: TlsConnect<S>,
diff --git a/libs/proxy/tokio-postgres2/src/connect.rs b/libs/proxy/tokio-postgres2/src/connect.rs
index 7517fe0cde..98067d91f9 100644
--- a/libs/proxy/tokio-postgres2/src/connect.rs
+++ b/libs/proxy/tokio-postgres2/src/connect.rs
@@ -1,13 +1,16 @@
 use crate::client::SocketConfig;
+use crate::codec::BackendMessage;
 use crate::config::{Host, TargetSessionAttrs};
 use crate::connect_raw::connect_raw;
 use crate::connect_socket::connect_socket;
 use crate::tls::{MakeTlsConnect, TlsConnect};
-use crate::{Client, Config, Connection, Error, SimpleQueryMessage};
+use crate::{Client, Config, Connection, Error, RawConnection, SimpleQueryMessage};
 use futures_util::{future, pin_mut, Future, FutureExt, Stream};
+use postgres_protocol2::message::backend::Message;
 use std::io;
 use std::task::Poll;
 use tokio::net::TcpStream;
+use tokio::sync::mpsc;
 
 pub async fn connect<T>(
     mut tls: T,
@@ -60,7 +63,36 @@ where
     T: TlsConnect<TcpStream>,
 {
     let socket = connect_socket(host, port, config.connect_timeout).await?;
-    let (mut client, mut connection) = connect_raw(socket, tls, config).await?;
+    let RawConnection {
+        stream,
+        parameters,
+        delayed_notice,
+        process_id,
+        secret_key,
+    } = connect_raw(socket, tls, config).await?;
+
+    let socket_config = SocketConfig {
+        host: host.clone(),
+        port,
+        connect_timeout: config.connect_timeout,
+    };
+
+    let (sender, receiver) = mpsc::unbounded_channel();
+    let client = Client::new(
+        sender,
+        socket_config,
+        config.ssl_mode,
+        process_id,
+        secret_key,
+    );
+
+    // delayed notices are always sent as "Async" messages.
+    let delayed = delayed_notice
+        .into_iter()
+        .map(|m| BackendMessage::Async(Message::NoticeResponse(m)))
+        .collect();
+
+    let mut connection = Connection::new(stream, delayed, parameters, receiver);
 
     if let TargetSessionAttrs::ReadWrite = config.target_session_attrs {
         let rows = client.simple_query_raw("SHOW transaction_read_only");
@@ -102,11 +134,5 @@ where
         }
     }
 
-    client.set_socket_config(SocketConfig {
-        host: host.clone(),
-        port,
-        connect_timeout: config.connect_timeout,
-    });
-
     Ok((client, connection))
 }
diff --git a/libs/proxy/tokio-postgres2/src/connect_raw.rs b/libs/proxy/tokio-postgres2/src/connect_raw.rs
index 80677af969..9c6f1a2552 100644
--- a/libs/proxy/tokio-postgres2/src/connect_raw.rs
+++ b/libs/proxy/tokio-postgres2/src/connect_raw.rs
@@ -3,27 +3,26 @@ use crate::config::{self, AuthKeys, Config, ReplicationMode};
 use crate::connect_tls::connect_tls;
 use crate::maybe_tls_stream::MaybeTlsStream;
 use crate::tls::{TlsConnect, TlsStream};
-use crate::{Client, Connection, Error};
+use crate::Error;
 use bytes::BytesMut;
 use fallible_iterator::FallibleIterator;
 use futures_util::{ready, Sink, SinkExt, Stream, TryStreamExt};
 use postgres_protocol2::authentication;
 use postgres_protocol2::authentication::sasl;
 use postgres_protocol2::authentication::sasl::ScramSha256;
-use postgres_protocol2::message::backend::{AuthenticationSaslBody, Message};
+use postgres_protocol2::message::backend::{AuthenticationSaslBody, Message, NoticeResponseBody};
 use postgres_protocol2::message::frontend;
-use std::collections::{HashMap, VecDeque};
+use std::collections::HashMap;
 use std::io;
 use std::pin::Pin;
 use std::task::{Context, Poll};
 use tokio::io::{AsyncRead, AsyncWrite};
-use tokio::sync::mpsc;
 use tokio_util::codec::Framed;
 
 pub struct StartupStream<S, T> {
     inner: Framed<MaybeTlsStream<S, T>, PostgresCodec>,
     buf: BackendMessages,
-    delayed: VecDeque<BackendMessage>,
+    delayed_notice: Vec<NoticeResponseBody>,
 }
 
 impl<S, T> Sink<FrontendMessage> for StartupStream<S, T>
@@ -78,11 +77,19 @@ where
     }
 }
 
+pub struct RawConnection<S, T> {
+    pub stream: Framed<MaybeTlsStream<S, T>, PostgresCodec>,
+    pub parameters: HashMap<String, String>,
+    pub delayed_notice: Vec<NoticeResponseBody>,
+    pub process_id: i32,
+    pub secret_key: i32,
+}
+
 pub async fn connect_raw<S, T>(
     stream: S,
     tls: T,
     config: &Config,
-) -> Result<(Client, Connection<S, T::Stream>), Error>
+) -> Result<RawConnection<S, T::Stream>, Error>
 where
     S: AsyncRead + AsyncWrite + Unpin,
     T: TlsConnect<S>,
@@ -97,18 +104,20 @@ where
             },
         ),
         buf: BackendMessages::empty(),
-        delayed: VecDeque::new(),
+        delayed_notice: Vec::new(),
     };
 
     startup(&mut stream, config).await?;
     authenticate(&mut stream, config).await?;
     let (process_id, secret_key, parameters) = read_info(&mut stream).await?;
 
-    let (sender, receiver) = mpsc::unbounded_channel();
-    let client = Client::new(sender, config.ssl_mode, process_id, secret_key);
-    let connection = Connection::new(stream.inner, stream.delayed, parameters, receiver);
-
-    Ok((client, connection))
+    Ok(RawConnection {
+        stream: stream.inner,
+        parameters,
+        delayed_notice: stream.delayed_notice,
+        process_id,
+        secret_key,
+    })
 }
 
 async fn startup<S, T>(stream: &mut StartupStream<S, T>, config: &Config) -> Result<(), Error>
@@ -347,9 +356,7 @@ where
                     body.value().map_err(Error::parse)?.to_string(),
                 );
             }
-            Some(msg @ Message::NoticeResponse(_)) => {
-                stream.delayed.push_back(BackendMessage::Async(msg))
-            }
+            Some(Message::NoticeResponse(body)) => stream.delayed_notice.push(body),
             Some(Message::ReadyForQuery(_)) => return Ok((process_id, secret_key, parameters)),
             Some(Message::ErrorResponse(body)) => return Err(Error::db(body)),
             Some(_) => return Err(Error::unexpected_message()),
diff --git a/libs/proxy/tokio-postgres2/src/lib.rs b/libs/proxy/tokio-postgres2/src/lib.rs
index 72ba8172b2..57c639a7de 100644
--- a/libs/proxy/tokio-postgres2/src/lib.rs
+++ b/libs/proxy/tokio-postgres2/src/lib.rs
@@ -1,9 +1,10 @@
 //! An asynchronous, pipelined, PostgreSQL client.
-#![warn(rust_2018_idioms, clippy::all, missing_docs)]
+#![warn(rust_2018_idioms, clippy::all)]
 
 pub use crate::cancel_token::CancelToken;
-pub use crate::client::Client;
+pub use crate::client::{Client, SocketConfig};
 pub use crate::config::Config;
+pub use crate::connect_raw::RawConnection;
 pub use crate::connection::Connection;
 use crate::error::DbError;
 pub use crate::error::Error;
diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index 2abe88ac88..b689b97a21 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -6,6 +6,7 @@ use std::time::Duration;
 use futures::{FutureExt, TryFutureExt};
 use itertools::Itertools;
 use once_cell::sync::OnceCell;
+use postgres_protocol::message::backend::NoticeResponseBody;
 use pq_proto::StartupMessageParams;
 use rustls::client::danger::ServerCertVerifier;
 use rustls::crypto::ring;
@@ -13,6 +14,7 @@ use rustls::pki_types::InvalidDnsNameError;
 use thiserror::Error;
 use tokio::net::TcpStream;
 use tokio_postgres::tls::MakeTlsConnect;
+use tokio_postgres::{CancelToken, RawConnection};
 use tracing::{debug, error, info, warn};
 
 use crate::auth::parse_endpoint_param;
@@ -277,6 +279,8 @@ pub(crate) struct PostgresConnection {
     pub(crate) cancel_closure: CancelClosure,
     /// Labels for proxy's metrics.
     pub(crate) aux: MetricsAuxInfo,
+    /// Notices received from compute after authenticating
+    pub(crate) delayed_notice: Vec<NoticeResponseBody>,
 
     _guage: NumDbConnectionsGuard<'static>,
 }
@@ -322,10 +326,19 @@ impl ConnCfg {
 
         // connect_raw() will not use TLS if sslmode is "disable"
         let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
-        let (client, connection) = self.0.connect_raw(stream, tls).await?;
+        let connection = self.0.connect_raw(stream, tls).await?;
         drop(pause);
-        tracing::Span::current().record("pid", tracing::field::display(client.get_process_id()));
-        let stream = connection.stream.into_inner();
+
+        let RawConnection {
+            stream,
+            parameters,
+            delayed_notice,
+            process_id,
+            secret_key,
+        } = connection;
+
+        tracing::Span::current().record("pid", tracing::field::display(process_id));
+        let stream = stream.into_inner();
 
         // TODO: lots of useful info but maybe we can move it elsewhere (eg traces?)
         info!(
@@ -334,18 +347,23 @@ impl ConnCfg {
             self.0.get_ssl_mode()
         );
 
-        // This is very ugly but as of now there's no better way to
-        // extract the connection parameters from tokio-postgres' connection.
-        // TODO: solve this problem in a more elegant manner (e.g. the new library).
-        let params = connection.parameters;
-
         // NB: CancelToken is supposed to hold socket_addr, but we use connect_raw.
         // Yet another reason to rework the connection establishing code.
-        let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token(), vec![]);
+        let cancel_closure = CancelClosure::new(
+            socket_addr,
+            CancelToken {
+                socket_config: None,
+                ssl_mode: self.0.get_ssl_mode(),
+                process_id,
+                secret_key,
+            },
+            vec![],
+        );
 
         let connection = PostgresConnection {
             stream,
-            params,
+            params: parameters,
+            delayed_notice,
             cancel_closure,
             aux,
             _guage: Metrics::get().proxy.db_connections.guard(ctx.protocol()),
diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs
index 956036d29d..af97fb3d71 100644
--- a/proxy/src/proxy/mod.rs
+++ b/proxy/src/proxy/mod.rs
@@ -384,11 +384,13 @@ pub(crate) async fn prepare_client_connection<P>(
     // The new token (cancel_key_data) will be sent to the client.
     let cancel_key_data = session.enable_query_cancellation(node.cancel_closure.clone());
 
+    // Forward all deferred notices to the client.
+    for notice in &node.delayed_notice {
+        stream.write_message_noflush(&Be::Raw(b'N', notice.as_bytes()))?;
+    }
+
     // Forward all postgres connection params to the client.
-    // Right now the implementation is very hacky and inefficent (ideally,
-    // we don't need an intermediate hashmap), but at least it should be correct.
     for (name, value) in &node.params {
-        // TODO: Theoretically, this could result in a big pile of params...
         stream.write_message_noflush(&Be::ParameterStatus {
             name: name.as_bytes(),
             value: value.as_bytes(),
diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs
index 2c2c2964b6..15be6c9724 100644
--- a/proxy/src/proxy/tests/mod.rs
+++ b/proxy/src/proxy/tests/mod.rs
@@ -233,7 +233,7 @@ async fn handshake_tls() -> anyhow::Result<()> {
         generate_tls_config("generic-project-name.localhost", "localhost")?;
     let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth));
 
-    let (_client, _conn) = tokio_postgres::Config::new()
+    let _conn = tokio_postgres::Config::new()
         .user("john_doe")
         .dbname("earth")
         .ssl_mode(SslMode::Require)
@@ -249,7 +249,7 @@ async fn handshake_raw() -> anyhow::Result<()> {
 
     let proxy = tokio::spawn(dummy_proxy(client, None, NoAuth));
 
-    let (_client, _conn) = tokio_postgres::Config::new()
+    let _conn = tokio_postgres::Config::new()
         .user("john_doe")
         .dbname("earth")
         .options("project=generic-project-name")
@@ -296,7 +296,7 @@ async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> {
         Scram::new(password).await?,
     ));
 
-    let (_client, _conn) = tokio_postgres::Config::new()
+    let _conn = tokio_postgres::Config::new()
         .channel_binding(tokio_postgres::config::ChannelBinding::Require)
         .user("user")
         .dbname("db")
@@ -320,7 +320,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> {
         Scram::new("password").await?,
     ));
 
-    let (_client, _conn) = tokio_postgres::Config::new()
+    let _conn = tokio_postgres::Config::new()
         .channel_binding(tokio_postgres::config::ChannelBinding::Disable)
         .user("user")
         .dbname("db")

From c18716bb3fdb7044588b40a9a8dc0491ac82b4e4 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Mon, 2 Dec 2024 12:46:07 +0000
Subject: [PATCH 026/117] CI(replication-tests): fix notifications about
 replication-tests failures (#9950)

## Problem

`if: ${{ github.event.schedule }}` gets skipped if a previous step has
failed, but we want to run the step for both `success` and `failure`

## Summary of changes
- Add `!cancelled()` to notification step if-condition, to skip only
cancelled jobs
---
 .github/workflows/benchmarking.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml
index ea8fee80c2..7621d72f64 100644
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -249,7 +249,7 @@ jobs:
 
     # Post both success and failure to the Slack channel
     - name: Post to a Slack channel
-      if: ${{ github.event.schedule }}
+      if: ${{ github.event.schedule && !cancelled() }}
       uses: slackapi/slack-github-action@v1
       with:
         channel-id: "C06T9AMNDQQ" # on-call-compute-staging-stream

From 1b605716362138fd415f14c84747bbe434ee05f9 Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Mon, 2 Dec 2024 16:38:12 +0100
Subject: [PATCH 027/117] proxy: Create Elasticache credentials provider lazily
 (#9967)

## Problem

The credentials providers tries to connect to AWS STS even when we use
plain Redis connections.

## Summary of changes

* Construct the CredentialsProvider only when needed ("irsa").
---
 proxy/src/bin/proxy.rs         | 49 +++++---------------------------
 proxy/src/redis/elasticache.rs | 51 ++++++++++++++++++++++++++++++----
 2 files changed, 53 insertions(+), 47 deletions(-)

diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index b772a987ee..c929b97d78 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -3,14 +3,6 @@ use std::pin::pin;
 use std::sync::Arc;
 
 use anyhow::bail;
-use aws_config::environment::EnvironmentVariableCredentialsProvider;
-use aws_config::imds::credentials::ImdsCredentialsProvider;
-use aws_config::meta::credentials::CredentialsProviderChain;
-use aws_config::meta::region::RegionProviderChain;
-use aws_config::profile::ProfileFileCredentialsProvider;
-use aws_config::provider_config::ProviderConfig;
-use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider;
-use aws_config::Region;
 use futures::future::Either;
 use proxy::auth::backend::jwt::JwkCache;
 use proxy::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned};
@@ -314,39 +306,7 @@ async fn main() -> anyhow::Result<()> {
     };
     info!("Using region: {}", args.aws_region);
 
-    let region_provider =
-        RegionProviderChain::default_provider().or_else(Region::new(args.aws_region.clone()));
-    let provider_conf =
-        ProviderConfig::without_region().with_region(region_provider.region().await);
-    let aws_credentials_provider = {
-        // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
-        CredentialsProviderChain::first_try("env", EnvironmentVariableCredentialsProvider::new())
-            // uses "AWS_PROFILE" / `aws sso login --profile <profile>`
-            .or_else(
-                "profile-sso",
-                ProfileFileCredentialsProvider::builder()
-                    .configure(&provider_conf)
-                    .build(),
-            )
-            // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
-            // needed to access remote extensions bucket
-            .or_else(
-                "token",
-                WebIdentityTokenCredentialsProvider::builder()
-                    .configure(&provider_conf)
-                    .build(),
-            )
-            // uses imds v2
-            .or_else("imds", ImdsCredentialsProvider::builder().build())
-    };
-    let elasticache_credentials_provider = Arc::new(elasticache::CredentialsProvider::new(
-        elasticache::AWSIRSAConfig::new(
-            args.aws_region.clone(),
-            args.redis_cluster_name,
-            args.redis_user_id,
-        ),
-        aws_credentials_provider,
-    ));
+    // TODO: untangle the config args
     let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) {
         ("plain", redis_url) => match redis_url {
             None => {
@@ -361,7 +321,12 @@ async fn main() -> anyhow::Result<()> {
                 ConnectionWithCredentialsProvider::new_with_credentials_provider(
                     host.to_string(),
                     port,
-                    elasticache_credentials_provider.clone(),
+                    elasticache::CredentialsProvider::new(
+                        args.aws_region,
+                        args.redis_cluster_name,
+                        args.redis_user_id,
+                    )
+                    .await,
                 ),
             ),
             (None, None) => {
diff --git a/proxy/src/redis/elasticache.rs b/proxy/src/redis/elasticache.rs
index d118c8f412..bf6dde9332 100644
--- a/proxy/src/redis/elasticache.rs
+++ b/proxy/src/redis/elasticache.rs
@@ -1,6 +1,14 @@
+use std::sync::Arc;
 use std::time::{Duration, SystemTime};
 
+use aws_config::environment::EnvironmentVariableCredentialsProvider;
+use aws_config::imds::credentials::ImdsCredentialsProvider;
 use aws_config::meta::credentials::CredentialsProviderChain;
+use aws_config::meta::region::RegionProviderChain;
+use aws_config::profile::ProfileFileCredentialsProvider;
+use aws_config::provider_config::ProviderConfig;
+use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider;
+use aws_config::Region;
 use aws_sdk_iam::config::ProvideCredentials;
 use aws_sigv4::http_request::{
     self, SignableBody, SignableRequest, SignatureLocation, SigningSettings,
@@ -45,12 +53,45 @@ pub struct CredentialsProvider {
 }
 
 impl CredentialsProvider {
-    pub fn new(config: AWSIRSAConfig, credentials_provider: CredentialsProviderChain) -> Self {
-        CredentialsProvider {
-            config,
-            credentials_provider,
-        }
+    pub async fn new(
+        aws_region: String,
+        redis_cluster_name: Option<String>,
+        redis_user_id: Option<String>,
+    ) -> Arc<CredentialsProvider> {
+        let region_provider =
+            RegionProviderChain::default_provider().or_else(Region::new(aws_region.clone()));
+        let provider_conf =
+            ProviderConfig::without_region().with_region(region_provider.region().await);
+        let aws_credentials_provider = {
+            // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
+            CredentialsProviderChain::first_try(
+                "env",
+                EnvironmentVariableCredentialsProvider::new(),
+            )
+            // uses "AWS_PROFILE" / `aws sso login --profile <profile>`
+            .or_else(
+                "profile-sso",
+                ProfileFileCredentialsProvider::builder()
+                    .configure(&provider_conf)
+                    .build(),
+            )
+            // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
+            // needed to access remote extensions bucket
+            .or_else(
+                "token",
+                WebIdentityTokenCredentialsProvider::builder()
+                    .configure(&provider_conf)
+                    .build(),
+            )
+            // uses imds v2
+            .or_else("imds", ImdsCredentialsProvider::builder().build())
+        };
+        Arc::new(CredentialsProvider {
+            config: AWSIRSAConfig::new(aws_region, redis_cluster_name, redis_user_id),
+            credentials_provider: aws_credentials_provider,
+        })
     }
+
     pub(crate) async fn provide_credentials(&self) -> anyhow::Result<(String, String)> {
         let aws_credentials = self
             .credentials_provider

From fa909c27fc23be35e889b591a4eae028bd43434d Mon Sep 17 00:00:00 2001
From: Arseny Sher <ars@neon.tech>
Date: Mon, 2 Dec 2024 19:10:44 +0300
Subject: [PATCH 028/117] Update consensus protocol spec (#9607)

The spec was written for the buggy protocol which we had before the one
more similar to Raft was implemented. Update the spec with what we
currently have.

ref https://github.com/neondatabase/neon/issues/8699
---
 safekeeper/spec/.gitignore                    |    3 +
 safekeeper/spec/MCProposerAcceptorStatic.tla  |   31 +
 safekeeper/spec/ProposerAcceptorConsensus.cfg |   34 -
 safekeeper/spec/ProposerAcceptorConsensus.tla |  363 ----
 safekeeper/spec/ProposerAcceptorStatic.tla    |  449 +++++
 safekeeper/spec/modelcheck.sh                 |   49 +
 .../MCProposerAcceptorStatic_p2_a3_t2_l2.cfg  |   19 +
 .../MCProposerAcceptorStatic_p2_a3_t3_l2.cfg  |   19 +
 .../MCProposerAcceptorStatic_p2_a3_t3_l3.cfg  |   17 +
 .../MCProposerAcceptorStatic_p2_a3_t4_l4.cfg  |   17 +
 .../MCProposerAcceptorStatic_p2_a5_t2_l2.cfg  |   16 +
 .../MCProposerAcceptorStatic_p2_a5_t3_l3.cfg  |   16 +
 .../MCProposerAcceptorStatic_p2_a5_t4_l3.cfg  |   16 +
 safekeeper/spec/readme.md                     |   12 +
 ...c_p2_a3_t2_l2.cfg-2024-11-06--13-44-17.log |   63 +
 ...c_p2_a3_t3_l2.cfg-2024-11-15--09-09-58.log |   69 +
 ...c_p2_a3_t3_l3.cfg-2024-11-06--13-03-51.log |   72 +
 ...c_p2_a3_t4_l4.cfg-2024-11-06--14-20-25.log | 1466 +++++++++++++++++
 ...c_p2_a3_t4_l4.cfg-2024-11-06--15-30-45.log | 1374 +++++++++++++++
 ...c_p2_a5_t2_l2.cfg-2024-11-06--12-09-32.log |   89 +
 20 files changed, 3797 insertions(+), 397 deletions(-)
 create mode 100644 safekeeper/spec/.gitignore
 create mode 100644 safekeeper/spec/MCProposerAcceptorStatic.tla
 delete mode 100644 safekeeper/spec/ProposerAcceptorConsensus.cfg
 delete mode 100644 safekeeper/spec/ProposerAcceptorConsensus.tla
 create mode 100644 safekeeper/spec/ProposerAcceptorStatic.tla
 create mode 100755 safekeeper/spec/modelcheck.sh
 create mode 100644 safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t2_l2.cfg
 create mode 100644 safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l2.cfg
 create mode 100644 safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg
 create mode 100644 safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t4_l4.cfg
 create mode 100644 safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t2_l2.cfg
 create mode 100644 safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t3_l3.cfg
 create mode 100644 safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t4_l3.cfg
 create mode 100644 safekeeper/spec/readme.md
 create mode 100644 safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t2_l2.cfg-2024-11-06--13-44-17.log
 create mode 100644 safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l2.cfg-2024-11-15--09-09-58.log
 create mode 100644 safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l3.cfg-2024-11-06--13-03-51.log
 create mode 100644 safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--14-20-25.log
 create mode 100644 safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--15-30-45.log
 create mode 100644 safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a5_t2_l2.cfg-2024-11-06--12-09-32.log

diff --git a/safekeeper/spec/.gitignore b/safekeeper/spec/.gitignore
new file mode 100644
index 0000000000..7233153039
--- /dev/null
+++ b/safekeeper/spec/.gitignore
@@ -0,0 +1,3 @@
+*TTrace*
+*.toolbox/
+states/
diff --git a/safekeeper/spec/MCProposerAcceptorStatic.tla b/safekeeper/spec/MCProposerAcceptorStatic.tla
new file mode 100644
index 0000000000..be3d99c697
--- /dev/null
+++ b/safekeeper/spec/MCProposerAcceptorStatic.tla
@@ -0,0 +1,31 @@
+---- MODULE MCProposerAcceptorStatic ----
+EXTENDS TLC, ProposerAcceptorStatic
+
+\* Augments the spec with model checking constraints.
+
+\* For model checking.
+CONSTANTS
+  max_entries, \* model constraint: max log entries acceptor/proposer can hold
+  max_term \* model constraint: max allowed term
+
+ASSUME max_entries \in Nat /\ max_term \in Nat
+
+\* Model space constraint.
+StateConstraint == \A p \in proposers:
+                    /\ prop_state[p].term <= max_term
+                    /\ Len(prop_state[p].wal) <= max_entries
+\* Sets of proposers and acceptors are symmetric because we don't take any
+\* actions depending on some concrete proposer/acceptor (like IF p = p1 THEN
+\* ...)
+ProposerAcceptorSymmetry == Permutations(proposers) \union Permutations(acceptors)
+
+\* enforce order of the vars in the error trace with ALIAS
+\* Note that ALIAS is supported only since version 1.8.0 which is pre-release
+\* as of writing this.
+Alias == [
+           prop_state |-> prop_state,
+           acc_state |-> acc_state,
+           committed |-> committed
+         ]
+
+====
diff --git a/safekeeper/spec/ProposerAcceptorConsensus.cfg b/safekeeper/spec/ProposerAcceptorConsensus.cfg
deleted file mode 100644
index 989c86e47d..0000000000
--- a/safekeeper/spec/ProposerAcceptorConsensus.cfg
+++ /dev/null
@@ -1,34 +0,0 @@
-\* MV CONSTANT declarations
-CONSTANT NULL = NULL
-CONSTANTS
-p1 = p1
-p2 = p2
-p3 = p3
-a1 = a1
-a2 = a2
-a3 = a3
-\* MV CONSTANT definitions
-CONSTANT
-proposers = {p1, p2}
-acceptors = {a1, a2, a3}
-\* SYMMETRY definition
-SYMMETRY perms
-\* CONSTANT definitions
-CONSTANT
-max_term = 3
-CONSTANT
-max_entries = 3
-\* INIT definition
-INIT
-Init
-\* NEXT definition
-NEXT
-Next
-\* INVARIANT definition
-INVARIANT
-TypeOk
-ElectionSafety
-LogIsMonotonic
-LogSafety
-CommittedNotOverwritten
-CHECK_DEADLOCK FALSE
\ No newline at end of file
diff --git a/safekeeper/spec/ProposerAcceptorConsensus.tla b/safekeeper/spec/ProposerAcceptorConsensus.tla
deleted file mode 100644
index e5f0bb270f..0000000000
--- a/safekeeper/spec/ProposerAcceptorConsensus.tla
+++ /dev/null
@@ -1,363 +0,0 @@
----- MODULE ProposerAcceptorConsensus ----
-
-\* Differences from current implementation:
-\* - unified not-globally-unique epoch & term (node_id)
-\* Simplifications:
-\* - instant message delivery
-\* - feedback is not modeled separately, commit_lsn is updated directly
-
-EXTENDS Integers, Sequences, FiniteSets, TLC
-
-VARIABLES
-  prop_state, \* prop_state[p] is state of proposer p
-  acc_state, \* acc_state[a] is state of acceptor a
-  commit_lsns \* map of acceptor -> commit_lsn
-
-CONSTANT
-  acceptors,
-  proposers,
-  max_entries, \* model constraint: max log entries acceptor/proposer can hold
-  max_term \* model constraint: max allowed term
-
-CONSTANT NULL
-
-ASSUME max_entries \in Nat /\ max_term \in Nat
-
-\* For specifying symmetry set in manual cfg file, see
-\* https://github.com/tlaplus/tlaplus/issues/404
-perms == Permutations(proposers) \union Permutations(acceptors)
-
-\********************************************************************************
-\* Helpers
-\********************************************************************************
-
-Maximum(S) ==
-  (*************************************************************************)
-  (* If S is a set of numbers, then this define Maximum(S) to be the       *)
-  (* maximum of those numbers, or -1 if S is empty.                        *)
-  (*************************************************************************)
-  IF S = {} THEN -1
-            ELSE CHOOSE n \in S : \A m \in S : n \geq m
-
-\* minimum of numbers in the set, error if set is empty
-Minimum(S) ==
-  CHOOSE min \in S : \A n \in S : min <= n
-
-\* Min of two numbers
-Min(a, b) == IF a < b THEN a ELSE b
-
-\* Set of values of function f. XXX is there a such builtin?
-FValues(f) == {f[a] : a \in DOMAIN f}
-
-\* Sort of 0 for functions
-EmptyF == [x \in {} |-> 42]
-IsEmptyF(f) == DOMAIN f = {}
-
-\* Next entry proposer p will push to acceptor a or NULL.
-NextEntry(p, a) ==
-  IF Len(prop_state[p].wal) >= prop_state[p].next_send_lsn[a] THEN
-    CHOOSE r \in FValues(prop_state[p].wal) : r.lsn = prop_state[p].next_send_lsn[a]
-  ELSE
-    NULL
-
-
-\*****************
-
-NumAccs == Cardinality(acceptors)
-
-\* does acc_set form the quorum?
-Quorum(acc_set) == Cardinality(acc_set) >= (NumAccs \div 2 + 1)
-\* all quorums of acceptors
-Quorums == {subset \in SUBSET acceptors: Quorum(subset)}
-
-\* flush_lsn of acceptor a.
-FlushLsn(a) == Len(acc_state[a].wal)
-
-
-\********************************************************************************
-\* Type assertion
-\********************************************************************************
-\* Defining sets of all possible tuples and using them in TypeOk in usual
-\* all-tuples constructor is not practical because such definitions force
-\* TLC to enumerate them, while they are are horribly enormous
-\* (TLC screams "Attempted to construct a set with too many elements").
-\* So instead check types manually.
-TypeOk ==
-    /\ \A p \in proposers:
-      /\ DOMAIN prop_state[p] = {"state", "term", "votes", "donor_epoch", "vcl", "wal", "next_send_lsn"}
-      \* in campaign proposer sends RequestVote and waits for acks;
-      \* in leader he is elected
-      /\ prop_state[p].state \in {"campaign", "leader"}
-      \* 0..max_term should be actually Nat in the unbounded model, but TLC won't
-      \* swallow it
-      /\ prop_state[p].term \in 0..max_term
-      \* votes received
-      /\ \A voter \in DOMAIN prop_state[p].votes:
-         /\ voter \in acceptors
-         /\ prop_state[p].votes[voter] \in [epoch: 0..max_term, flush_lsn: 0..max_entries]
-      /\ prop_state[p].donor_epoch \in 0..max_term
-      \* wal is sequence of just <lsn, epoch of author> records
-      /\ \A i \in DOMAIN prop_state[p].wal:
-           prop_state[p].wal[i] \in [lsn: 1..max_entries, epoch: 1..max_term]
-      \* Following implementation, we skew the original Aurora meaning of this;
-      \* here it is lsn of highest definitely committed record as set by proposer
-      \* when it is elected; it doesn't change since then
-      /\ prop_state[p].vcl \in 0..max_entries
-      \* map of acceptor -> next lsn to send
-      /\ \A a \in DOMAIN prop_state[p].next_send_lsn:
-         /\ a \in acceptors
-         /\ prop_state[p].next_send_lsn[a] \in 1..(max_entries + 1)
-    /\ \A a \in acceptors:
-      /\ DOMAIN acc_state[a] = {"term", "epoch", "wal"}
-      /\ acc_state[a].term \in 0..max_term
-      /\ acc_state[a].epoch \in 0..max_term
-      /\ \A i \in DOMAIN acc_state[a].wal:
-           acc_state[a].wal[i] \in [lsn: 1..max_entries, epoch: 1..max_term]
-    /\ \A a \in DOMAIN commit_lsns:
-      /\ a \in acceptors
-      /\ commit_lsns[a] \in 0..max_entries
-
-\********************************************************************************
-\* Initial
-\********************************************************************************
-
-Init ==
-  /\ prop_state = [p \in proposers |-> [
-                      state |-> "campaign",
-                      term |-> 1,
-                      votes |-> EmptyF,
-                      donor_epoch |-> 0,
-                      vcl |-> 0,
-                      wal |-> << >>,
-                      next_send_lsn |-> EmptyF
-                  ]]
-  /\ acc_state = [a \in acceptors |-> [
-                    \* there will be no leader in this term, 1 is the first real
-                    term |-> 0,
-                    epoch |-> 0,
-                    wal |-> << >>
-                 ]]
-  /\ commit_lsns = [a \in acceptors |-> 0]
-
-
-\********************************************************************************
-\* Actions
-\********************************************************************************
-
-\* Proposer loses all state.
-\* For simplicity (and to reduct state space), we assume it immediately gets
-\* current state from quorum q of acceptors determining the term he will request
-\* to vote for.
-RestartProposer(p, q) ==
-  /\ Quorum(q)
-  /\ LET
-       new_term == Maximum({acc_state[a].term : a \in q}) + 1
-     IN
-       /\ new_term <= max_term
-       /\ prop_state' = [prop_state EXCEPT ![p].state = "campaign",
-                                           ![p].term = new_term,
-                                           ![p].votes = EmptyF,
-                                           ![p].donor_epoch = 0,
-                                           ![p].vcl = 0,
-                                           ![p].wal = << >>,
-                                           ![p].next_send_lsn = EmptyF]
-       /\ UNCHANGED <<acc_state, commit_lsns>>
-
-\* Acceptor a immediately votes for proposer p.
-Vote(p, a) ==
- /\ prop_state[p].state = "campaign"
- /\ acc_state[a].term < prop_state[p].term \* main voting condition
- /\ acc_state' = [acc_state EXCEPT ![a].term = prop_state[p].term]
- /\ LET
-      vote == [epoch |-> acc_state[a].epoch, flush_lsn |-> FlushLsn(a)]
-    IN
-      prop_state' = [prop_state EXCEPT ![p].votes = prop_state[p].votes @@ (a :> vote)]
- /\ UNCHANGED <<commit_lsns>>
-
-
-\* Proposer p gets elected.
-BecomeLeader(p) ==
-  /\ prop_state[p].state = "campaign"
-  /\ Quorum(DOMAIN prop_state[p].votes)
-  /\ LET
-       max_epoch == Maximum({v.epoch : v \in FValues(prop_state[p].votes)})
-       max_epoch_votes == {v \in FValues(prop_state[p].votes) : v.epoch = max_epoch}
-       donor == CHOOSE dv \in DOMAIN prop_state[p].votes :
-                     /\ prop_state[p].votes[dv].epoch = max_epoch
-                     /\ \A v \in max_epoch_votes:
-                       prop_state[p].votes[dv].flush_lsn >= v.flush_lsn
-       max_vote == prop_state[p].votes[donor]
-       \* Establish lsn to stream from for voters.
-       \* At some point it seemed like we can regard log as correct and only
-       \* append to it if has in the max_epoch, however TLC showed that's not
-       \* the case; we must always stream since first not matching record.
-       next_send_lsn == [voter \in DOMAIN prop_state[p].votes |-> 1]
-     IN
-          \* we fetch log from the most advanced node (this is separate
-          \* roundtrip), make sure node is still on one term with us
-       /\ acc_state[donor].term = prop_state[p].term
-       /\ prop_state' = [prop_state EXCEPT ![p].state = "leader",
-                                           \* fetch the log from donor
-                                           ![p].wal = acc_state[donor].wal,
-                                           ![p].donor_epoch = max_epoch,
-                                           ![p].vcl = max_vote.flush_lsn,
-                                           ![p].next_send_lsn = next_send_lsn]
-       /\ UNCHANGED <<acc_state, commit_lsns>>
-
-
-\* acceptor a learns about elected proposer p's term.
-UpdateTerm(p, a) ==
-  /\ prop_state[p].state = "leader"
-  /\ acc_state[a].term < prop_state[p].term
-  /\ acc_state' = [acc_state EXCEPT ![a].term = prop_state[p].term]
-  /\ UNCHANGED <<prop_state, commit_lsns>>
-
-
-\* Acceptor a which didn't participate in voting connects to elected proposer p
-\* and p sets the streaming point
-HandshakeWithLeader(p, a) ==
-  /\ prop_state[p].state = "leader"
-  /\ acc_state[a].term = prop_state[p].term
-  /\ a \notin DOMAIN prop_state[p].next_send_lsn
-  /\ LET
-       next_send_lsn == prop_state[p].next_send_lsn @@ (a :> 1)
-     IN
-       prop_state' = [prop_state EXCEPT ![p].next_send_lsn = next_send_lsn]
-  /\ UNCHANGED <<acc_state, commit_lsns>>
-
-
-\* Append new log entry to elected proposer
-NewEntry(p) ==
-  /\ prop_state[p].state = "leader"
-  /\ Len(prop_state[p].wal) < max_entries \* model constraint
-  /\ LET
-       new_lsn == IF Len(prop_state[p].wal) = 0 THEN
-                    prop_state[p].vcl + 1
-                  ELSE
-                    \* lsn of last record + 1
-                    prop_state[p].wal[Len(prop_state[p].wal)].lsn + 1
-       new_entry == [lsn |-> new_lsn, epoch |-> prop_state[p].term]
-     IN
-       /\ prop_state' = [prop_state EXCEPT ![p].wal = Append(prop_state[p].wal, new_entry)]
-       /\ UNCHANGED <<acc_state, commit_lsns>>
-
-
-\* Write entry new_e to log wal, rolling back all higher entries if e is different.
-\* If bump_epoch is TRUE, it means we get record with lsn=vcl and going to update
-\* the epoch. Truncate log in this case as well, as we might have correct <= vcl
-\* part and some outdated entries behind it which we want to purge before
-\* declaring us as recovered. Another way to accomplish this (in previous commit)
-\* is wait for first-entry-from-new-epoch before bumping it.
-WriteEntry(wal, new_e, bump_epoch) ==
-  (new_e.lsn :> new_e) @@
-  \* If wal has entry with such lsn and it is different, truncate all higher log.
-  IF \/ (new_e.lsn \in DOMAIN wal /\ wal[new_e.lsn] /= new_e)
-     \/ bump_epoch THEN
-    SelectSeq(wal, LAMBDA e: e.lsn < new_e.lsn)
-  ELSE
-    wal
-
-
-\* Try to transfer entry from elected proposer p to acceptor a
-TransferEntry(p, a) ==
-  /\ prop_state[p].state = "leader"
-  /\ prop_state[p].term = acc_state[a].term
-  /\ a \in DOMAIN prop_state[p].next_send_lsn
-  /\ LET
-       next_e == NextEntry(p, a)
-     IN
-       /\ next_e /= NULL
-       /\ LET
-            \* Consider bumping epoch if getting this entry recovers the acceptor,
-            \* that is, we reach first record behind VCL.
-            new_epoch ==
-              IF /\ acc_state[a].epoch < prop_state[p].term
-                 /\ next_e.lsn >= prop_state[p].vcl
-              THEN
-                prop_state[p].term
-              ELSE
-                acc_state[a].epoch
-            \* Also check whether this entry allows to advance commit_lsn and
-            \* if so, bump it where possible. Modeling this as separate action
-            \* significantly bloats the space (5m vs 15m on max_entries=3 max_term=3,
-            \* so act immediately.
-            entry_owners == {o \in acceptors:
-                               /\ o /= a
-                               \* only recovered acceptors advance commit_lsn
-                               /\ acc_state[o].epoch = prop_state[p].term
-                               /\ next_e \in FValues(acc_state[o].wal)} \cup {a}
-          IN
-            /\ acc_state' = [acc_state EXCEPT ![a].wal = WriteEntry(acc_state[a].wal, next_e, new_epoch /= acc_state[a].epoch),
-                                              ![a].epoch = new_epoch]
-            /\ prop_state' = [prop_state EXCEPT ![p].next_send_lsn[a] =
-                                                  prop_state[p].next_send_lsn[a] + 1]
-            /\ commit_lsns' = IF /\ new_epoch = prop_state[p].term
-                                 /\ Quorum(entry_owners)
-                              THEN
-                                [acc \in acceptors |->
-                                   IF /\ acc \in entry_owners
-                                      /\ next_e.lsn > commit_lsns[acc]
-                                   THEN
-                                     next_e.lsn
-                                   ELSE
-                                       commit_lsns[acc]]
-                              ELSE
-                                commit_lsns
-
-
-\*******************************************************************************
-\* Final spec
-\*******************************************************************************
-
-Next ==
-  \/ \E q \in Quorums: \E p \in proposers: RestartProposer(p, q)
-  \/ \E p \in proposers: \E a \in acceptors: Vote(p, a)
-  \/ \E p \in proposers: BecomeLeader(p)
-  \/ \E p \in proposers: \E a \in acceptors: UpdateTerm(p, a)
-  \/ \E p \in proposers: \E a \in acceptors: HandshakeWithLeader(p, a)
-  \/ \E p \in proposers: NewEntry(p)
-  \/ \E p \in proposers: \E a \in acceptors: TransferEntry(p, a)
-
-Spec == Init /\ [][Next]_<<prop_state, acc_state, commit_lsns>>
-
-
-\********************************************************************************
-\* Invariants
-\********************************************************************************
-
-\* we don't track history, but this property is fairly convincing anyway
-ElectionSafety ==
-  \A p1, p2 \in proposers:
-    (/\ prop_state[p1].state = "leader"
-     /\ prop_state[p2].state = "leader"
-     /\ prop_state[p1].term = prop_state[p2].term) => (p1 = p2)
-
-LogIsMonotonic ==
-  \A a \in acceptors:
-    \A i \in DOMAIN acc_state[a].wal: \A j \in DOMAIN acc_state[a].wal:
-      (i > j) => (/\ acc_state[a].wal[i].lsn > acc_state[a].wal[j].lsn
-                  /\ acc_state[a].wal[i].epoch >= acc_state[a].wal[j].epoch)
-
-\* Main invariant: log under commit_lsn must match everywhere.
-LogSafety ==
-  \A a1 \in acceptors: \A a2 \in acceptors:
-    LET
-      common_len == Min(commit_lsns[a1], commit_lsns[a2])
-    IN
-      SubSeq(acc_state[a1].wal, 1, common_len) = SubSeq(acc_state[a2].wal, 1, common_len)
-
-\* Next record we are going to push to acceptor must never overwrite committed
-\* different record.
-CommittedNotOverwritten ==
-  \A p \in proposers: \A a \in acceptors:
-    (/\ prop_state[p].state = "leader"
-     /\ prop_state[p].term = acc_state[a].term
-     /\ a \in DOMAIN prop_state[p].next_send_lsn) =>
-       LET
-         next_e == NextEntry(p, a)
-       IN
-         (next_e /= NULL) =>
-          ((commit_lsns[a] >= next_e.lsn) => (acc_state[a].wal[next_e.lsn] = next_e))
-
-
-====
\ No newline at end of file
diff --git a/safekeeper/spec/ProposerAcceptorStatic.tla b/safekeeper/spec/ProposerAcceptorStatic.tla
new file mode 100644
index 0000000000..b2d2f005db
--- /dev/null
+++ b/safekeeper/spec/ProposerAcceptorStatic.tla
@@ -0,0 +1,449 @@
+---- MODULE ProposerAcceptorStatic ----
+
+(*
+  The protocol is very similar to Raft. The key differences are:
+  - Leaders (proposers) are separated from storage nodes (acceptors), which has
+    been already an established way to think about Paxos.
+  - We don't want to stamp each log record with term, so instead carry around
+    term histories which are sequences of <term, LSN where term begins> pairs.
+    As a bonus (and subtlety) this allows the proposer to commit entries from
+    previous terms without writing new records -- if acceptor's log is caught
+    up, update of term history on it updates last_log_term as well.
+*)
+
+\* Model simplifications:
+\* - Instant message delivery. Notably, ProposerElected message (TruncateWal action) is not
+\*   delayed, so we don't attempt to truncate WAL when the same wp already appended something
+\*   on the acceptor since common point had been calculated (this should be rejected).
+\* - old WAL is immediately copied to proposer on its election, without on-demand fetch later.
+
+\* Some ideas how to break it to play around to get a feeling:
+\* - replace Quorums with BadQuorums.
+\* - remove 'don't commit entries from previous terms separately' rule in
+\*   CommitEntries and observe figure 8 from the raft paper.
+\*   With p2a3t4l4 32 steps error was found in 1h on 80 cores.
+
+EXTENDS Integers, Sequences, FiniteSets, TLC
+
+VARIABLES
+  prop_state, \* prop_state[p] is state of proposer p
+  acc_state, \* acc_state[a] is state of acceptor a
+  committed, \* bag (set) of ever committed <<term, lsn>> entries
+  elected_history \* counter for elected terms, see TypeOk for details
+
+CONSTANT
+  acceptors,
+  proposers
+
+CONSTANT NULL
+
+\********************************************************************************
+\* Helpers
+\********************************************************************************
+
+Maximum(S) ==
+  (*************************************************************************)
+  (* If S is a set of numbers, then this define Maximum(S) to be the       *)
+  (* maximum of those numbers, or -1 if S is empty.                        *)
+  (*************************************************************************)
+  IF S = {} THEN -1 ELSE CHOOSE n \in S : \A m \in S : n \geq m
+
+\* minimum of numbers in the set, error if set is empty
+Minimum(S) == CHOOSE min \in S : \A n \in S : min <= n
+
+\* Min of two numbers
+Min(a, b) == IF a < b THEN a ELSE b
+
+\* Sort of 0 for functions
+EmptyF == [x \in {} |-> 42]
+IsEmptyF(f) == DOMAIN f = {}
+
+\* Set of values (image) of the function f. Apparently no such builtin.
+Range(f) == {f[x] : x \in DOMAIN f}
+
+\* If key k is in function f, map it using l, otherwise insert v. Returns the
+\* updated function.
+Upsert(f, k, v, l(_)) ==
+    LET new_val == IF k \in DOMAIN f THEN l(f[k]) ELSE v IN
+        (k :> new_val) @@ f
+
+\*****************
+
+NumAccs == Cardinality(acceptors)
+
+\* does acc_set form the quorum?
+Quorum(acc_set) == Cardinality(acc_set) >= (NumAccs \div 2 + 1)
+\* all quorums of acceptors
+Quorums == {subset \in SUBSET acceptors: Quorum(subset)}
+
+\* For substituting Quorums and seeing what happens.
+BadQuorum(acc_set) == Cardinality(acc_set) >= (NumAccs \div 2)
+BadQuorums == {subset \in SUBSET acceptors: BadQuorum(subset)}
+
+\* flushLsn (end of WAL, i.e. index of next entry) of acceptor a.
+FlushLsn(a) == Len(acc_state[a].wal) + 1
+
+\* Typedefs. Note that TLA+ Nat includes zero.
+Terms == Nat
+Lsns == Nat
+
+\********************************************************************************
+\* Type assertion
+\********************************************************************************
+\* Defining sets of all possible tuples and using them in TypeOk in usual
+\* all-tuples constructor is not practical because such definitions force
+\* TLC to enumerate them, while they are are horribly enormous
+\* (TLC screams "Attempted to construct a set with too many elements").
+\* So instead check types manually.
+
+
+\* Term history is a sequence of <term, LSN where term begins> pairs.
+IsTermHistory(th) ==
+    \A th_entry \in Range(th): th_entry.term \in Terms /\ th_entry.lsn \in Lsns
+
+IsWal(w) ==
+    \A i \in DOMAIN w:
+        /\ i \in Lsns
+        /\ w[i] \in Terms
+
+TypeOk ==
+    /\ \A p \in proposers:
+        \* '_' in field names hinders pretty printing
+        \* https://github.com/tlaplus/tlaplus/issues/1051
+        \* so use camel case.
+        /\ DOMAIN prop_state[p] = {"state", "term", "votes", "termHistory", "wal", "nextSendLsn"}
+        \* In campaign proposer sends RequestVote and waits for acks;
+        \* in leader he is elected.
+        /\ prop_state[p].state \in {"campaign", "leader"}
+        \* term for which it will campaign, or won term in leader state
+        /\ prop_state[p].term \in Terms
+        \* votes received
+        /\ \A voter \in DOMAIN prop_state[p].votes: voter \in acceptors
+        /\ \A vote \in Range(prop_state[p].votes):
+               /\ IsTermHistory(vote.termHistory)
+               /\ vote.flushLsn \in Lsns
+        \* Proposer's term history. Empty while proposer is in "campaign".
+        /\ IsTermHistory(prop_state[p].termHistory)
+        \* In the model we identify WAL entries only by <term, LSN> pairs
+        \* without additional unique id, which is enough for its purposes.
+        \* It means that with term history fully modeled wal becomes
+        \* redundant as it can be computed from term history + WAL length.
+        \* However, we still keep it here and at acceptors as explicit sequence
+        \* where index is LSN and value is the term to avoid artificial mapping to
+        \* figure out real entries. It shouldn't bloat model much because this
+        \* doesn't increase number of distinct states.
+        /\ IsWal(prop_state[p].wal)
+        \* Map of acceptor -> next lsn to send. It is set when truncate_wal is
+        \* done so sending entries is allowed only after that. In the impl TCP
+        \* ensures this ordering.
+        /\ \A a \in DOMAIN prop_state[p].nextSendLsn:
+               /\ a \in acceptors
+               /\ prop_state[p].nextSendLsn[a] \in Lsns
+    /\ \A a \in acceptors:
+           /\ DOMAIN acc_state[a] = {"term", "termHistory", "wal"}
+           /\ acc_state[a].term \in Terms
+           /\ IsTermHistory(acc_state[a].termHistory)
+           /\ IsWal(acc_state[a].wal)
+    /\ \A c \in committed:
+           /\ c.term \in Terms
+           /\ c.lsn \in Lsns
+    \* elected_history is a retrospective map of term -> number of times it was
+    \* elected, for use in ElectionSafetyFull invariant. For static spec it is
+    \* fairly convincing that it holds, but with membership change it is less
+    \* trivial. And as we identify log entries only with <term, lsn>, importance
+    \* of it is quite high as violation of log safety might go undetected if
+    \* election safety is violated. Note though that this is not always the
+    \* case, i.e. you can imagine (and TLC should find) schedule where log
+    \* safety violation is still detected because two leaders with the same term
+    \* commit histories which are different in previous terms, so it is not that
+    \* crucial. Plus if spec allows ElectionSafetyFull violation, likely
+    \* ElectionSafety will also be violated in some schedules. But neither it
+    \* should bloat the model too much.
+    /\ \A term \in DOMAIN elected_history:
+           /\ term \in Terms
+           /\ elected_history[term] \in Nat
+
+\********************************************************************************
+\* Initial
+\********************************************************************************
+
+Init ==
+    /\ prop_state = [p \in proposers |-> [
+                        state |-> "campaign",
+                        term |-> 1,
+                        votes |-> EmptyF,
+                        termHistory |-> << >>,
+                        wal |-> << >>,
+                        nextSendLsn |-> EmptyF
+                    ]]
+    /\ acc_state = [a \in acceptors |-> [
+                       \* There will be no leader in zero term, 1 is the first
+                       \* real.
+                       term |-> 0,
+                       \* Again, leader in term 0 doesn't exist, but we initialize
+                       \* term histories with it to always have common point in
+                       \* them. Lsn is 1 because TLA+ sequences are indexed from 1
+                       \* (we don't want to truncate WAL out of range).
+                       termHistory |-> << [term |-> 0, lsn |-> 1] >>,
+                       wal |-> << >>
+                   ]]
+    /\ committed = {}
+    /\ elected_history = EmptyF
+
+
+\********************************************************************************
+\* Actions
+\********************************************************************************
+
+\* Proposer loses all state.
+\* For simplicity (and to reduct state space), we assume it immediately gets
+\* current state from quorum q of acceptors determining the term he will request
+\* to vote for.
+RestartProposer(p, q) ==
+    /\ Quorum(q)
+    /\ LET new_term == Maximum({acc_state[a].term : a \in q}) + 1 IN
+           /\ prop_state' = [prop_state EXCEPT ![p].state = "campaign",
+                                               ![p].term = new_term,
+                                               ![p].votes = EmptyF,
+                                               ![p].termHistory = << >>,
+                                               ![p].wal = << >>,
+                                               ![p].nextSendLsn = EmptyF]
+           /\ UNCHANGED <<acc_state, committed, elected_history>>
+
+\* Term history of acceptor a's WAL: the one saved truncated to contain only <=
+\* local FlushLsn entries.
+AcceptorTermHistory(a) ==
+    SelectSeq(acc_state[a].termHistory, LAMBDA th_entry: th_entry.lsn <= FlushLsn(a))
+
+\* Acceptor a immediately votes for proposer p.
+Vote(p, a) ==
+    /\ prop_state[p].state = "campaign"
+    /\ acc_state[a].term < prop_state[p].term \* main voting condition
+    /\ acc_state' = [acc_state EXCEPT ![a].term = prop_state[p].term]
+    /\ LET
+           vote == [termHistory |-> AcceptorTermHistory(a), flushLsn |-> FlushLsn(a)]
+       IN
+           prop_state' = [prop_state EXCEPT ![p].votes = (a :> vote) @@ prop_state[p].votes]
+    /\ UNCHANGED <<committed, elected_history>>
+
+
+\* Get lastLogTerm from term history th.
+LastLogTerm(th) == th[Len(th)].term
+
+\* Proposer p gets elected.
+BecomeLeader(p) ==
+  /\ prop_state[p].state = "campaign"
+  /\ Quorum(DOMAIN prop_state[p].votes)
+  /\ LET
+         \* Find acceptor with the highest <last_log_term, lsn> vote.
+         max_vote_acc ==
+              CHOOSE a \in DOMAIN prop_state[p].votes:
+                  LET v == prop_state[p].votes[a]
+                  IN \A v2 \in Range(prop_state[p].votes):
+                         /\ LastLogTerm(v.termHistory) >= LastLogTerm(v2.termHistory)
+                         /\ (LastLogTerm(v.termHistory) = LastLogTerm(v2.termHistory) => v.flushLsn >= v2.flushLsn)
+         max_vote == prop_state[p].votes[max_vote_acc]
+         prop_th == Append(max_vote.termHistory, [term |-> prop_state[p].term, lsn |-> max_vote.flushLsn])
+     IN
+         \* We copy all log preceding proposer's term from the max vote node so
+         \* make sure it is still on one term with us. This is a model
+         \* simplification which can be removed, in impl we fetch WAL on demand
+         \* from safekeeper which has it later. Note though that in case of on
+         \* demand fetch we must check on donor not only term match, but that
+         \* truncate_wal had already been done (if it is not max_vote_acc).
+         /\ acc_state[max_vote_acc].term = prop_state[p].term
+         /\ prop_state' = [prop_state EXCEPT ![p].state = "leader",
+                                             ![p].termHistory = prop_th,
+                                             ![p].wal = acc_state[max_vote_acc].wal
+                          ]
+         /\ elected_history' = Upsert(elected_history, prop_state[p].term, 1, LAMBDA c: c + 1)
+         /\ UNCHANGED <<acc_state, committed>>
+
+
+\* Acceptor a learns about elected proposer p's term. In impl it matches to
+\* VoteRequest/VoteResponse exchange when leader is already elected and is not
+\* interested in the vote result.
+UpdateTerm(p, a) ==
+    /\ prop_state[p].state = "leader"
+    /\ acc_state[a].term < prop_state[p].term
+    /\ acc_state' = [acc_state EXCEPT ![a].term = prop_state[p].term]
+    /\ UNCHANGED <<prop_state, committed, elected_history>>
+
+\* Find highest common point (LSN of the first divergent record) in the logs of
+\* proposer p and acceptor a. Returns <term, lsn> of the highest common point.
+FindHighestCommonPoint(prop_th, acc_th, acc_flush_lsn) ==
+    LET
+        \* First find index of the highest common term.
+        \* It must exist because we initialize th with <0, 1>.
+        last_common_idx == Maximum({i \in 1..Min(Len(prop_th), Len(acc_th)): prop_th[i].term = acc_th[i].term})
+        last_common_term == prop_th[last_common_idx].term
+        \* Now find where it ends at both prop and acc and take min. End of term
+        \* is the start of the next unless it is the last one; there it is
+        \* flush_lsn in case of acceptor. In case of proposer it is the current
+        \* writing position, but it can't be less than flush_lsn, so we
+        \* take flush_lsn.
+        acc_common_term_end == IF last_common_idx = Len(acc_th) THEN acc_flush_lsn ELSE acc_th[last_common_idx + 1].lsn
+        prop_common_term_end == IF last_common_idx = Len(prop_th) THEN acc_flush_lsn ELSE prop_th[last_common_idx + 1].lsn
+    IN
+        [term |-> last_common_term, lsn |-> Min(acc_common_term_end, prop_common_term_end)]
+
+\* Elected proposer p immediately truncates WAL (and term history) of acceptor a
+\* before starting streaming. Establishes nextSendLsn for a.
+\*
+\* In impl this happens at each reconnection, here we also allow to do it multiple times.
+TruncateWal(p, a) ==
+    /\ prop_state[p].state = "leader"
+    /\ acc_state[a].term = prop_state[p].term
+    /\ LET
+           hcp == FindHighestCommonPoint(prop_state[p].termHistory, AcceptorTermHistory(a), FlushLsn(a))
+           next_send_lsn == (a :> hcp.lsn) @@ prop_state[p].nextSendLsn
+       IN
+           \* Acceptor persists full history immediately; reads adjust it to the
+           \* really existing wal with AcceptorTermHistory.
+           /\ acc_state' = [acc_state EXCEPT ![a].termHistory = prop_state[p].termHistory,
+                                             \* note: SubSeq is inclusive, hence -1.
+                                             ![a].wal = SubSeq(acc_state[a].wal, 1, hcp.lsn - 1)
+                           ]
+           /\ prop_state' = [prop_state EXCEPT ![p].nextSendLsn = next_send_lsn]
+           /\ UNCHANGED <<committed, elected_history>>
+
+\* Append new log entry to elected proposer
+NewEntry(p) ==
+    /\ prop_state[p].state = "leader"
+    /\ LET
+           \* entry consists only of term, index serves as LSN.
+           new_entry == prop_state[p].term
+       IN
+           /\ prop_state' = [prop_state EXCEPT ![p].wal = Append(prop_state[p].wal, new_entry)]
+           /\ UNCHANGED <<acc_state, committed, elected_history>>
+
+\* Immediately append next entry from elected proposer to acceptor a.
+AppendEntry(p, a) ==
+    /\ prop_state[p].state = "leader"
+    /\ acc_state[a].term = prop_state[p].term
+    /\ a \in DOMAIN prop_state[p].nextSendLsn \* did TruncateWal
+    /\ prop_state[p].nextSendLsn[a] <= Len(prop_state[p].wal) \* have smth to send
+    /\ LET
+           send_lsn == prop_state[p].nextSendLsn[a]
+           entry == prop_state[p].wal[send_lsn]
+           \* Since message delivery is instant we don't check that send_lsn follows
+           \* the last acc record, it must always be true.
+       IN
+           /\ prop_state' = [prop_state EXCEPT ![p].nextSendLsn[a] = send_lsn + 1]
+           /\ acc_state' = [acc_state EXCEPT ![a].wal = Append(acc_state[a].wal, entry)]
+           /\ UNCHANGED <<committed, elected_history>>
+
+\* LSN where elected proposer p starts writing its records.
+PropStartLsn(p) ==
+    IF prop_state[p].state = "leader" THEN prop_state[p].termHistory[Len(prop_state[p].termHistory)].lsn ELSE NULL
+
+\* Proposer p commits all entries it can using quorum q. Note that unlike
+\* will62794/logless-reconfig this allows to commit entries from previous terms
+\* (when conditions for that are met).
+CommitEntries(p, q) ==
+    /\ prop_state[p].state = "leader"
+    /\ \A a \in q:
+           /\ acc_state[a].term = prop_state[p].term
+             \* nextSendLsn existence means TruncateWal has happened, it ensures
+             \* acceptor's WAL (and FlushLsn) are from proper proposer's history.
+             \* Alternatively we could compare LastLogTerm here, but that's closer to
+             \* what we do in the impl (we check flushLsn in AppendResponse, but
+             \* AppendRequest is processed only if HandleElected handling was good).
+           /\ a \in DOMAIN prop_state[p].nextSendLsn
+    \* Now find the LSN present on all the quorum.
+    /\ LET quorum_lsn == Minimum({FlushLsn(a): a \in q}) IN
+           \* This is the basic Raft rule of not committing entries from previous
+           \* terms except along with current term entry (commit them only when
+           \* quorum recovers, i.e. last_log_term on it reaches leader's term).
+           /\ quorum_lsn >= PropStartLsn(p)
+           /\ committed' = committed \cup {[term |-> prop_state[p].wal[lsn], lsn |-> lsn]: lsn \in 1..(quorum_lsn - 1)}
+           /\ UNCHANGED <<prop_state, acc_state, elected_history>>
+
+\*******************************************************************************
+\* Final spec
+\*******************************************************************************
+
+Next ==
+    \/ \E q \in Quorums: \E p \in proposers: RestartProposer(p, q)
+    \/ \E p \in proposers: \E a \in acceptors: Vote(p, a)
+    \/ \E p \in proposers: BecomeLeader(p)
+    \/ \E p \in proposers: \E a \in acceptors: UpdateTerm(p, a)
+    \/ \E p \in proposers: \E a \in acceptors: TruncateWal(p, a)
+    \/ \E p \in proposers: NewEntry(p)
+    \/ \E p \in proposers: \E a \in acceptors: AppendEntry(p, a)
+    \/ \E q \in Quorums: \E p \in proposers: CommitEntries(p, q)
+
+Spec == Init /\ [][Next]_<<prop_state, acc_state, committed, elected_history>>
+
+
+\********************************************************************************
+\* Invariants
+\********************************************************************************
+
+\* Lighter version of ElectionSafetyFull which doesn't require elected_history.
+ElectionSafety ==
+    \A p1, p2 \in proposers:
+        (/\ prop_state[p1].state = "leader"
+         /\ prop_state[p2].state = "leader"
+         /\ prop_state[p1].term = prop_state[p2].term) => (p1 = p2)
+
+\* Single term must never be elected more than once.
+ElectionSafetyFull == \A term \in DOMAIN elected_history: elected_history[term] <= 1
+
+\* Log is expected to be monotonic by <term, lsn> comparison. This is not true
+\* in variants of multi Paxos, but in Raft (and here) it is.
+LogIsMonotonic ==
+    \A a \in acceptors:
+        \A i, j \in DOMAIN acc_state[a].wal:
+            (i > j) => (acc_state[a].wal[i] >= acc_state[a].wal[j])
+
+\* Main invariant: If two entries are committed at the same LSN, they must be
+\* the same entry.
+LogSafety ==
+    \A c1, c2 \in committed: (c1.lsn = c2.lsn) => (c1 = c2)
+
+
+\********************************************************************************
+\* Invariants which don't need to hold, but useful for playing/debugging.
+\********************************************************************************
+
+\* Limits term of elected proposers
+MaxTerm == \A p \in proposers: (prop_state[p].state = "leader" => prop_state[p].term < 2)
+
+MaxAccWalLen == \A a \in acceptors: Len(acc_state[a].wal) < 2
+
+\* Limits max number of committed entries. That way we can check that we'are
+\* actually committing something.
+MaxCommitLsn == Cardinality(committed) < 2
+
+\* How many records with different terms can be removed in single WAL
+\* truncation.
+MaxTruncatedTerms ==
+    \A p \in proposers: \A a \in acceptors:
+        (/\ prop_state[p].state = "leader"
+         /\ prop_state[p].term = acc_state[a].term) =>
+            LET
+                hcp == FindHighestCommonPoint(prop_state[p].termHistory, AcceptorTermHistory(a), FlushLsn(a))
+                truncated_lsns == {lsn \in DOMAIN acc_state[a].wal: lsn >= hcp.lsn}
+                truncated_records_terms == {acc_state[a].wal[lsn]: lsn \in truncated_lsns}
+            IN
+                Cardinality(truncated_records_terms) < 2
+
+\* Check that TruncateWal never deletes committed record.
+\* It might seem that this should an invariant, but it is not.
+\* With 5 nodes, it is legit to truncate record which had been
+\* globally committed: e.g. nodes abc can commit record of term 1 in
+\* term 3, and after that leader of term 2 can delete such record
+\* on d. On 10 cores TLC can find such a trace in ~7 hours.
+CommittedNotTruncated ==
+    \A p \in proposers: \A a \in acceptors:
+        (/\ prop_state[p].state = "leader"
+         /\ prop_state[p].term = acc_state[a].term) =>
+            LET
+               hcp == FindHighestCommonPoint(prop_state[p].termHistory, AcceptorTermHistory(a), FlushLsn(a))
+               truncated_lsns == {lsn \in DOMAIN acc_state[a].wal: lsn >= hcp.lsn}
+               truncated_records == {[term |-> acc_state[a].wal[lsn], lsn |-> lsn]: lsn \in truncated_lsns}
+            IN
+               \A r \in truncated_records: r \notin committed
+
+====
diff --git a/safekeeper/spec/modelcheck.sh b/safekeeper/spec/modelcheck.sh
new file mode 100755
index 0000000000..21ead7dad8
--- /dev/null
+++ b/safekeeper/spec/modelcheck.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+# Usage: ./modelcheck.sh <config_file> <spec_file>, e.g.
+# ./modelcheck.sh models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg MCProposerAcceptorStatic.tla
+CONFIG=$1
+SPEC=$2
+
+MEM=7G
+TOOLSPATH="/opt/TLA+Toolbox/tla2tools.jar"
+
+mkdir -p "tlc-results"
+CONFIG_FILE=$(basename -- "$CONFIG")
+outfilename="$SPEC-${CONFIG_FILE}-$(date --utc +%Y-%m-%d--%H-%M-%S)".log
+outfile="tlc-results/$outfilename"
+touch $outfile
+
+# Save some info about the run.
+GIT_REV=`git rev-parse --short HEAD`
+INFO=`uname -a`
+
+# First for Linux, second for Mac.
+CPUNAMELinux=$(lscpu | grep 'Model name' | cut -f 2 -d ":" | awk '{$1=$1}1')
+CPUCORESLinux=`nproc`
+CPUNAMEMac=`sysctl -n machdep.cpu.brand_string`
+CPUCORESMac=`sysctl -n machdep.cpu.thread_count`
+
+echo "git revision: $GIT_REV" >> $outfile
+echo "Platform: $INFO" >> $outfile
+echo "CPU Info Linux: $CPUNAMELinux" >> $outfile
+echo "CPU Cores Linux: $CPUCORESLinux" >> $outfile
+echo "CPU Info Mac: $CPUNAMEMac" >> $outfile
+echo "CPU Cores Mac: $CPUCORESMac" >> $outfile
+echo "Spec: $SPEC" >> $outfile
+echo "Config: $CONFIG" >> $outfile
+echo "----" >> $outfile
+cat $CONFIG >> $outfile
+echo "" >> $outfile
+echo "----" >> $outfile
+echo "" >> $outfile
+
+# see
+# https://lamport.azurewebsites.net/tla/current-tools.pdf
+# for TLC options.
+# OffHeapDiskFPSet is the optimal fingerprint set implementation
+# https://docs.tlapl.us/codebase:architecture#fingerprint_sets_fpsets
+#
+# Add -simulate to run in infinite simulation mode.
+java -Xmx$MEM -XX:MaxDirectMemorySize=$MEM -XX:+UseParallelGC -Dtlc2.tool.fp.FPSet.impl=tlc2.tool.fp.OffHeapDiskFPSet \
+  -cp "${TOOLSPATH}" tlc2.TLC $SPEC -config $CONFIG -workers auto -gzip | tee -a $outfile
diff --git a/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t2_l2.cfg b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t2_l2.cfg
new file mode 100644
index 0000000000..c06109c601
--- /dev/null
+++ b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t2_l2.cfg
@@ -0,0 +1,19 @@
+\* A very small model just to play.
+CONSTANTS
+NULL = NULL
+proposers = {p1, p2}
+acceptors = {a1, a2, a3}
+max_term = 2
+max_entries = 2
+SPECIFICATION Spec
+CONSTRAINT StateConstraint
+INVARIANT
+TypeOk
+ElectionSafetyFull
+LogIsMonotonic
+LogSafety
+CommittedNotTruncated
+SYMMETRY ProposerAcceptorSymmetry
+CHECK_DEADLOCK FALSE
+ALIAS Alias
+
diff --git a/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l2.cfg b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l2.cfg
new file mode 100644
index 0000000000..5d10fa960f
--- /dev/null
+++ b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l2.cfg
@@ -0,0 +1,19 @@
+\* A model next to the smallest one.
+CONSTANTS
+NULL = NULL
+proposers = {p1, p2}
+acceptors = {a1, a2, a3}
+max_term = 3
+max_entries = 2
+SPECIFICATION Spec
+CONSTRAINT StateConstraint
+INVARIANT
+TypeOk
+ElectionSafetyFull
+LogIsMonotonic
+LogSafety
+CommittedNotTruncated
+SYMMETRY ProposerAcceptorSymmetry
+CHECK_DEADLOCK FALSE
+ALIAS Alias
+
diff --git a/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg
new file mode 100644
index 0000000000..8ba8ce95a4
--- /dev/null
+++ b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg
@@ -0,0 +1,17 @@
+CONSTANTS
+NULL = NULL
+proposers = {p1, p2}
+acceptors = {a1, a2, a3}
+max_term = 3
+max_entries = 3
+SPECIFICATION Spec
+CONSTRAINT StateConstraint
+INVARIANT
+TypeOk
+ElectionSafety
+LogIsMonotonic
+LogSafety
+CommittedNotTruncated
+SYMMETRY ProposerAcceptorSymmetry
+CHECK_DEADLOCK FALSE
+ALIAS Alias
diff --git a/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t4_l4.cfg b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t4_l4.cfg
new file mode 100644
index 0000000000..4763a34ec4
--- /dev/null
+++ b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t4_l4.cfg
@@ -0,0 +1,17 @@
+CONSTANTS
+NULL = NULL
+proposers = {p1, p2}
+acceptors = {a1, a2, a3}
+max_term = 4
+max_entries = 4
+SPECIFICATION Spec
+CONSTRAINT StateConstraint
+INVARIANT
+TypeOk
+ElectionSafety
+LogIsMonotonic
+LogSafety
+CommittedNotTruncated
+SYMMETRY ProposerAcceptorSymmetry
+CHECK_DEADLOCK FALSE
+ALIAS Alias
diff --git a/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t2_l2.cfg b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t2_l2.cfg
new file mode 100644
index 0000000000..ebf4724633
--- /dev/null
+++ b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t2_l2.cfg
@@ -0,0 +1,16 @@
+CONSTANTS
+NULL = NULL
+proposers = {p1, p2}
+acceptors = {a1, a2, a3, a4, a5}
+max_term = 2
+max_entries = 2
+SPECIFICATION Spec
+CONSTRAINT StateConstraint
+INVARIANT
+TypeOk
+ElectionSafety
+LogIsMonotonic
+LogSafety
+SYMMETRY ProposerAcceptorSymmetry
+CHECK_DEADLOCK FALSE
+ALIAS Alias
diff --git a/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t3_l3.cfg b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t3_l3.cfg
new file mode 100644
index 0000000000..bb77350c58
--- /dev/null
+++ b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t3_l3.cfg
@@ -0,0 +1,16 @@
+CONSTANTS
+NULL = NULL
+proposers = {p1, p2}
+acceptors = {a1, a2, a3, a4, a5}
+max_term = 3
+max_entries = 3
+SPECIFICATION Spec
+CONSTRAINT StateConstraint
+INVARIANT
+TypeOk
+ElectionSafety
+LogIsMonotonic
+LogSafety
+SYMMETRY ProposerAcceptorSymmetry
+CHECK_DEADLOCK FALSE
+ALIAS Alias
diff --git a/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t4_l3.cfg b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t4_l3.cfg
new file mode 100644
index 0000000000..9a5e142f99
--- /dev/null
+++ b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t4_l3.cfg
@@ -0,0 +1,16 @@
+CONSTANTS
+NULL = NULL
+proposers = {p1, p2}
+acceptors = {a1, a2, a3, a4, a5}
+max_term = 4
+max_entries = 3
+SPECIFICATION Spec
+CONSTRAINT StateConstraint
+INVARIANT
+TypeOk
+ElectionSafety
+LogIsMonotonic
+LogSafety
+SYMMETRY ProposerAcceptorSymmetry
+CHECK_DEADLOCK FALSE
+ALIAS Alias
diff --git a/safekeeper/spec/readme.md b/safekeeper/spec/readme.md
new file mode 100644
index 0000000000..ec2649d87d
--- /dev/null
+++ b/safekeeper/spec/readme.md
@@ -0,0 +1,12 @@
+The specifications, models and results of running of them of the compute <->
+safekeepers consensus algorithm for committing WAL on the fleet of safekeepers.
+Following Paxos parlance, compute which writes WAL is called (WAL) proposer here
+and safekeepers which persist it are called (WAL) acceptors.
+
+Directory structure:
+- Use modelcheck.sh to run TLC.
+- MC*.tla contains bits of TLA+ needed for TLC like constraining the state space, and models/ actual models.
+- Other .tla files are the actual specs.
+
+Structure is partially borrowed from
+[logless-reconfig](https://github.com/will62794/logless-reconfig), thanks to it.
diff --git a/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t2_l2.cfg-2024-11-06--13-44-17.log b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t2_l2.cfg-2024-11-06--13-44-17.log
new file mode 100644
index 0000000000..768722b1eb
--- /dev/null
+++ b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t2_l2.cfg-2024-11-06--13-44-17.log
@@ -0,0 +1,63 @@
+git revision: 864f4667d
+Platform: Linux neon-dev-arm64-1 6.8.0-48-generic #48-Ubuntu SMP PREEMPT_DYNAMIC Fri Sep 27 14:35:45 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux
+CPU Info Linux: Neoverse-N1
+CPU Cores Linux: 80
+CPU Info Mac: 
+CPU Cores Mac: 
+Spec: MCProposerAcceptorStatic.tla
+Config: models/MCProposerAcceptorStatic_p2_a3_t2_l2.cfg
+----
+\* A very small model just to play.
+CONSTANTS
+NULL = NULL
+proposers = {p1, p2}
+acceptors = {a1, a2, a3}
+max_term = 2
+max_entries = 2
+SPECIFICATION Spec
+CONSTRAINT StateConstraint
+INVARIANT
+TypeOk
+ElectionSafety
+LogIsMonotonic
+LogSafety
+CommittedNotTruncated
+SYMMETRY ProposerAcceptorSymmetry
+CHECK_DEADLOCK FALSE
+ALIAS Alias
+
+
+----
+
+TLC2 Version 2.20 of Day Month 20?? (rev: f68cb71)
+Running breadth-first search Model-Checking with fp 110 and seed 3949669318051689745 with 80 workers on 80 cores with 54613MB heap and 61440MB offheap memory [pid: 46037] (Linux 6.8.0-48-generic aarch64, Ubuntu 21.0.4 x86_64, OffHeapDiskFPSet, DiskStateQueue).
+Parsing file /home/arseny/neon/safekeeper/spec/MCProposerAcceptorStatic.tla
+Parsing file /tmp/tlc-11123278435718411444/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla)
+Parsing file /home/arseny/neon/safekeeper/spec/ProposerAcceptorStatic.tla
+Parsing file /tmp/tlc-11123278435718411444/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla)
+Parsing file /tmp/tlc-11123278435718411444/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla)
+Parsing file /tmp/tlc-11123278435718411444/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla)
+Parsing file /tmp/tlc-11123278435718411444/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla)
+Parsing file /tmp/tlc-11123278435718411444/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla)
+Parsing file /tmp/tlc-11123278435718411444/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla)
+Semantic processing of module Naturals
+Semantic processing of module Sequences
+Semantic processing of module FiniteSets
+Semantic processing of module TLC
+Semantic processing of module Integers
+Semantic processing of module ProposerAcceptorStatic
+Semantic processing of module TLCExt
+Semantic processing of module _TLCTrace
+Semantic processing of module MCProposerAcceptorStatic
+Starting... (2024-11-06 13:44:18)
+Computing initial states...
+Finished computing initial states: 1 distinct state generated at 2024-11-06 13:44:20.
+Model checking completed. No error has been found.
+  Estimates of the probability that TLC did not check all reachable states
+  because two distinct states had the same fingerprint:
+  calculated (optimistic):  val = 2.9E-9
+  based on the actual fingerprints:  val = 4.1E-10
+922134 states generated, 61249 distinct states found, 0 states left on queue.
+The depth of the complete state graph search is 31.
+The average outdegree of the complete state graph is 1 (minimum is 0, the maximum 6 and the 95th percentile is 3).
+Finished in 11s at (2024-11-06 13:44:28)
diff --git a/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l2.cfg-2024-11-15--09-09-58.log b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l2.cfg-2024-11-15--09-09-58.log
new file mode 100644
index 0000000000..ae3ba98da6
--- /dev/null
+++ b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l2.cfg-2024-11-15--09-09-58.log
@@ -0,0 +1,69 @@
+git revision: bcbff084a
+Platform: Linux nonlibrem 6.10.11-amd64 #1 SMP PREEMPT_DYNAMIC Debian 6.10.11-1 (2024-09-22) x86_64 GNU/Linux
+CPU Info Linux: 13th Gen Intel(R) Core(TM) i7-1355U
+CPU Cores Linux: 10
+CPU Info Mac: 
+CPU Cores Mac: 
+Spec: MCProposerAcceptorStatic.tla
+Config: models/MCProposerAcceptorStatic_p2_a3_t3_l2.cfg
+----
+\* A model next to the smallest one.
+CONSTANTS
+NULL = NULL
+proposers = {p1, p2}
+acceptors = {a1, a2, a3}
+max_term = 3
+max_entries = 2
+SPECIFICATION Spec
+CONSTRAINT StateConstraint
+INVARIANT
+TypeOk
+ElectionSafety
+LogIsMonotonic
+LogSafety
+CommittedNotTruncated
+SYMMETRY ProposerAcceptorSymmetry
+CHECK_DEADLOCK FALSE
+ALIAS Alias
+
+
+----
+
+TLC2 Version 2.20 of Day Month 20?? (rev: cc65eef)
+Running breadth-first search Model-Checking with fp 41 and seed -3061068726727581619 with 10 workers on 10 cores with 6372MB heap and 7168MB offheap memory [pid: 1250346] (Linux 6.10.11-amd64 amd64, Debian 21.0.5 x86_64, OffHeapDiskFPSet, DiskStateQueue).
+Parsing file /home/ars/neon/neon/safekeeper/spec/MCProposerAcceptorStatic.tla
+Parsing file /tmp/tlc-3023124431504466774/TLC.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/TLC.tla)
+Parsing file /home/ars/neon/neon/safekeeper/spec/ProposerAcceptorStatic.tla
+Parsing file /tmp/tlc-3023124431504466774/_TLCTrace.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla)
+Parsing file /tmp/tlc-3023124431504466774/Integers.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/Integers.tla)
+Parsing file /tmp/tlc-3023124431504466774/Sequences.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla)
+Parsing file /tmp/tlc-3023124431504466774/FiniteSets.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla)
+Parsing file /tmp/tlc-3023124431504466774/Naturals.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla)
+Parsing file /tmp/tlc-3023124431504466774/TLCExt.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla)
+Semantic processing of module Naturals
+Semantic processing of module Sequences
+Semantic processing of module FiniteSets
+Semantic processing of module TLC
+Semantic processing of module Integers
+Semantic processing of module ProposerAcceptorStatic
+Semantic processing of module TLCExt
+Semantic processing of module _TLCTrace
+Semantic processing of module MCProposerAcceptorStatic
+Starting... (2024-11-15 12:09:59)
+Computing initial states...
+Finished computing initial states: 1 distinct state generated at 2024-11-15 12:10:00.
+Progress(19) at 2024-11-15 12:10:03: 464,696 states generated (464,696 s/min), 57,859 distinct states found (57,859 ds/min), 21,435 states left on queue.
+Progress(26) at 2024-11-15 12:11:03: 8,813,399 states generated (8,348,703 s/min), 877,254 distinct states found (819,395 ds/min), 214,794 states left on queue.
+Progress(27) at 2024-11-15 12:12:03: 16,121,858 states generated (7,308,459 s/min), 1,464,707 distinct states found (587,453 ds/min), 274,230 states left on queue.
+Progress(29) at 2024-11-15 12:13:03: 23,073,903 states generated (6,952,045 s/min), 1,948,802 distinct states found (484,095 ds/min), 263,697 states left on queue.
+Progress(31) at 2024-11-15 12:14:03: 29,740,681 states generated (6,666,778 s/min), 2,331,052 distinct states found (382,250 ds/min), 185,484 states left on queue.
+Progress(34) at 2024-11-15 12:15:03: 36,085,876 states generated (6,345,195 s/min), 2,602,370 distinct states found (271,318 ds/min), 31,659 states left on queue.
+Model checking completed. No error has been found.
+  Estimates of the probability that TLC did not check all reachable states
+  because two distinct states had the same fingerprint:
+  calculated (optimistic):  val = 4.9E-6
+  based on the actual fingerprints:  val = 6.9E-7
+36896322 states generated, 2623542 distinct states found, 0 states left on queue.
+The depth of the complete state graph search is 39.
+The average outdegree of the complete state graph is 1 (minimum is 0, the maximum 7 and the 95th percentile is 3).
+Finished in 05min 14s at (2024-11-15 12:15:13)
diff --git a/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l3.cfg-2024-11-06--13-03-51.log b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l3.cfg-2024-11-06--13-03-51.log
new file mode 100644
index 0000000000..46f21cee72
--- /dev/null
+++ b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l3.cfg-2024-11-06--13-03-51.log
@@ -0,0 +1,72 @@
+git revision: 864f4667d
+Platform: Linux neon-dev-arm64-1 6.8.0-48-generic #48-Ubuntu SMP PREEMPT_DYNAMIC Fri Sep 27 14:35:45 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux
+CPU Info Linux: Neoverse-N1
+CPU Cores Linux: 80
+CPU Info Mac: 
+CPU Cores Mac: 
+Spec: MCProposerAcceptorStatic.tla
+Config: models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg
+----
+CONSTANTS
+NULL = NULL
+proposers = {p1, p2}
+acceptors = {a1, a2, a3}
+max_term = 3
+max_entries = 3
+SPECIFICATION Spec
+CONSTRAINT StateConstraint
+INVARIANT
+TypeOk
+ElectionSafety
+LogIsMonotonic
+LogSafety
+CommittedNotTruncated
+SYMMETRY ProposerAcceptorSymmetry
+CHECK_DEADLOCK FALSE
+ALIAS Alias
+
+----
+
+TLC2 Version 2.20 of Day Month 20?? (rev: f68cb71)
+Running breadth-first search Model-Checking with fp 126 and seed 2302892334567572769 with 80 workers on 80 cores with 54613MB heap and 61440MB offheap memory [pid: 39701] (Linux 6.8.0-48-generic aarch64, Ubuntu 21.0.4 x86_64, OffHeapDiskFPSet, DiskStateQueue).
+Parsing file /home/arseny/neon/safekeeper/spec/MCProposerAcceptorStatic.tla
+Parsing file /tmp/tlc-15178810317173795942/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla)
+Parsing file /home/arseny/neon/safekeeper/spec/ProposerAcceptorStatic.tla
+Parsing file /tmp/tlc-15178810317173795942/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla)
+Parsing file /tmp/tlc-15178810317173795942/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla)
+Parsing file /tmp/tlc-15178810317173795942/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla)
+Parsing file /tmp/tlc-15178810317173795942/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla)
+Parsing file /tmp/tlc-15178810317173795942/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla)
+Parsing file /tmp/tlc-15178810317173795942/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla)
+Semantic processing of module Naturals
+Semantic processing of module Sequences
+Semantic processing of module FiniteSets
+Semantic processing of module TLC
+Semantic processing of module Integers
+Semantic processing of module ProposerAcceptorStatic
+Semantic processing of module TLCExt
+Semantic processing of module _TLCTrace
+Semantic processing of module MCProposerAcceptorStatic
+Starting... (2024-11-06 13:03:52)
+Computing initial states...
+Finished computing initial states: 1 distinct state generated at 2024-11-06 13:03:55.
+Progress(21) at 2024-11-06 13:03:58: 846,240 states generated (846,240 s/min), 106,298 distinct states found (106,298 ds/min), 41,028 states left on queue.
+Progress(28) at 2024-11-06 13:04:58: 27,538,211 states generated (26,691,971 s/min), 2,768,793 distinct states found (2,662,495 ds/min), 782,984 states left on queue.
+Progress(30) at 2024-11-06 13:05:58: 54,048,763 states generated (26,510,552 s/min), 5,076,745 distinct states found (2,307,952 ds/min), 1,241,301 states left on queue.
+Progress(31) at 2024-11-06 13:06:58: 80,554,724 states generated (26,505,961 s/min), 7,199,201 distinct states found (2,122,456 ds/min), 1,541,574 states left on queue.
+Progress(32) at 2024-11-06 13:07:58: 106,991,261 states generated (26,436,537 s/min), 9,121,549 distinct states found (1,922,348 ds/min), 1,686,289 states left on queue.
+Progress(33) at 2024-11-06 13:08:58: 133,354,665 states generated (26,363,404 s/min), 10,935,451 distinct states found (1,813,902 ds/min), 1,739,977 states left on queue.
+Progress(34) at 2024-11-06 13:09:58: 159,631,385 states generated (26,276,720 s/min), 12,605,372 distinct states found (1,669,921 ds/min), 1,677,447 states left on queue.
+Progress(35) at 2024-11-06 13:10:58: 185,862,196 states generated (26,230,811 s/min), 14,138,409 distinct states found (1,533,037 ds/min), 1,501,760 states left on queue.
+Progress(36) at 2024-11-06 13:11:58: 212,021,688 states generated (26,159,492 s/min), 15,538,990 distinct states found (1,400,581 ds/min), 1,216,621 states left on queue.
+Progress(37) at 2024-11-06 13:12:58: 238,046,160 states generated (26,024,472 s/min), 16,778,583 distinct states found (1,239,593 ds/min), 797,230 states left on queue.
+Progress(39) at 2024-11-06 13:13:58: 263,931,163 states generated (25,885,003 s/min), 17,820,786 distinct states found (1,042,203 ds/min), 209,400 states left on queue.
+Model checking completed. No error has been found.
+  Estimates of the probability that TLC did not check all reachable states
+  because two distinct states had the same fingerprint:
+  calculated (optimistic):  val = 2.5E-4
+  based on the actual fingerprints:  val = 7.9E-5
+270257170 states generated, 18005639 distinct states found, 0 states left on queue.
+The depth of the complete state graph search is 47.
+The average outdegree of the complete state graph is 1 (minimum is 0, the maximum 7 and the 95th percentile is 3).
+Finished in 10min 25s at (2024-11-06 13:14:17)
diff --git a/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--14-20-25.log b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--14-20-25.log
new file mode 100644
index 0000000000..c7cc853af0
--- /dev/null
+++ b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--14-20-25.log
@@ -0,0 +1,1466 @@
+# Shows LogSafety violation when "don't commit separately entries from previous terms" check is disabled.
+git revision: 4f1ee6331
+Platform: Linux neon-dev-arm64-1 6.8.0-48-generic #48-Ubuntu SMP PREEMPT_DYNAMIC Fri Sep 27 14:35:45 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux
+CPU Info Linux: Neoverse-N1
+CPU Cores Linux: 80
+CPU Info Mac: 
+CPU Cores Mac: 
+Spec: MCProposerAcceptorStatic.tla
+Config: models/MCProposerAcceptorStatic_p2_a3_t4_l4.cfg
+----
+CONSTANTS
+NULL = NULL
+proposers = {p1, p2}
+acceptors = {a1, a2, a3}
+max_term = 4
+max_entries = 4
+SPECIFICATION Spec
+CONSTRAINT StateConstraint
+INVARIANT
+TypeOk
+ElectionSafety
+LogIsMonotonic
+LogSafety
+SYMMETRY ProposerAcceptorSymmetry
+CHECK_DEADLOCK FALSE
+ALIAS Alias
+
+----
+
+TLC2 Version 2.20 of Day Month 20?? (rev: f68cb71)
+Running breadth-first search Model-Checking with fp 12 and seed -5379034126224420237 with 80 workers on 80 cores with 54613MB heap and 61440MB offheap memory [pid: 52295] (Linux 6.8.0-48-generic aarch64, Ubuntu 21.0.4 x86_64, OffHeapDiskFPSet, DiskStateQueue).
+Parsing file /home/arseny/neon/safekeeper/spec/MCProposerAcceptorStatic.tla
+Parsing file /tmp/tlc-4533438058229992850/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla)
+Parsing file /home/arseny/neon/safekeeper/spec/ProposerAcceptorStatic.tla
+Parsing file /tmp/tlc-4533438058229992850/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla)
+Parsing file /tmp/tlc-4533438058229992850/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla)
+Parsing file /tmp/tlc-4533438058229992850/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla)
+Parsing file /tmp/tlc-4533438058229992850/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla)
+Parsing file /tmp/tlc-4533438058229992850/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla)
+Parsing file /tmp/tlc-4533438058229992850/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla)
+Semantic processing of module Naturals
+Semantic processing of module Sequences
+Semantic processing of module FiniteSets
+Semantic processing of module TLC
+Semantic processing of module Integers
+Semantic processing of module ProposerAcceptorStatic
+Semantic processing of module TLCExt
+Semantic processing of module _TLCTrace
+Semantic processing of module MCProposerAcceptorStatic
+Starting... (2024-11-06 14:20:26)
+Computing initial states...
+Finished computing initial states: 1 distinct state generated at 2024-11-06 14:20:29.
+Progress(20) at 2024-11-06 14:20:32: 1,011,898 states generated (1,011,898 s/min), 140,947 distinct states found (140,947 ds/min), 60,535 states left on queue.
+Progress(26) at 2024-11-06 14:21:32: 30,146,518 states generated (29,134,620 s/min), 3,742,736 distinct states found (3,601,789 ds/min), 1,438,779 states left on queue.
+Progress(27) at 2024-11-06 14:22:32: 59,362,708 states generated (29,216,190 s/min), 7,210,233 distinct states found (3,467,497 ds/min), 2,708,295 states left on queue.
+Progress(28) at 2024-11-06 14:23:32: 88,589,291 states generated (29,226,583 s/min), 10,552,781 distinct states found (3,342,548 ds/min), 3,874,296 states left on queue.
+Progress(29) at 2024-11-06 14:24:32: 117,894,209 states generated (29,304,918 s/min), 13,932,498 distinct states found (3,379,717 ds/min), 5,069,960 states left on queue.
+Progress(29) at 2024-11-06 14:25:32: 147,338,882 states generated (29,444,673 s/min), 17,180,069 distinct states found (3,247,571 ds/min), 6,146,371 states left on queue.
+Progress(29) at 2024-11-06 14:26:32: 176,498,135 states generated (29,159,253 s/min), 20,547,926 distinct states found (3,367,857 ds/min), 7,338,835 states left on queue.
+Progress(30) at 2024-11-06 14:27:32: 205,957,044 states generated (29,458,909 s/min), 23,661,090 distinct states found (3,113,164 ds/min), 8,293,570 states left on queue.
+Progress(30) at 2024-11-06 14:28:32: 235,390,133 states generated (29,433,089 s/min), 26,892,306 distinct states found (3,231,216 ds/min), 9,369,229 states left on queue.
+Progress(30) at 2024-11-06 14:29:32: 264,571,938 states generated (29,181,805 s/min), 30,176,971 distinct states found (3,284,665 ds/min), 10,493,429 states left on queue.
+Progress(31) at 2024-11-06 14:30:32: 293,928,191 states generated (29,356,253 s/min), 33,296,160 distinct states found (3,119,189 ds/min), 11,463,686 states left on queue.
+Progress(31) at 2024-11-06 14:31:32: 323,436,668 states generated (29,508,477 s/min), 36,347,973 distinct states found (3,051,813 ds/min), 12,365,578 states left on queue.
+Progress(31) at 2024-11-06 14:32:32: 352,943,790 states generated (29,507,122 s/min), 39,465,244 distinct states found (3,117,271 ds/min), 13,349,544 states left on queue.
+Progress(31) at 2024-11-06 14:33:32: 382,292,863 states generated (29,349,073 s/min), 42,654,621 distinct states found (3,189,377 ds/min), 14,384,363 states left on queue.
+Progress(31) at 2024-11-06 14:34:32: 411,385,854 states generated (29,092,991 s/min), 45,941,145 distinct states found (3,286,524 ds/min), 15,509,450 states left on queue.
+Progress(31) at 2024-11-06 14:35:32: 440,738,756 states generated (29,352,902 s/min), 48,984,566 distinct states found (3,043,421 ds/min), 16,419,882 states left on queue.
+Progress(32) at 2024-11-06 14:36:32: 470,251,558 states generated (29,512,802 s/min), 51,925,693 distinct states found (2,941,127 ds/min), 17,211,457 states left on queue.
+Progress(32) at 2024-11-06 14:37:32: 499,714,013 states generated (29,462,455 s/min), 54,955,581 distinct states found (3,029,888 ds/min), 18,114,624 states left on queue.
+Progress(32) at 2024-11-06 14:38:32: 529,254,608 states generated (29,540,595 s/min), 57,938,914 distinct states found (2,983,333 ds/min), 18,996,128 states left on queue.
+Progress(32) at 2024-11-06 14:39:32: 558,774,398 states generated (29,519,790 s/min), 61,072,943 distinct states found (3,134,029 ds/min), 19,975,689 states left on queue.
+Progress(32) at 2024-11-06 14:40:32: 588,134,665 states generated (29,360,267 s/min), 64,148,888 distinct states found (3,075,945 ds/min), 20,922,407 states left on queue.
+Progress(32) at 2024-11-06 14:41:32: 617,464,374 states generated (29,329,709 s/min), 67,306,855 distinct states found (3,157,967 ds/min), 21,928,799 states left on queue.
+Progress(32) at 2024-11-06 14:42:32: 646,525,281 states generated (29,060,907 s/min), 70,425,194 distinct states found (3,118,339 ds/min), 22,895,971 states left on queue.
+Progress(32) at 2024-11-06 14:43:32: 676,054,893 states generated (29,529,612 s/min), 73,351,905 distinct states found (2,926,711 ds/min), 23,703,779 states left on queue.
+Progress(33) at 2024-11-06 14:44:32: 705,581,782 states generated (29,526,889 s/min), 76,200,615 distinct states found (2,848,710 ds/min), 24,414,094 states left on queue.
+Progress(33) at 2024-11-06 14:45:32: 735,069,836 states generated (29,488,054 s/min), 79,168,244 distinct states found (2,967,629 ds/min), 25,255,224 states left on queue.
+Progress(33) at 2024-11-06 14:46:32: 764,659,188 states generated (29,589,352 s/min), 82,024,430 distinct states found (2,856,186 ds/min), 26,011,047 states left on queue.
+Progress(33) at 2024-11-06 14:47:32: 794,276,423 states generated (29,617,235 s/min), 84,974,312 distinct states found (2,949,882 ds/min), 26,868,750 states left on queue.
+Progress(33) at 2024-11-06 14:48:32: 823,875,831 states generated (29,599,408 s/min), 88,004,386 distinct states found (3,030,074 ds/min), 27,771,984 states left on queue.
+Progress(33) at 2024-11-06 14:49:32: 853,138,894 states generated (29,263,063 s/min), 91,006,890 distinct states found (3,002,504 ds/min), 28,636,661 states left on queue.
+Checkpointing of run states/24-11-06-14-20-25.868
+Checkpointing completed at (2024-11-06 14:50:32)
+Progress(33) at 2024-11-06 14:50:32: 882,514,167 states generated (29,375,273 s/min), 94,011,000 distinct states found (3,004,110 ds/min), 29,534,516 states left on queue.
+Progress(33) at 2024-11-06 14:51:32: 911,838,377 states generated (29,324,210 s/min), 97,108,937 distinct states found (3,097,937 ds/min), 30,498,587 states left on queue.
+Progress(33) at 2024-11-06 14:52:32: 940,646,920 states generated (28,808,543 s/min), 100,248,865 distinct states found (3,139,928 ds/min), 31,472,191 states left on queue.
+Progress(33) at 2024-11-06 14:53:32: 970,074,175 states generated (29,427,255 s/min), 103,170,815 distinct states found (2,921,950 ds/min), 32,265,691 states left on queue.
+Progress(33) at 2024-11-06 14:54:32: 999,627,974 states generated (29,553,799 s/min), 106,004,823 distinct states found (2,834,008 ds/min), 33,009,618 states left on queue.
+Progress(34) at 2024-11-06 14:55:32: 1,029,148,983 states generated (29,521,009 s/min), 108,740,783 distinct states found (2,735,960 ds/min), 33,616,222 states left on queue.
+Progress(34) at 2024-11-06 14:56:32: 1,058,582,001 states generated (29,433,018 s/min), 111,612,965 distinct states found (2,872,182 ds/min), 34,375,212 states left on queue.
+Progress(34) at 2024-11-06 14:57:32: 1,088,123,602 states generated (29,541,601 s/min), 114,464,196 distinct states found (2,851,231 ds/min), 35,116,195 states left on queue.
+Progress(34) at 2024-11-06 14:58:32: 1,117,684,936 states generated (29,561,334 s/min), 117,252,198 distinct states found (2,788,002 ds/min), 35,817,205 states left on queue.
+Progress(34) at 2024-11-06 14:59:32: 1,147,356,249 states generated (29,671,313 s/min), 120,014,476 distinct states found (2,762,278 ds/min), 36,517,255 states left on queue.
+Progress(34) at 2024-11-06 15:00:32: 1,176,921,098 states generated (29,564,849 s/min), 122,859,312 distinct states found (2,844,836 ds/min), 37,291,096 states left on queue.
+Progress(34) at 2024-11-06 15:01:32: 1,206,454,440 states generated (29,533,342 s/min), 125,830,942 distinct states found (2,971,630 ds/min), 38,147,762 states left on queue.
+Progress(34) at 2024-11-06 15:02:32: 1,235,721,673 states generated (29,267,233 s/min), 128,869,493 distinct states found (3,038,551 ds/min), 39,035,481 states left on queue.
+Progress(34) at 2024-11-06 15:03:32: 1,265,097,779 states generated (29,376,106 s/min), 131,669,552 distinct states found (2,800,059 ds/min), 39,746,864 states left on queue.
+Progress(34) at 2024-11-06 15:04:32: 1,294,408,098 states generated (29,310,319 s/min), 134,604,630 distinct states found (2,935,078 ds/min), 40,584,235 states left on queue.
+Progress(34) at 2024-11-06 15:05:32: 1,323,792,755 states generated (29,384,657 s/min), 137,579,390 distinct states found (2,974,760 ds/min), 41,446,478 states left on queue.
+Progress(34) at 2024-11-06 15:06:32: 1,353,085,163 states generated (29,292,408 s/min), 140,575,723 distinct states found (2,996,333 ds/min), 42,309,510 states left on queue.
+Progress(34) at 2024-11-06 15:07:32: 1,381,809,417 states generated (28,724,254 s/min), 143,655,566 distinct states found (3,079,843 ds/min), 43,220,682 states left on queue.
+Progress(34) at 2024-11-06 15:08:32: 1,411,255,848 states generated (29,446,431 s/min), 146,482,192 distinct states found (2,826,626 ds/min), 43,944,938 states left on queue.
+Progress(34) at 2024-11-06 15:09:32: 1,440,646,323 states generated (29,390,475 s/min), 149,419,989 distinct states found (2,937,797 ds/min), 44,763,293 states left on queue.
+Progress(34) at 2024-11-06 15:10:32: 1,470,298,568 states generated (29,652,245 s/min), 152,041,419 distinct states found (2,621,430 ds/min), 45,311,911 states left on queue.
+Progress(35) at 2024-11-06 15:11:32: 1,499,747,712 states generated (29,449,144 s/min), 154,696,867 distinct states found (2,655,448 ds/min), 45,842,895 states left on queue.
+Progress(35) at 2024-11-06 15:12:32: 1,529,256,993 states generated (29,509,281 s/min), 157,493,365 distinct states found (2,796,498 ds/min), 46,535,472 states left on queue.
+Progress(35) at 2024-11-06 15:13:32: 1,558,829,306 states generated (29,572,313 s/min), 160,256,575 distinct states found (2,763,210 ds/min), 47,212,471 states left on queue.
+Progress(35) at 2024-11-06 15:14:32: 1,588,345,878 states generated (29,516,572 s/min), 163,002,602 distinct states found (2,746,027 ds/min), 47,862,117 states left on queue.
+Progress(35) at 2024-11-06 15:15:32: 1,617,885,675 states generated (29,539,797 s/min), 165,699,121 distinct states found (2,696,519 ds/min), 48,472,896 states left on queue.
+Progress(35) at 2024-11-06 15:16:32: 1,647,559,965 states generated (29,674,290 s/min), 168,343,286 distinct states found (2,644,165 ds/min), 49,065,377 states left on queue.
+Progress(35) at 2024-11-06 15:17:32: 1,677,033,250 states generated (29,473,285 s/min), 171,134,409 distinct states found (2,791,123 ds/min), 49,823,330 states left on queue.
+Progress(35) at 2024-11-06 15:18:32: 1,706,730,266 states generated (29,697,016 s/min), 173,860,974 distinct states found (2,726,565 ds/min), 50,493,221 states left on queue.
+Error: Invariant LogSafety is violated.
+Error: The behavior up to this point is:
+State 1: <Initial predicate>
+/\ prop_state = ( p1 :>
+      [ term |-> 1,
+        wal |-> <<>>,
+        state |-> "campaign",
+        votes |-> <<>>,
+        termHistory |-> <<>>,
+        nextSendLsn |-> <<>> ] @@
+  p2 :>
+      [ term |-> 1,
+        wal |-> <<>>,
+        state |-> "campaign",
+        votes |-> <<>>,
+        termHistory |-> <<>>,
+        nextSendLsn |-> <<>> ] )
+/\ acc_state = ( a1 :>
+      [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@
+  a2 :>
+      [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@
+  a3 :>
+      [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] )
+/\ committed = {}
+
+State 2: <Vote(p1,a1) line 207, col 2 to line 214, col 27 of module ProposerAcceptorStatic>
+/\ prop_state = ( p1 :>
+      [ term |-> 1,
+        wal |-> <<>>,
+        state |-> "campaign",
+        votes |->
+            ( a1 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |-> <<>>,
+        nextSendLsn |-> <<>> ] @@
+  p2 :>
+      [ term |-> 1,
+        wal |-> <<>>,
+        state |-> "campaign",
+        votes |-> <<>>,
+        termHistory |-> <<>>,
+        nextSendLsn |-> <<>> ] )
+/\ acc_state = ( a1 :>
+      [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@
+  a2 :>
+      [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@
+  a3 :>
+      [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] )
+/\ committed = {}
+
+State 3: <RestartProposer(p2,{a1, a2}) line 188, col 3 to line 198, col 44 of module ProposerAcceptorStatic>
+/\ prop_state = ( p1 :>
+      [ term |-> 1,
+        wal |-> <<>>,
+        state |-> "campaign",
+        votes |->
+            ( a1 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |-> <<>>,
+        nextSendLsn |-> <<>> ] @@
+  p2 :>
+      [ term |-> 2,
+        wal |-> <<>>,
+        state |-> "campaign",
+        votes |-> <<>>,
+        termHistory |-> <<>>,
+        nextSendLsn |-> <<>> ] )
+/\ acc_state = ( a1 :>
+      [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@
+  a2 :>
+      [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@
+  a3 :>
+      [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] )
+/\ committed = {}
+
+State 4: <Vote(p1,a2) line 207, col 2 to line 214, col 27 of module ProposerAcceptorStatic>
+/\ prop_state = ( p1 :>
+      [ term |-> 1,
+        wal |-> <<>>,
+        state |-> "campaign",
+        votes |->
+            ( a1 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] @@
+              a2 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |-> <<>>,
+        nextSendLsn |-> <<>> ] @@
+  p2 :>
+      [ term |-> 2,
+        wal |-> <<>>,
+        state |-> "campaign",
+        votes |-> <<>>,
+        termHistory |-> <<>>,
+        nextSendLsn |-> <<>> ] )
+/\ acc_state = ( a1 :>
+      [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@
+  a2 :>
+      [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@
+  a3 :>
+      [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] )
+/\ committed = {}
+
+State 5: <BecomeLeader(p1) line 222, col 3 to line 245, col 44 of module ProposerAcceptorStatic>
+/\ prop_state = ( p1 :>
+      [ term |-> 1,
+        wal |-> <<>>,
+        state |-> "leader",
+        votes |->
+            ( a1 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] @@
+              a2 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>,
+        nextSendLsn |-> <<>> ] @@
+  p2 :>
+      [ term |-> 2,
+        wal |-> <<>>,
+        state |-> "campaign",
+        votes |-> <<>>,
+        termHistory |-> <<>>,
+        nextSendLsn |-> <<>> ] )
+/\ acc_state = ( a1 :>
+      [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@
+  a2 :>
+      [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@
+  a3 :>
+      [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] )
+/\ committed = {}
+
+State 6: <Vote(p2,a1) line 207, col 2 to line 214, col 27 of module ProposerAcceptorStatic>
+/\ prop_state = ( p1 :>
+      [ term |-> 1,
+        wal |-> <<>>,
+        state |-> "leader",
+        votes |->
+            ( a1 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] @@
+              a2 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>,
+        nextSendLsn |-> <<>> ] @@
+  p2 :>
+      [ term |-> 2,
+        wal |-> <<>>,
+        state |-> "campaign",
+        votes |->
+            ( a1 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |-> <<>>,
+        nextSendLsn |-> <<>> ] )
+/\ acc_state = ( a1 :>
+      [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@
+  a2 :>
+      [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@
+  a3 :>
+      [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] )
+/\ committed = {}
+
+State 7: <TruncateWal(p1,a2) line 280, col 3 to line 293, col 33 of module ProposerAcceptorStatic>
+/\ prop_state = ( p1 :>
+      [ term |-> 1,
+        wal |-> <<>>,
+        state |-> "leader",
+        votes |->
+            ( a1 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] @@
+              a2 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>,
+        nextSendLsn |-> (a2 :> 1) ] @@
+  p2 :>
+      [ term |-> 2,
+        wal |-> <<>>,
+        state |-> "campaign",
+        votes |->
+            ( a1 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |-> <<>>,
+        nextSendLsn |-> <<>> ] )
+/\ acc_state = ( a1 :>
+      [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@
+  a2 :>
+      [ term |-> 1,
+        wal |-> <<>>,
+        termHistory |->
+            <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@
+  a3 :>
+      [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] )
+/\ committed = {}
+
+State 8: <Vote(p2,a3) line 207, col 2 to line 214, col 27 of module ProposerAcceptorStatic>
+/\ prop_state = ( p1 :>
+      [ term |-> 1,
+        wal |-> <<>>,
+        state |-> "leader",
+        votes |->
+            ( a1 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] @@
+              a2 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>,
+        nextSendLsn |-> (a2 :> 1) ] @@
+  p2 :>
+      [ term |-> 2,
+        wal |-> <<>>,
+        state |-> "campaign",
+        votes |->
+            ( a1 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] @@
+              a3 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |-> <<>>,
+        nextSendLsn |-> <<>> ] )
+/\ acc_state = ( a1 :>
+      [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@
+  a2 :>
+      [ term |-> 1,
+        wal |-> <<>>,
+        termHistory |->
+            <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@
+  a3 :>
+      [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] )
+/\ committed = {}
+
+State 9: <BecomeLeader(p2) line 222, col 3 to line 245, col 44 of module ProposerAcceptorStatic>
+/\ prop_state = ( p1 :>
+      [ term |-> 1,
+        wal |-> <<>>,
+        state |-> "leader",
+        votes |->
+            ( a1 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] @@
+              a2 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>,
+        nextSendLsn |-> (a2 :> 1) ] @@
+  p2 :>
+      [ term |-> 2,
+        wal |-> <<>>,
+        state |-> "leader",
+        votes |->
+            ( a1 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] @@
+              a3 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>,
+        nextSendLsn |-> <<>> ] )
+/\ acc_state = ( a1 :>
+      [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@
+  a2 :>
+      [ term |-> 1,
+        wal |-> <<>>,
+        termHistory |->
+            <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@
+  a3 :>
+      [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] )
+/\ committed = {}
+
+State 10: <TruncateWal(p2,a1) line 280, col 3 to line 293, col 33 of module ProposerAcceptorStatic>
+/\ prop_state = ( p1 :>
+      [ term |-> 1,
+        wal |-> <<>>,
+        state |-> "leader",
+        votes |->
+            ( a1 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] @@
+              a2 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>,
+        nextSendLsn |-> (a2 :> 1) ] @@
+  p2 :>
+      [ term |-> 2,
+        wal |-> <<>>,
+        state |-> "leader",
+        votes |->
+            ( a1 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] @@
+              a3 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>,
+        nextSendLsn |-> (a1 :> 1) ] )
+/\ acc_state = ( a1 :>
+      [ term |-> 2,
+        wal |-> <<>>,
+        termHistory |->
+            <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@
+  a2 :>
+      [ term |-> 1,
+        wal |-> <<>>,
+        termHistory |->
+            <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@
+  a3 :>
+      [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] )
+/\ committed = {}
+
+State 11: <RestartProposer(p2,{a1, a2}) line 188, col 3 to line 198, col 44 of module ProposerAcceptorStatic>
+/\ prop_state = ( p1 :>
+      [ term |-> 1,
+        wal |-> <<>>,
+        state |-> "leader",
+        votes |->
+            ( a1 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] @@
+              a2 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>,
+        nextSendLsn |-> (a2 :> 1) ] @@
+  p2 :>
+      [ term |-> 3,
+        wal |-> <<>>,
+        state |-> "campaign",
+        votes |-> <<>>,
+        termHistory |-> <<>>,
+        nextSendLsn |-> <<>> ] )
+/\ acc_state = ( a1 :>
+      [ term |-> 2,
+        wal |-> <<>>,
+        termHistory |->
+            <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@
+  a2 :>
+      [ term |-> 1,
+        wal |-> <<>>,
+        termHistory |->
+            <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@
+  a3 :>
+      [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] )
+/\ committed = {}
+
+State 12: <Vote(p2,a3) line 207, col 2 to line 214, col 27 of module ProposerAcceptorStatic>
+/\ prop_state = ( p1 :>
+      [ term |-> 1,
+        wal |-> <<>>,
+        state |-> "leader",
+        votes |->
+            ( a1 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] @@
+              a2 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>,
+        nextSendLsn |-> (a2 :> 1) ] @@
+  p2 :>
+      [ term |-> 3,
+        wal |-> <<>>,
+        state |-> "campaign",
+        votes |->
+            ( a3 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |-> <<>>,
+        nextSendLsn |-> <<>> ] )
+/\ acc_state = ( a1 :>
+      [ term |-> 2,
+        wal |-> <<>>,
+        termHistory |->
+            <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@
+  a2 :>
+      [ term |-> 1,
+        wal |-> <<>>,
+        termHistory |->
+            <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@
+  a3 :>
+      [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] )
+/\ committed = {}
+
+State 13: <NewEntry(p1) line 297, col 3 to line 303, col 44 of module ProposerAcceptorStatic>
+/\ prop_state = ( p1 :>
+      [ term |-> 1,
+        wal |-> <<1>>,
+        state |-> "leader",
+        votes |->
+            ( a1 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] @@
+              a2 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>,
+        nextSendLsn |-> (a2 :> 1) ] @@
+  p2 :>
+      [ term |-> 3,
+        wal |-> <<>>,
+        state |-> "campaign",
+        votes |->
+            ( a3 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |-> <<>>,
+        nextSendLsn |-> <<>> ] )
+/\ acc_state = ( a1 :>
+      [ term |-> 2,
+        wal |-> <<>>,
+        termHistory |->
+            <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@
+  a2 :>
+      [ term |-> 1,
+        wal |-> <<>>,
+        termHistory |->
+            <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@
+  a3 :>
+      [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] )
+/\ committed = {}
+
+State 14: <NewEntry(p1) line 297, col 3 to line 303, col 44 of module ProposerAcceptorStatic>
+/\ prop_state = ( p1 :>
+      [ term |-> 1,
+        wal |-> <<1, 1>>,
+        state |-> "leader",
+        votes |->
+            ( a1 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] @@
+              a2 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>,
+        nextSendLsn |-> (a2 :> 1) ] @@
+  p2 :>
+      [ term |-> 3,
+        wal |-> <<>>,
+        state |-> "campaign",
+        votes |->
+            ( a3 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |-> <<>>,
+        nextSendLsn |-> <<>> ] )
+/\ acc_state = ( a1 :>
+      [ term |-> 2,
+        wal |-> <<>>,
+        termHistory |->
+            <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@
+  a2 :>
+      [ term |-> 1,
+        wal |-> <<>>,
+        termHistory |->
+            <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@
+  a3 :>
+      [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] )
+/\ committed = {}
+
+State 15: <AppendEntry(p1,a2) line 307, col 3 to line 319, col 33 of module ProposerAcceptorStatic>
+/\ prop_state = ( p1 :>
+      [ term |-> 1,
+        wal |-> <<1, 1>>,
+        state |-> "leader",
+        votes |->
+            ( a1 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] @@
+              a2 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>,
+        nextSendLsn |-> (a2 :> 2) ] @@
+  p2 :>
+      [ term |-> 3,
+        wal |-> <<>>,
+        state |-> "campaign",
+        votes |->
+            ( a3 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |-> <<>>,
+        nextSendLsn |-> <<>> ] )
+/\ acc_state = ( a1 :>
+      [ term |-> 2,
+        wal |-> <<>>,
+        termHistory |->
+            <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@
+  a2 :>
+      [ term |-> 1,
+        wal |-> <<1>>,
+        termHistory |->
+            <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@
+  a3 :>
+      [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] )
+/\ committed = {}
+
+State 16: <AppendEntry(p1,a2) line 307, col 3 to line 319, col 33 of module ProposerAcceptorStatic>
+/\ prop_state = ( p1 :>
+      [ term |-> 1,
+        wal |-> <<1, 1>>,
+        state |-> "leader",
+        votes |->
+            ( a1 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] @@
+              a2 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>,
+        nextSendLsn |-> (a2 :> 3) ] @@
+  p2 :>
+      [ term |-> 3,
+        wal |-> <<>>,
+        state |-> "campaign",
+        votes |->
+            ( a3 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |-> <<>>,
+        nextSendLsn |-> <<>> ] )
+/\ acc_state = ( a1 :>
+      [ term |-> 2,
+        wal |-> <<>>,
+        termHistory |->
+            <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@
+  a2 :>
+      [ term |-> 1,
+        wal |-> <<1, 1>>,
+        termHistory |->
+            <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@
+  a3 :>
+      [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] )
+/\ committed = {}
+
+State 17: <RestartProposer(p1,{a1, a3}) line 188, col 3 to line 198, col 44 of module ProposerAcceptorStatic>
+/\ prop_state = ( p1 :>
+      [ term |-> 4,
+        wal |-> <<>>,
+        state |-> "campaign",
+        votes |-> <<>>,
+        termHistory |-> <<>>,
+        nextSendLsn |-> <<>> ] @@
+  p2 :>
+      [ term |-> 3,
+        wal |-> <<>>,
+        state |-> "campaign",
+        votes |->
+            ( a3 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |-> <<>>,
+        nextSendLsn |-> <<>> ] )
+/\ acc_state = ( a1 :>
+      [ term |-> 2,
+        wal |-> <<>>,
+        termHistory |->
+            <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@
+  a2 :>
+      [ term |-> 1,
+        wal |-> <<1, 1>>,
+        termHistory |->
+            <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@
+  a3 :>
+      [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] )
+/\ committed = {}
+
+State 18: <Vote(p1,a1) line 207, col 2 to line 214, col 27 of module ProposerAcceptorStatic>
+/\ prop_state = ( p1 :>
+      [ term |-> 4,
+        wal |-> <<>>,
+        state |-> "campaign",
+        votes |->
+            ( a1 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |-> <<>>,
+        nextSendLsn |-> <<>> ] @@
+  p2 :>
+      [ term |-> 3,
+        wal |-> <<>>,
+        state |-> "campaign",
+        votes |->
+            ( a3 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |-> <<>>,
+        nextSendLsn |-> <<>> ] )
+/\ acc_state = ( a1 :>
+      [ term |-> 4,
+        wal |-> <<>>,
+        termHistory |->
+            <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@
+  a2 :>
+      [ term |-> 1,
+        wal |-> <<1, 1>>,
+        termHistory |->
+            <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@
+  a3 :>
+      [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] )
+/\ committed = {}
+
+State 19: <Vote(p2,a2) line 207, col 2 to line 214, col 27 of module ProposerAcceptorStatic>
+/\ prop_state = ( p1 :>
+      [ term |-> 4,
+        wal |-> <<>>,
+        state |-> "campaign",
+        votes |->
+            ( a1 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |-> <<>>,
+        nextSendLsn |-> <<>> ] @@
+  p2 :>
+      [ term |-> 3,
+        wal |-> <<>>,
+        state |-> "campaign",
+        votes |->
+            ( a2 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>,
+                    flushLsn |-> 3 ] @@
+              a3 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |-> <<>>,
+        nextSendLsn |-> <<>> ] )
+/\ acc_state = ( a1 :>
+      [ term |-> 4,
+        wal |-> <<>>,
+        termHistory |->
+            <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@
+  a2 :>
+      [ term |-> 3,
+        wal |-> <<1, 1>>,
+        termHistory |->
+            <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@
+  a3 :>
+      [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] )
+/\ committed = {}
+
+State 20: <BecomeLeader(p2) line 222, col 3 to line 245, col 44 of module ProposerAcceptorStatic>
+/\ prop_state = ( p1 :>
+      [ term |-> 4,
+        wal |-> <<>>,
+        state |-> "campaign",
+        votes |->
+            ( a1 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |-> <<>>,
+        nextSendLsn |-> <<>> ] @@
+  p2 :>
+      [ term |-> 3,
+        wal |-> <<1, 1>>,
+        state |-> "leader",
+        votes |->
+            ( a2 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>,
+                    flushLsn |-> 3 ] @@
+              a3 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 1, lsn |-> 1],
+               [term |-> 3, lsn |-> 3] >>,
+        nextSendLsn |-> <<>> ] )
+/\ acc_state = ( a1 :>
+      [ term |-> 4,
+        wal |-> <<>>,
+        termHistory |->
+            <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@
+  a2 :>
+      [ term |-> 3,
+        wal |-> <<1, 1>>,
+        termHistory |->
+            <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@
+  a3 :>
+      [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] )
+/\ committed = {}
+
+State 21: <TruncateWal(p2,a2) line 280, col 3 to line 293, col 33 of module ProposerAcceptorStatic>
+/\ prop_state = ( p1 :>
+      [ term |-> 4,
+        wal |-> <<>>,
+        state |-> "campaign",
+        votes |->
+            ( a1 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |-> <<>>,
+        nextSendLsn |-> <<>> ] @@
+  p2 :>
+      [ term |-> 3,
+        wal |-> <<1, 1>>,
+        state |-> "leader",
+        votes |->
+            ( a2 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>,
+                    flushLsn |-> 3 ] @@
+              a3 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 1, lsn |-> 1],
+               [term |-> 3, lsn |-> 3] >>,
+        nextSendLsn |-> (a2 :> 3) ] )
+/\ acc_state = ( a1 :>
+      [ term |-> 4,
+        wal |-> <<>>,
+        termHistory |->
+            <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@
+  a2 :>
+      [ term |-> 3,
+        wal |-> <<1, 1>>,
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 1, lsn |-> 1],
+               [term |-> 3, lsn |-> 3] >> ] @@
+  a3 :>
+      [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] )
+/\ committed = {}
+
+State 22: <TruncateWal(p2,a3) line 280, col 3 to line 293, col 33 of module ProposerAcceptorStatic>
+/\ prop_state = ( p1 :>
+      [ term |-> 4,
+        wal |-> <<>>,
+        state |-> "campaign",
+        votes |->
+            ( a1 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |-> <<>>,
+        nextSendLsn |-> <<>> ] @@
+  p2 :>
+      [ term |-> 3,
+        wal |-> <<1, 1>>,
+        state |-> "leader",
+        votes |->
+            ( a2 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>,
+                    flushLsn |-> 3 ] @@
+              a3 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 1, lsn |-> 1],
+               [term |-> 3, lsn |-> 3] >>,
+        nextSendLsn |-> (a2 :> 3 @@ a3 :> 1) ] )
+/\ acc_state = ( a1 :>
+      [ term |-> 4,
+        wal |-> <<>>,
+        termHistory |->
+            <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@
+  a2 :>
+      [ term |-> 3,
+        wal |-> <<1, 1>>,
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 1, lsn |-> 1],
+               [term |-> 3, lsn |-> 3] >> ] @@
+  a3 :>
+      [ term |-> 3,
+        wal |-> <<>>,
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 1, lsn |-> 1],
+               [term |-> 3, lsn |-> 3] >> ] )
+/\ committed = {}
+
+State 23: <AppendEntry(p2,a3) line 307, col 3 to line 319, col 33 of module ProposerAcceptorStatic>
+/\ prop_state = ( p1 :>
+      [ term |-> 4,
+        wal |-> <<>>,
+        state |-> "campaign",
+        votes |->
+            ( a1 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |-> <<>>,
+        nextSendLsn |-> <<>> ] @@
+  p2 :>
+      [ term |-> 3,
+        wal |-> <<1, 1>>,
+        state |-> "leader",
+        votes |->
+            ( a2 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>,
+                    flushLsn |-> 3 ] @@
+              a3 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 1, lsn |-> 1],
+               [term |-> 3, lsn |-> 3] >>,
+        nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] )
+/\ acc_state = ( a1 :>
+      [ term |-> 4,
+        wal |-> <<>>,
+        termHistory |->
+            <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@
+  a2 :>
+      [ term |-> 3,
+        wal |-> <<1, 1>>,
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 1, lsn |-> 1],
+               [term |-> 3, lsn |-> 3] >> ] @@
+  a3 :>
+      [ term |-> 3,
+        wal |-> <<1>>,
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 1, lsn |-> 1],
+               [term |-> 3, lsn |-> 3] >> ] )
+/\ committed = {}
+
+State 24: <CommitEntries(p2,{a2, a3}) line 329, col 3 to line 345, col 45 of module ProposerAcceptorStatic>
+/\ prop_state = ( p1 :>
+      [ term |-> 4,
+        wal |-> <<>>,
+        state |-> "campaign",
+        votes |->
+            ( a1 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |-> <<>>,
+        nextSendLsn |-> <<>> ] @@
+  p2 :>
+      [ term |-> 3,
+        wal |-> <<1, 1>>,
+        state |-> "leader",
+        votes |->
+            ( a2 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>,
+                    flushLsn |-> 3 ] @@
+              a3 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 1, lsn |-> 1],
+               [term |-> 3, lsn |-> 3] >>,
+        nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] )
+/\ acc_state = ( a1 :>
+      [ term |-> 4,
+        wal |-> <<>>,
+        termHistory |->
+            <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@
+  a2 :>
+      [ term |-> 3,
+        wal |-> <<1, 1>>,
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 1, lsn |-> 1],
+               [term |-> 3, lsn |-> 3] >> ] @@
+  a3 :>
+      [ term |-> 3,
+        wal |-> <<1>>,
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 1, lsn |-> 1],
+               [term |-> 3, lsn |-> 3] >> ] )
+/\ committed = {[term |-> 1, lsn |-> 1]}
+
+State 25: <Vote(p1,a3) line 207, col 2 to line 214, col 27 of module ProposerAcceptorStatic>
+/\ prop_state = ( p1 :>
+      [ term |-> 4,
+        wal |-> <<>>,
+        state |-> "campaign",
+        votes |->
+            ( a1 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] @@
+              a3 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>,
+                    flushLsn |-> 2 ] ),
+        termHistory |-> <<>>,
+        nextSendLsn |-> <<>> ] @@
+  p2 :>
+      [ term |-> 3,
+        wal |-> <<1, 1>>,
+        state |-> "leader",
+        votes |->
+            ( a2 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>,
+                    flushLsn |-> 3 ] @@
+              a3 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 1, lsn |-> 1],
+               [term |-> 3, lsn |-> 3] >>,
+        nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] )
+/\ acc_state = ( a1 :>
+      [ term |-> 4,
+        wal |-> <<>>,
+        termHistory |->
+            <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@
+  a2 :>
+      [ term |-> 3,
+        wal |-> <<1, 1>>,
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 1, lsn |-> 1],
+               [term |-> 3, lsn |-> 3] >> ] @@
+  a3 :>
+      [ term |-> 4,
+        wal |-> <<1>>,
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 1, lsn |-> 1],
+               [term |-> 3, lsn |-> 3] >> ] )
+/\ committed = {[term |-> 1, lsn |-> 1]}
+
+State 26: <BecomeLeader(p1) line 222, col 3 to line 245, col 44 of module ProposerAcceptorStatic>
+/\ prop_state = ( p1 :>
+      [ term |-> 4,
+        wal |-> <<>>,
+        state |-> "leader",
+        votes |->
+            ( a1 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] @@
+              a3 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>,
+                    flushLsn |-> 2 ] ),
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 2, lsn |-> 1],
+               [term |-> 4, lsn |-> 1] >>,
+        nextSendLsn |-> <<>> ] @@
+  p2 :>
+      [ term |-> 3,
+        wal |-> <<1, 1>>,
+        state |-> "leader",
+        votes |->
+            ( a2 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>,
+                    flushLsn |-> 3 ] @@
+              a3 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 1, lsn |-> 1],
+               [term |-> 3, lsn |-> 3] >>,
+        nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] )
+/\ acc_state = ( a1 :>
+      [ term |-> 4,
+        wal |-> <<>>,
+        termHistory |->
+            <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@
+  a2 :>
+      [ term |-> 3,
+        wal |-> <<1, 1>>,
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 1, lsn |-> 1],
+               [term |-> 3, lsn |-> 3] >> ] @@
+  a3 :>
+      [ term |-> 4,
+        wal |-> <<1>>,
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 1, lsn |-> 1],
+               [term |-> 3, lsn |-> 3] >> ] )
+/\ committed = {[term |-> 1, lsn |-> 1]}
+
+State 27: <TruncateWal(p1,a3) line 280, col 3 to line 293, col 33 of module ProposerAcceptorStatic>
+/\ prop_state = ( p1 :>
+      [ term |-> 4,
+        wal |-> <<>>,
+        state |-> "leader",
+        votes |->
+            ( a1 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] @@
+              a3 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>,
+                    flushLsn |-> 2 ] ),
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 2, lsn |-> 1],
+               [term |-> 4, lsn |-> 1] >>,
+        nextSendLsn |-> (a3 :> 1) ] @@
+  p2 :>
+      [ term |-> 3,
+        wal |-> <<1, 1>>,
+        state |-> "leader",
+        votes |->
+            ( a2 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>,
+                    flushLsn |-> 3 ] @@
+              a3 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 1, lsn |-> 1],
+               [term |-> 3, lsn |-> 3] >>,
+        nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] )
+/\ acc_state = ( a1 :>
+      [ term |-> 4,
+        wal |-> <<>>,
+        termHistory |->
+            <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@
+  a2 :>
+      [ term |-> 3,
+        wal |-> <<1, 1>>,
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 1, lsn |-> 1],
+               [term |-> 3, lsn |-> 3] >> ] @@
+  a3 :>
+      [ term |-> 4,
+        wal |-> <<>>,
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 2, lsn |-> 1],
+               [term |-> 4, lsn |-> 1] >> ] )
+/\ committed = {[term |-> 1, lsn |-> 1]}
+
+State 28: <NewEntry(p1) line 297, col 3 to line 303, col 44 of module ProposerAcceptorStatic>
+/\ prop_state = ( p1 :>
+      [ term |-> 4,
+        wal |-> <<4>>,
+        state |-> "leader",
+        votes |->
+            ( a1 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] @@
+              a3 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>,
+                    flushLsn |-> 2 ] ),
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 2, lsn |-> 1],
+               [term |-> 4, lsn |-> 1] >>,
+        nextSendLsn |-> (a3 :> 1) ] @@
+  p2 :>
+      [ term |-> 3,
+        wal |-> <<1, 1>>,
+        state |-> "leader",
+        votes |->
+            ( a2 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>,
+                    flushLsn |-> 3 ] @@
+              a3 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 1, lsn |-> 1],
+               [term |-> 3, lsn |-> 3] >>,
+        nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] )
+/\ acc_state = ( a1 :>
+      [ term |-> 4,
+        wal |-> <<>>,
+        termHistory |->
+            <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@
+  a2 :>
+      [ term |-> 3,
+        wal |-> <<1, 1>>,
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 1, lsn |-> 1],
+               [term |-> 3, lsn |-> 3] >> ] @@
+  a3 :>
+      [ term |-> 4,
+        wal |-> <<>>,
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 2, lsn |-> 1],
+               [term |-> 4, lsn |-> 1] >> ] )
+/\ committed = {[term |-> 1, lsn |-> 1]}
+
+State 29: <AppendEntry(p1,a3) line 307, col 3 to line 319, col 33 of module ProposerAcceptorStatic>
+/\ prop_state = ( p1 :>
+      [ term |-> 4,
+        wal |-> <<4>>,
+        state |-> "leader",
+        votes |->
+            ( a1 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] @@
+              a3 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>,
+                    flushLsn |-> 2 ] ),
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 2, lsn |-> 1],
+               [term |-> 4, lsn |-> 1] >>,
+        nextSendLsn |-> (a3 :> 2) ] @@
+  p2 :>
+      [ term |-> 3,
+        wal |-> <<1, 1>>,
+        state |-> "leader",
+        votes |->
+            ( a2 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>,
+                    flushLsn |-> 3 ] @@
+              a3 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 1, lsn |-> 1],
+               [term |-> 3, lsn |-> 3] >>,
+        nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] )
+/\ acc_state = ( a1 :>
+      [ term |-> 4,
+        wal |-> <<>>,
+        termHistory |->
+            <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@
+  a2 :>
+      [ term |-> 3,
+        wal |-> <<1, 1>>,
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 1, lsn |-> 1],
+               [term |-> 3, lsn |-> 3] >> ] @@
+  a3 :>
+      [ term |-> 4,
+        wal |-> <<4>>,
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 2, lsn |-> 1],
+               [term |-> 4, lsn |-> 1] >> ] )
+/\ committed = {[term |-> 1, lsn |-> 1]}
+
+State 30: <TruncateWal(p1,a1) line 280, col 3 to line 293, col 33 of module ProposerAcceptorStatic>
+/\ prop_state = ( p1 :>
+      [ term |-> 4,
+        wal |-> <<4>>,
+        state |-> "leader",
+        votes |->
+            ( a1 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] @@
+              a3 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>,
+                    flushLsn |-> 2 ] ),
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 2, lsn |-> 1],
+               [term |-> 4, lsn |-> 1] >>,
+        nextSendLsn |-> (a1 :> 1 @@ a3 :> 2) ] @@
+  p2 :>
+      [ term |-> 3,
+        wal |-> <<1, 1>>,
+        state |-> "leader",
+        votes |->
+            ( a2 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>,
+                    flushLsn |-> 3 ] @@
+              a3 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 1, lsn |-> 1],
+               [term |-> 3, lsn |-> 3] >>,
+        nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] )
+/\ acc_state = ( a1 :>
+      [ term |-> 4,
+        wal |-> <<>>,
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 2, lsn |-> 1],
+               [term |-> 4, lsn |-> 1] >> ] @@
+  a2 :>
+      [ term |-> 3,
+        wal |-> <<1, 1>>,
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 1, lsn |-> 1],
+               [term |-> 3, lsn |-> 3] >> ] @@
+  a3 :>
+      [ term |-> 4,
+        wal |-> <<4>>,
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 2, lsn |-> 1],
+               [term |-> 4, lsn |-> 1] >> ] )
+/\ committed = {[term |-> 1, lsn |-> 1]}
+
+State 31: <AppendEntry(p1,a1) line 307, col 3 to line 319, col 33 of module ProposerAcceptorStatic>
+/\ prop_state = ( p1 :>
+      [ term |-> 4,
+        wal |-> <<4>>,
+        state |-> "leader",
+        votes |->
+            ( a1 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] @@
+              a3 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>,
+                    flushLsn |-> 2 ] ),
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 2, lsn |-> 1],
+               [term |-> 4, lsn |-> 1] >>,
+        nextSendLsn |-> (a1 :> 2 @@ a3 :> 2) ] @@
+  p2 :>
+      [ term |-> 3,
+        wal |-> <<1, 1>>,
+        state |-> "leader",
+        votes |->
+            ( a2 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>,
+                    flushLsn |-> 3 ] @@
+              a3 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 1, lsn |-> 1],
+               [term |-> 3, lsn |-> 3] >>,
+        nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] )
+/\ acc_state = ( a1 :>
+      [ term |-> 4,
+        wal |-> <<4>>,
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 2, lsn |-> 1],
+               [term |-> 4, lsn |-> 1] >> ] @@
+  a2 :>
+      [ term |-> 3,
+        wal |-> <<1, 1>>,
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 1, lsn |-> 1],
+               [term |-> 3, lsn |-> 3] >> ] @@
+  a3 :>
+      [ term |-> 4,
+        wal |-> <<4>>,
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 2, lsn |-> 1],
+               [term |-> 4, lsn |-> 1] >> ] )
+/\ committed = {[term |-> 1, lsn |-> 1]}
+
+State 32: <CommitEntries(p1,{a1, a3}) line 329, col 3 to line 345, col 45 of module ProposerAcceptorStatic>
+/\ prop_state = ( p1 :>
+      [ term |-> 4,
+        wal |-> <<4>>,
+        state |-> "leader",
+        votes |->
+            ( a1 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] @@
+              a3 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>,
+                    flushLsn |-> 2 ] ),
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 2, lsn |-> 1],
+               [term |-> 4, lsn |-> 1] >>,
+        nextSendLsn |-> (a1 :> 2 @@ a3 :> 2) ] @@
+  p2 :>
+      [ term |-> 3,
+        wal |-> <<1, 1>>,
+        state |-> "leader",
+        votes |->
+            ( a2 :>
+                  [ termHistory |->
+                        <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>,
+                    flushLsn |-> 3 ] @@
+              a3 :>
+                  [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>,
+                    flushLsn |-> 1 ] ),
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 1, lsn |-> 1],
+               [term |-> 3, lsn |-> 3] >>,
+        nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] )
+/\ acc_state = ( a1 :>
+      [ term |-> 4,
+        wal |-> <<4>>,
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 2, lsn |-> 1],
+               [term |-> 4, lsn |-> 1] >> ] @@
+  a2 :>
+      [ term |-> 3,
+        wal |-> <<1, 1>>,
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 1, lsn |-> 1],
+               [term |-> 3, lsn |-> 3] >> ] @@
+  a3 :>
+      [ term |-> 4,
+        wal |-> <<4>>,
+        termHistory |->
+            << [term |-> 0, lsn |-> 1],
+               [term |-> 2, lsn |-> 1],
+               [term |-> 4, lsn |-> 1] >> ] )
+/\ committed = {[term |-> 1, lsn |-> 1], [term |-> 4, lsn |-> 1]}
+
+1712918117 states generated, 174460942 distinct states found, 50658619 states left on queue.
+The depth of the complete state graph search is 35.
+Finished in 58min 19s at (2024-11-06 15:18:45)
+Trace exploration spec path: ./MCProposerAcceptorStatic_TTrace_1730902825.tla
diff --git a/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--15-30-45.log b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--15-30-45.log
new file mode 100644
index 0000000000..8248240ded
--- /dev/null
+++ b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--15-30-45.log
@@ -0,0 +1,1374 @@
+git revision: 4f1ee6331
+Platform: Linux neon-dev-arm64-1 6.8.0-48-generic #48-Ubuntu SMP PREEMPT_DYNAMIC Fri Sep 27 14:35:45 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux
+CPU Info Linux: Neoverse-N1
+CPU Cores Linux: 80
+CPU Info Mac: 
+CPU Cores Mac: 
+Spec: MCProposerAcceptorStatic.tla
+Config: models/MCProposerAcceptorStatic_p2_a3_t4_l4.cfg
+----
+CONSTANTS
+NULL = NULL
+proposers = {p1, p2}
+acceptors = {a1, a2, a3}
+max_term = 4
+max_entries = 4
+SPECIFICATION Spec
+CONSTRAINT StateConstraint
+INVARIANT
+TypeOk
+ElectionSafety
+LogIsMonotonic
+LogSafety
+SYMMETRY ProposerAcceptorSymmetry
+CHECK_DEADLOCK FALSE
+ALIAS Alias
+
+----
+
+TLC2 Version 2.20 of Day Month 20?? (rev: f68cb71)
+Running breadth-first search Model-Checking with fp 84 and seed -1069171980999686913 with 80 workers on 80 cores with 54613MB heap and 61440MB offheap memory [pid: 62544] (Linux 6.8.0-48-generic aarch64, Ubuntu 21.0.4 x86_64, OffHeapDiskFPSet, DiskStateQueue).
+Parsing file /home/arseny/neon/safekeeper/spec/MCProposerAcceptorStatic.tla
+Parsing file /tmp/tlc-6542850091824737097/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla)
+Parsing file /home/arseny/neon/safekeeper/spec/ProposerAcceptorStatic.tla
+Parsing file /tmp/tlc-6542850091824737097/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla)
+Parsing file /tmp/tlc-6542850091824737097/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla)
+Parsing file /tmp/tlc-6542850091824737097/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla)
+Parsing file /tmp/tlc-6542850091824737097/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla)
+Parsing file /tmp/tlc-6542850091824737097/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla)
+Parsing file /tmp/tlc-6542850091824737097/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla)
+Semantic processing of module Naturals
+Semantic processing of module Sequences
+Semantic processing of module FiniteSets
+Semantic processing of module TLC
+Semantic processing of module Integers
+Semantic processing of module ProposerAcceptorStatic
+Semantic processing of module TLCExt
+Semantic processing of module _TLCTrace
+Semantic processing of module MCProposerAcceptorStatic
+Starting... (2024-11-06 15:30:45)
+Computing initial states...
+Finished computing initial states: 1 distinct state generated at 2024-11-06 15:30:48.
+Progress(20) at 2024-11-06 15:30:51: 956,386 states generated (956,386 s/min), 134,121 distinct states found (134,121 ds/min), 57,996 states left on queue.
+Progress(27) at 2024-11-06 15:31:51: 30,048,294 states generated (29,091,908 s/min), 3,778,849 distinct states found (3,644,728 ds/min), 1,463,715 states left on queue.
+Progress(28) at 2024-11-06 15:32:51: 59,092,248 states generated (29,043,954 s/min), 7,282,332 distinct states found (3,503,483 ds/min), 2,750,944 states left on queue.
+Progress(29) at 2024-11-06 15:33:51: 88,333,136 states generated (29,240,888 s/min), 10,694,325 distinct states found (3,411,993 ds/min), 3,955,744 states left on queue.
+Progress(29) at 2024-11-06 15:34:51: 117,708,994 states generated (29,375,858 s/min), 14,000,885 distinct states found (3,306,560 ds/min), 5,067,487 states left on queue.
+Progress(30) at 2024-11-06 15:35:51: 146,847,667 states generated (29,138,673 s/min), 17,407,824 distinct states found (3,406,939 ds/min), 6,258,337 states left on queue.
+Progress(30) at 2024-11-06 15:36:51: 176,211,801 states generated (29,364,134 s/min), 20,626,933 distinct states found (3,219,109 ds/min), 7,302,661 states left on queue.
+Progress(31) at 2024-11-06 15:37:51: 205,665,438 states generated (29,453,637 s/min), 23,877,622 distinct states found (3,250,689 ds/min), 8,361,004 states left on queue.
+Progress(31) at 2024-11-06 15:38:51: 234,757,357 states generated (29,091,919 s/min), 27,246,813 distinct states found (3,369,191 ds/min), 9,511,916 states left on queue.
+Progress(31) at 2024-11-06 15:39:51: 264,154,436 states generated (29,397,079 s/min), 30,383,069 distinct states found (3,136,256 ds/min), 10,494,238 states left on queue.
+Progress(31) at 2024-11-06 15:40:51: 293,638,121 states generated (29,483,685 s/min), 33,498,433 distinct states found (3,115,364 ds/min), 11,429,812 states left on queue.
+Progress(32) at 2024-11-06 15:41:51: 323,039,991 states generated (29,401,870 s/min), 36,709,338 distinct states found (3,210,905 ds/min), 12,463,752 states left on queue.
+Progress(32) at 2024-11-06 15:42:51: 352,081,458 states generated (29,041,467 s/min), 39,979,938 distinct states found (3,270,600 ds/min), 13,531,461 states left on queue.
+Progress(32) at 2024-11-06 15:43:51: 381,472,323 states generated (29,390,865 s/min), 43,147,359 distinct states found (3,167,421 ds/min), 14,513,444 states left on queue.
+Progress(32) at 2024-11-06 15:44:51: 410,911,764 states generated (29,439,441 s/min), 46,200,793 distinct states found (3,053,434 ds/min), 15,418,951 states left on queue.
+Progress(32) at 2024-11-06 15:45:51: 440,514,627 states generated (29,602,863 s/min), 49,210,279 distinct states found (3,009,486 ds/min), 16,263,879 states left on queue.
+Progress(33) at 2024-11-06 15:46:51: 470,070,180 states generated (29,555,553 s/min), 52,317,535 distinct states found (3,107,256 ds/min), 17,200,875 states left on queue.
+Progress(33) at 2024-11-06 15:47:51: 499,387,268 states generated (29,317,088 s/min), 55,489,376 distinct states found (3,171,841 ds/min), 18,196,719 states left on queue.
+Progress(33) at 2024-11-06 15:48:51: 528,308,354 states generated (28,921,086 s/min), 58,716,400 distinct states found (3,227,024 ds/min), 19,225,822 states left on queue.
+Progress(33) at 2024-11-06 15:49:51: 557,626,508 states generated (29,318,154 s/min), 61,861,039 distinct states found (3,144,639 ds/min), 20,172,391 states left on queue.
+Progress(33) at 2024-11-06 15:50:51: 587,011,551 states generated (29,385,043 s/min), 64,911,520 distinct states found (3,050,481 ds/min), 21,068,246 states left on queue.
+Progress(33) at 2024-11-06 15:51:51: 616,469,665 states generated (29,458,114 s/min), 67,862,377 distinct states found (2,950,857 ds/min), 21,888,495 states left on queue.
+Progress(33) at 2024-11-06 15:52:51: 646,037,901 states generated (29,568,236 s/min), 70,774,601 distinct states found (2,912,224 ds/min), 22,642,487 states left on queue.
+Progress(33) at 2024-11-06 15:53:51: 675,679,292 states generated (29,641,391 s/min), 73,753,124 distinct states found (2,978,523 ds/min), 23,459,982 states left on queue.
+Progress(34) at 2024-11-06 15:54:51: 705,213,119 states generated (29,533,827 s/min), 76,751,356 distinct states found (2,998,232 ds/min), 24,319,315 states left on queue.
+Progress(34) at 2024-11-06 15:55:51: 734,548,637 states generated (29,335,518 s/min), 79,865,504 distinct states found (3,114,148 ds/min), 25,270,867 states left on queue.
+Progress(34) at 2024-11-06 15:56:51: 763,724,351 states generated (29,175,714 s/min), 82,969,406 distinct states found (3,103,902 ds/min), 26,203,099 states left on queue.
+Progress(34) at 2024-11-06 15:57:51: 792,795,916 states generated (29,071,565 s/min), 86,092,913 distinct states found (3,123,507 ds/min), 27,124,641 states left on queue.
+Progress(34) at 2024-11-06 15:58:51: 822,084,221 states generated (29,288,305 s/min), 89,196,548 distinct states found (3,103,635 ds/min), 28,028,058 states left on queue.
+Progress(34) at 2024-11-06 15:59:51: 851,516,510 states generated (29,432,289 s/min), 92,135,078 distinct states found (2,938,530 ds/min), 28,822,750 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-06 16:00:51)
+Progress(34) at 2024-11-06 16:00:51: 880,891,436 states generated (29,374,926 s/min), 95,133,622 distinct states found (2,998,544 ds/min), 29,669,470 states left on queue.
+Progress(34) at 2024-11-06 16:01:51: 910,262,536 states generated (29,371,100 s/min), 98,019,631 distinct states found (2,886,009 ds/min), 30,433,293 states left on queue.
+Progress(34) at 2024-11-06 16:02:51: 939,689,255 states generated (29,426,719 s/min), 100,814,884 distinct states found (2,795,253 ds/min), 31,083,132 states left on queue.
+Progress(34) at 2024-11-06 16:03:51: 969,299,651 states generated (29,610,396 s/min), 103,664,772 distinct states found (2,849,888 ds/min), 31,821,093 states left on queue.
+Progress(34) at 2024-11-06 16:04:51: 999,051,292 states generated (29,751,641 s/min), 106,544,287 distinct states found (2,879,515 ds/min), 32,536,946 states left on queue.
+Progress(35) at 2024-11-06 16:05:51: 1,028,690,576 states generated (29,639,284 s/min), 109,444,362 distinct states found (2,900,075 ds/min), 33,326,316 states left on queue.
+Progress(35) at 2024-11-06 16:06:51: 1,058,155,400 states generated (29,464,824 s/min), 112,439,937 distinct states found (2,995,575 ds/min), 34,167,604 states left on queue.
+Progress(35) at 2024-11-06 16:07:51: 1,087,496,744 states generated (29,341,344 s/min), 115,461,649 distinct states found (3,021,712 ds/min), 35,032,974 states left on queue.
+Progress(35) at 2024-11-06 16:08:51: 1,116,663,767 states generated (29,167,023 s/min), 118,482,838 distinct states found (3,021,189 ds/min), 35,902,651 states left on queue.
+Progress(35) at 2024-11-06 16:09:51: 1,145,439,918 states generated (28,776,151 s/min), 121,562,159 distinct states found (3,079,321 ds/min), 36,785,088 states left on queue.
+Progress(35) at 2024-11-06 16:10:51: 1,174,812,354 states generated (29,372,436 s/min), 124,511,721 distinct states found (2,949,562 ds/min), 37,555,204 states left on queue.
+Progress(35) at 2024-11-06 16:11:51: 1,204,150,178 states generated (29,337,824 s/min), 127,579,155 distinct states found (3,067,434 ds/min), 38,425,790 states left on queue.
+Progress(35) at 2024-11-06 16:12:51: 1,233,620,353 states generated (29,470,175 s/min), 130,490,427 distinct states found (2,911,272 ds/min), 39,188,412 states left on queue.
+Progress(35) at 2024-11-06 16:13:51: 1,263,022,331 states generated (29,401,978 s/min), 133,317,160 distinct states found (2,826,733 ds/min), 39,893,070 states left on queue.
+Progress(35) at 2024-11-06 16:14:51: 1,292,411,979 states generated (29,389,648 s/min), 136,229,817 distinct states found (2,912,657 ds/min), 40,666,029 states left on queue.
+Progress(35) at 2024-11-06 16:15:51: 1,321,695,856 states generated (29,283,877 s/min), 139,081,910 distinct states found (2,852,093 ds/min), 41,389,715 states left on queue.
+Progress(35) at 2024-11-06 16:16:51: 1,351,045,560 states generated (29,349,704 s/min), 141,811,662 distinct states found (2,729,752 ds/min), 41,999,267 states left on queue.
+Progress(35) at 2024-11-06 16:17:51: 1,380,677,436 states generated (29,631,876 s/min), 144,516,072 distinct states found (2,704,410 ds/min), 42,579,779 states left on queue.
+Progress(35) at 2024-11-06 16:18:51: 1,410,332,660 states generated (29,655,224 s/min), 147,269,848 distinct states found (2,753,776 ds/min), 43,232,732 states left on queue.
+Progress(35) at 2024-11-06 16:19:51: 1,440,071,594 states generated (29,738,934 s/min), 150,116,683 distinct states found (2,846,835 ds/min), 43,917,859 states left on queue.
+Progress(35) at 2024-11-06 16:20:51: 1,469,737,942 states generated (29,666,348 s/min), 152,881,605 distinct states found (2,764,922 ds/min), 44,594,909 states left on queue.
+Progress(36) at 2024-11-06 16:21:51: 1,499,124,482 states generated (29,386,540 s/min), 155,722,313 distinct states found (2,840,708 ds/min), 45,306,186 states left on queue.
+Progress(36) at 2024-11-06 16:22:51: 1,528,616,635 states generated (29,492,153 s/min), 158,643,911 distinct states found (2,921,598 ds/min), 46,098,600 states left on queue.
+Progress(36) at 2024-11-06 16:23:51: 1,557,820,328 states generated (29,203,693 s/min), 161,651,516 distinct states found (3,007,605 ds/min), 46,958,572 states left on queue.
+Progress(36) at 2024-11-06 16:24:51: 1,587,341,565 states generated (29,521,237 s/min), 164,469,424 distinct states found (2,817,908 ds/min), 47,648,932 states left on queue.
+Progress(36) at 2024-11-06 16:25:51: 1,616,246,807 states generated (28,905,242 s/min), 167,471,199 distinct states found (3,001,775 ds/min), 48,496,844 states left on queue.
+Progress(36) at 2024-11-06 16:26:51: 1,645,107,613 states generated (28,860,806 s/min), 170,454,103 distinct states found (2,982,904 ds/min), 49,283,244 states left on queue.
+Progress(36) at 2024-11-06 16:27:51: 1,674,492,314 states generated (29,384,701 s/min), 173,343,045 distinct states found (2,888,942 ds/min), 50,006,895 states left on queue.
+Progress(36) at 2024-11-06 16:28:51: 1,703,875,027 states generated (29,382,713 s/min), 176,157,623 distinct states found (2,814,578 ds/min), 50,662,128 states left on queue.
+Progress(36) at 2024-11-06 16:29:51: 1,733,099,131 states generated (29,224,104 s/min), 179,186,519 distinct states found (3,028,896 ds/min), 51,498,029 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-06 16:30:52)
+Progress(36) at 2024-11-06 16:30:52: 1,762,724,622 states generated (29,625,491 s/min), 181,958,595 distinct states found (2,772,076 ds/min), 52,142,450 states left on queue.
+Progress(36) at 2024-11-06 16:31:52: 1,792,118,288 states generated (29,393,666 s/min), 184,725,090 distinct states found (2,766,495 ds/min), 52,785,705 states left on queue.
+Progress(36) at 2024-11-06 16:32:52: 1,821,258,069 states generated (29,139,781 s/min), 187,681,452 distinct states found (2,956,362 ds/min), 53,592,610 states left on queue.
+Progress(36) at 2024-11-06 16:33:52: 1,850,729,054 states generated (29,470,985 s/min), 190,451,722 distinct states found (2,770,270 ds/min), 54,239,919 states left on queue.
+Progress(36) at 2024-11-06 16:34:52: 1,879,860,913 states generated (29,131,859 s/min), 193,207,770 distinct states found (2,756,048 ds/min), 54,886,748 states left on queue.
+Progress(36) at 2024-11-06 16:35:52: 1,909,200,565 states generated (29,339,652 s/min), 195,832,123 distinct states found (2,624,353 ds/min), 55,404,535 states left on queue.
+Progress(36) at 2024-11-06 16:36:52: 1,938,403,873 states generated (29,203,308 s/min), 198,569,916 distinct states found (2,737,793 ds/min), 55,993,675 states left on queue.
+Progress(36) at 2024-11-06 16:37:52: 1,968,097,695 states generated (29,693,822 s/min), 201,148,799 distinct states found (2,578,883 ds/min), 56,501,179 states left on queue.
+Progress(36) at 2024-11-06 16:38:52: 1,997,628,304 states generated (29,530,609 s/min), 203,860,765 distinct states found (2,711,966 ds/min), 57,133,283 states left on queue.
+Progress(36) at 2024-11-06 16:39:52: 2,027,338,755 states generated (29,710,451 s/min), 206,496,491 distinct states found (2,635,726 ds/min), 57,649,914 states left on queue.
+Progress(36) at 2024-11-06 16:40:52: 2,057,072,538 states generated (29,733,783 s/min), 209,189,488 distinct states found (2,692,997 ds/min), 58,229,449 states left on queue.
+Progress(36) at 2024-11-06 16:41:52: 2,086,549,250 states generated (29,476,712 s/min), 211,909,869 distinct states found (2,720,381 ds/min), 58,875,611 states left on queue.
+Progress(37) at 2024-11-06 16:42:52: 2,115,953,926 states generated (29,404,676 s/min), 214,630,876 distinct states found (2,721,007 ds/min), 59,494,220 states left on queue.
+Progress(37) at 2024-11-06 16:43:52: 2,145,423,196 states generated (29,469,270 s/min), 217,412,888 distinct states found (2,782,012 ds/min), 60,176,423 states left on queue.
+Progress(37) at 2024-11-06 16:44:52: 2,174,796,796 states generated (29,373,600 s/min), 220,316,140 distinct states found (2,903,252 ds/min), 60,925,815 states left on queue.
+Progress(37) at 2024-11-06 16:45:52: 2,203,907,384 states generated (29,110,588 s/min), 223,255,125 distinct states found (2,938,985 ds/min), 61,739,564 states left on queue.
+Progress(37) at 2024-11-06 16:46:52: 2,233,378,272 states generated (29,470,888 s/min), 225,995,858 distinct states found (2,740,733 ds/min), 62,364,627 states left on queue.
+Progress(37) at 2024-11-06 16:47:52: 2,262,648,334 states generated (29,270,062 s/min), 228,738,653 distinct states found (2,742,795 ds/min), 63,003,155 states left on queue.
+Progress(37) at 2024-11-06 16:48:52: 2,291,309,648 states generated (28,661,314 s/min), 231,720,498 distinct states found (2,981,845 ds/min), 63,816,162 states left on queue.
+Progress(37) at 2024-11-06 16:49:52: 2,320,153,384 states generated (28,843,736 s/min), 234,599,475 distinct states found (2,878,977 ds/min), 64,513,886 states left on queue.
+Progress(37) at 2024-11-06 16:50:52: 2,349,538,907 states generated (29,385,523 s/min), 237,330,640 distinct states found (2,731,165 ds/min), 65,105,576 states left on queue.
+Progress(37) at 2024-11-06 16:51:52: 2,379,015,082 states generated (29,476,175 s/min), 240,064,625 distinct states found (2,733,985 ds/min), 65,704,108 states left on queue.
+Progress(37) at 2024-11-06 16:52:52: 2,408,376,582 states generated (29,361,500 s/min), 242,869,889 distinct states found (2,805,264 ds/min), 66,339,299 states left on queue.
+Progress(37) at 2024-11-06 16:53:52: 2,437,554,516 states generated (29,177,934 s/min), 245,844,106 distinct states found (2,974,217 ds/min), 67,125,834 states left on queue.
+Progress(37) at 2024-11-06 16:54:52: 2,466,925,193 states generated (29,370,677 s/min), 248,540,587 distinct states found (2,696,481 ds/min), 67,707,623 states left on queue.
+Progress(37) at 2024-11-06 16:55:52: 2,496,386,977 states generated (29,461,784 s/min), 251,318,893 distinct states found (2,778,306 ds/min), 68,345,796 states left on queue.
+Progress(37) at 2024-11-06 16:56:52: 2,525,837,965 states generated (29,450,988 s/min), 253,918,986 distinct states found (2,600,093 ds/min), 68,851,521 states left on queue.
+Progress(37) at 2024-11-06 16:57:52: 2,555,073,687 states generated (29,235,722 s/min), 256,806,753 distinct states found (2,887,767 ds/min), 69,596,597 states left on queue.
+Progress(37) at 2024-11-06 16:58:52: 2,584,381,294 states generated (29,307,607 s/min), 259,714,054 distinct states found (2,907,301 ds/min), 70,335,539 states left on queue.
+Progress(37) at 2024-11-06 16:59:52: 2,613,557,081 states generated (29,175,787 s/min), 262,407,462 distinct states found (2,693,408 ds/min), 70,920,265 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-06 17:00:53)
+Progress(37) at 2024-11-06 17:00:53: 2,643,168,141 states generated (29,611,060 s/min), 264,973,171 distinct states found (2,565,709 ds/min), 71,384,749 states left on queue.
+Progress(37) at 2024-11-06 17:01:53: 2,672,453,868 states generated (29,285,727 s/min), 267,551,971 distinct states found (2,578,800 ds/min), 71,854,220 states left on queue.
+Progress(37) at 2024-11-06 17:02:53: 2,701,696,399 states generated (29,242,531 s/min), 270,233,135 distinct states found (2,681,164 ds/min), 72,406,567 states left on queue.
+Progress(37) at 2024-11-06 17:03:53: 2,731,216,488 states generated (29,520,089 s/min), 272,711,390 distinct states found (2,478,255 ds/min), 72,805,269 states left on queue.
+Progress(37) at 2024-11-06 17:04:53: 2,760,788,758 states generated (29,572,270 s/min), 275,307,217 distinct states found (2,595,827 ds/min), 73,313,123 states left on queue.
+Progress(37) at 2024-11-06 17:05:53: 2,790,339,552 states generated (29,550,794 s/min), 277,881,113 distinct states found (2,573,896 ds/min), 73,833,900 states left on queue.
+Progress(37) at 2024-11-06 17:06:53: 2,820,046,206 states generated (29,706,654 s/min), 280,371,086 distinct states found (2,489,973 ds/min), 74,231,258 states left on queue.
+Progress(37) at 2024-11-06 17:07:53: 2,849,787,753 states generated (29,741,547 s/min), 283,097,131 distinct states found (2,726,045 ds/min), 74,814,735 states left on queue.
+Progress(37) at 2024-11-06 17:08:53: 2,879,520,949 states generated (29,733,196 s/min), 285,608,053 distinct states found (2,510,922 ds/min), 75,293,894 states left on queue.
+Progress(37) at 2024-11-06 17:09:53: 2,908,889,760 states generated (29,368,811 s/min), 288,274,872 distinct states found (2,666,819 ds/min), 75,880,480 states left on queue.
+Progress(38) at 2024-11-06 17:10:53: 2,938,412,523 states generated (29,522,763 s/min), 290,877,598 distinct states found (2,602,726 ds/min), 76,391,156 states left on queue.
+Progress(38) at 2024-11-06 17:11:53: 2,967,963,455 states generated (29,550,932 s/min), 293,492,146 distinct states found (2,614,548 ds/min), 76,932,124 states left on queue.
+Progress(38) at 2024-11-06 17:12:53: 2,997,327,370 states generated (29,363,915 s/min), 296,353,306 distinct states found (2,861,160 ds/min), 77,659,606 states left on queue.
+Progress(38) at 2024-11-06 17:13:53: 3,026,713,138 states generated (29,385,768 s/min), 299,173,963 distinct states found (2,820,657 ds/min), 78,342,645 states left on queue.
+Progress(38) at 2024-11-06 17:14:53: 3,055,986,492 states generated (29,273,354 s/min), 302,024,049 distinct states found (2,850,086 ds/min), 79,071,501 states left on queue.
+Progress(38) at 2024-11-06 17:15:53: 3,085,491,974 states generated (29,505,482 s/min), 304,668,970 distinct states found (2,644,921 ds/min), 79,608,084 states left on queue.
+Progress(38) at 2024-11-06 17:16:53: 3,114,898,266 states generated (29,406,292 s/min), 307,272,526 distinct states found (2,603,556 ds/min), 80,132,575 states left on queue.
+Progress(38) at 2024-11-06 17:17:53: 3,144,023,490 states generated (29,125,224 s/min), 310,022,073 distinct states found (2,749,547 ds/min), 80,777,238 states left on queue.
+Progress(38) at 2024-11-06 17:18:53: 3,172,762,795 states generated (28,739,305 s/min), 312,891,905 distinct states found (2,869,832 ds/min), 81,497,739 states left on queue.
+Progress(38) at 2024-11-06 17:19:53: 3,201,314,425 states generated (28,551,630 s/min), 315,766,566 distinct states found (2,874,661 ds/min), 82,171,729 states left on queue.
+Progress(38) at 2024-11-06 17:20:53: 3,230,713,777 states generated (29,399,352 s/min), 318,365,612 distinct states found (2,599,046 ds/min), 82,638,018 states left on queue.
+Progress(38) at 2024-11-06 17:21:53: 3,260,188,634 states generated (29,474,857 s/min), 321,040,810 distinct states found (2,675,198 ds/min), 83,185,708 states left on queue.
+Progress(38) at 2024-11-06 17:22:53: 3,289,654,456 states generated (29,465,822 s/min), 323,660,313 distinct states found (2,619,503 ds/min), 83,689,075 states left on queue.
+Progress(38) at 2024-11-06 17:23:53: 3,319,003,677 states generated (29,349,221 s/min), 326,391,347 distinct states found (2,731,034 ds/min), 84,261,368 states left on queue.
+Progress(38) at 2024-11-06 17:24:53: 3,348,330,685 states generated (29,327,008 s/min), 329,204,934 distinct states found (2,813,587 ds/min), 84,925,046 states left on queue.
+Progress(38) at 2024-11-06 17:25:53: 3,377,572,946 states generated (29,242,261 s/min), 331,997,887 distinct states found (2,792,953 ds/min), 85,533,473 states left on queue.
+Progress(38) at 2024-11-06 17:26:53: 3,406,881,714 states generated (29,308,768 s/min), 334,599,745 distinct states found (2,601,858 ds/min), 86,047,276 states left on queue.
+Progress(38) at 2024-11-06 17:27:53: 3,436,375,389 states generated (29,493,675 s/min), 337,261,572 distinct states found (2,661,827 ds/min), 86,591,357 states left on queue.
+Progress(38) at 2024-11-06 17:28:53: 3,465,811,732 states generated (29,436,343 s/min), 339,829,613 distinct states found (2,568,041 ds/min), 87,057,550 states left on queue.
+Progress(38) at 2024-11-06 17:29:53: 3,495,144,983 states generated (29,333,251 s/min), 342,566,275 distinct states found (2,736,662 ds/min), 87,671,131 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-06 17:30:53)
+Progress(38) at 2024-11-06 17:30:53: 3,524,611,246 states generated (29,466,263 s/min), 345,366,358 distinct states found (2,800,083 ds/min), 88,316,673 states left on queue.
+Progress(38) at 2024-11-06 17:31:53: 3,553,819,331 states generated (29,208,085 s/min), 348,291,666 distinct states found (2,925,308 ds/min), 89,059,679 states left on queue.
+Progress(38) at 2024-11-06 17:32:53: 3,583,208,821 states generated (29,389,490 s/min), 350,796,636 distinct states found (2,504,970 ds/min), 89,478,521 states left on queue.
+Progress(38) at 2024-11-06 17:33:53: 3,612,329,910 states generated (29,121,089 s/min), 353,414,448 distinct states found (2,617,812 ds/min), 90,008,568 states left on queue.
+Progress(38) at 2024-11-06 17:34:53: 3,641,485,253 states generated (29,155,343 s/min), 356,010,441 distinct states found (2,595,993 ds/min), 90,486,313 states left on queue.
+Progress(38) at 2024-11-06 17:35:53: 3,670,761,645 states generated (29,276,392 s/min), 358,411,973 distinct states found (2,401,532 ds/min), 90,799,029 states left on queue.
+Progress(38) at 2024-11-06 17:36:53: 3,700,008,207 states generated (29,246,562 s/min), 360,943,422 distinct states found (2,531,449 ds/min), 91,235,694 states left on queue.
+Progress(38) at 2024-11-06 17:37:53: 3,729,045,761 states generated (29,037,554 s/min), 363,523,499 distinct states found (2,580,077 ds/min), 91,685,579 states left on queue.
+Progress(38) at 2024-11-06 17:38:53: 3,758,697,262 states generated (29,651,501 s/min), 365,860,396 distinct states found (2,336,897 ds/min), 92,003,313 states left on queue.
+Progress(38) at 2024-11-06 17:39:53: 3,788,188,489 states generated (29,491,227 s/min), 368,369,398 distinct states found (2,509,002 ds/min), 92,452,083 states left on queue.
+Progress(38) at 2024-11-06 17:40:53: 3,817,718,772 states generated (29,530,283 s/min), 370,855,965 distinct states found (2,486,567 ds/min), 92,899,812 states left on queue.
+Progress(38) at 2024-11-06 17:41:53: 3,847,372,748 states generated (29,653,976 s/min), 373,231,774 distinct states found (2,375,809 ds/min), 93,202,503 states left on queue.
+Progress(38) at 2024-11-06 17:42:53: 3,877,091,950 states generated (29,719,202 s/min), 375,934,374 distinct states found (2,702,600 ds/min), 93,775,105 states left on queue.
+Progress(38) at 2024-11-06 17:43:53: 3,906,843,295 states generated (29,751,345 s/min), 378,304,497 distinct states found (2,370,123 ds/min), 94,098,611 states left on queue.
+Progress(38) at 2024-11-06 17:44:53: 3,936,304,033 states generated (29,460,738 s/min), 380,793,774 distinct states found (2,489,277 ds/min), 94,560,398 states left on queue.
+Progress(38) at 2024-11-06 17:45:53: 3,965,687,311 states generated (29,383,278 s/min), 383,366,376 distinct states found (2,572,602 ds/min), 95,062,163 states left on queue.
+Progress(38) at 2024-11-06 17:46:53: 3,995,264,758 states generated (29,577,447 s/min), 385,832,314 distinct states found (2,465,938 ds/min), 95,460,777 states left on queue.
+Progress(38) at 2024-11-06 17:47:53: 4,024,519,333 states generated (29,254,575 s/min), 388,384,282 distinct states found (2,551,968 ds/min), 95,931,698 states left on queue.
+Progress(38) at 2024-11-06 17:48:53: 4,054,053,752 states generated (29,534,419 s/min), 390,990,581 distinct states found (2,606,299 ds/min), 96,493,705 states left on queue.
+Progress(38) at 2024-11-06 17:49:53: 4,083,403,606 states generated (29,349,854 s/min), 393,717,328 distinct states found (2,726,747 ds/min), 97,099,592 states left on queue.
+Progress(38) at 2024-11-06 17:50:53: 4,112,753,694 states generated (29,350,088 s/min), 396,441,909 distinct states found (2,724,581 ds/min), 97,694,523 states left on queue.
+Progress(38) at 2024-11-06 17:51:53: 4,141,940,951 states generated (29,187,257 s/min), 399,238,612 distinct states found (2,796,703 ds/min), 98,387,103 states left on queue.
+Progress(38) at 2024-11-06 17:52:53: 4,171,185,273 states generated (29,244,322 s/min), 401,861,376 distinct states found (2,622,764 ds/min), 98,900,168 states left on queue.
+Progress(38) at 2024-11-06 17:53:53: 4,200,735,055 states generated (29,549,782 s/min), 404,419,627 distinct states found (2,558,251 ds/min), 99,388,507 states left on queue.
+Progress(38) at 2024-11-06 17:54:53: 4,230,057,902 states generated (29,322,847 s/min), 406,926,477 distinct states found (2,506,850 ds/min), 99,826,562 states left on queue.
+Progress(38) at 2024-11-06 17:55:53: 4,259,279,515 states generated (29,221,613 s/min), 409,512,606 distinct states found (2,586,129 ds/min), 100,340,214 states left on queue.
+Progress(38) at 2024-11-06 17:56:53: 4,288,265,663 states generated (28,986,148 s/min), 412,254,402 distinct states found (2,741,796 ds/min), 100,966,036 states left on queue.
+Progress(38) at 2024-11-06 17:57:53: 4,316,798,413 states generated (28,532,750 s/min), 415,047,481 distinct states found (2,793,079 ds/min), 101,589,869 states left on queue.
+Progress(38) at 2024-11-06 17:58:53: 4,345,527,290 states generated (28,728,877 s/min), 417,768,588 distinct states found (2,721,107 ds/min), 102,133,503 states left on queue.
+Progress(38) at 2024-11-06 17:59:53: 4,374,924,942 states generated (29,397,652 s/min), 420,254,082 distinct states found (2,485,494 ds/min), 102,500,461 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-06 18:00:54)
+Progress(38) at 2024-11-06 18:00:54: 4,404,604,911 states generated (29,679,969 s/min), 422,801,691 distinct states found (2,547,609 ds/min), 102,936,440 states left on queue.
+Progress(38) at 2024-11-06 18:01:54: 4,434,018,901 states generated (29,413,990 s/min), 425,477,119 distinct states found (2,675,428 ds/min), 103,472,987 states left on queue.
+Progress(38) at 2024-11-06 18:02:54: 4,463,498,297 states generated (29,479,396 s/min), 427,949,289 distinct states found (2,472,170 ds/min), 103,858,839 states left on queue.
+Progress(38) at 2024-11-06 18:03:54: 4,492,775,931 states generated (29,277,634 s/min), 430,592,094 distinct states found (2,642,805 ds/min), 104,353,609 states left on queue.
+Progress(38) at 2024-11-06 18:04:54: 4,522,002,300 states generated (29,226,369 s/min), 433,322,584 distinct states found (2,730,490 ds/min), 104,949,753 states left on queue.
+Progress(38) at 2024-11-06 18:05:54: 4,551,375,180 states generated (29,372,880 s/min), 436,005,138 distinct states found (2,682,554 ds/min), 105,482,546 states left on queue.
+Progress(38) at 2024-11-06 18:06:54: 4,580,718,169 states generated (29,342,989 s/min), 438,516,579 distinct states found (2,511,441 ds/min), 105,868,435 states left on queue.
+Progress(38) at 2024-11-06 18:07:54: 4,609,859,344 states generated (29,141,175 s/min), 441,134,700 distinct states found (2,618,121 ds/min), 106,390,335 states left on queue.
+Progress(38) at 2024-11-06 18:08:54: 4,639,331,150 states generated (29,471,806 s/min), 443,662,679 distinct states found (2,527,979 ds/min), 106,821,264 states left on queue.
+Progress(38) at 2024-11-06 18:09:54: 4,668,696,820 states generated (29,365,670 s/min), 446,222,969 distinct states found (2,560,290 ds/min), 107,277,508 states left on queue.
+Progress(38) at 2024-11-06 18:10:54: 4,698,140,829 states generated (29,444,009 s/min), 448,693,022 distinct states found (2,470,053 ds/min), 107,654,262 states left on queue.
+Progress(38) at 2024-11-06 18:11:54: 4,727,380,985 states generated (29,240,156 s/min), 451,459,276 distinct states found (2,766,254 ds/min), 108,284,101 states left on queue.
+Progress(38) at 2024-11-06 18:12:54: 4,756,654,088 states generated (29,273,103 s/min), 454,180,180 distinct states found (2,720,904 ds/min), 108,879,205 states left on queue.
+Progress(38) at 2024-11-06 18:13:54: 4,785,893,104 states generated (29,239,016 s/min), 457,001,077 distinct states found (2,820,897 ds/min), 109,511,015 states left on queue.
+Progress(38) at 2024-11-06 18:14:54: 4,815,289,339 states generated (29,396,235 s/min), 459,530,340 distinct states found (2,529,263 ds/min), 109,951,588 states left on queue.
+Progress(38) at 2024-11-06 18:15:54: 4,844,354,767 states generated (29,065,428 s/min), 462,144,567 distinct states found (2,614,227 ds/min), 110,455,692 states left on queue.
+Progress(38) at 2024-11-06 18:16:54: 4,873,381,465 states generated (29,026,698 s/min), 464,718,128 distinct states found (2,573,561 ds/min), 110,936,992 states left on queue.
+Progress(38) at 2024-11-06 18:17:54: 4,902,616,179 states generated (29,234,714 s/min), 467,171,620 distinct states found (2,453,492 ds/min), 111,288,450 states left on queue.
+Progress(38) at 2024-11-06 18:18:54: 4,931,808,383 states generated (29,192,204 s/min), 469,593,253 distinct states found (2,421,633 ds/min), 111,607,240 states left on queue.
+Progress(38) at 2024-11-06 18:19:54: 4,961,319,800 states generated (29,511,417 s/min), 471,795,067 distinct states found (2,201,814 ds/min), 111,770,077 states left on queue.
+Progress(38) at 2024-11-06 18:20:54: 4,990,051,892 states generated (28,732,092 s/min), 474,595,717 distinct states found (2,800,650 ds/min), 112,380,795 states left on queue.
+Progress(38) at 2024-11-06 18:21:54: 5,019,620,389 states generated (29,568,497 s/min), 476,860,178 distinct states found (2,264,461 ds/min), 112,610,789 states left on queue.
+Progress(38) at 2024-11-06 18:22:54: 5,049,176,225 states generated (29,555,836 s/min), 479,117,000 distinct states found (2,256,822 ds/min), 112,849,809 states left on queue.
+Progress(38) at 2024-11-06 18:23:54: 5,078,659,511 states generated (29,483,286 s/min), 481,552,566 distinct states found (2,435,566 ds/min), 113,238,679 states left on queue.
+Progress(38) at 2024-11-06 18:24:54: 5,108,186,428 states generated (29,526,917 s/min), 483,970,290 distinct states found (2,417,724 ds/min), 113,645,974 states left on queue.
+Progress(38) at 2024-11-06 18:25:54: 5,137,766,496 states generated (29,580,068 s/min), 486,204,445 distinct states found (2,234,155 ds/min), 113,816,273 states left on queue.
+Progress(38) at 2024-11-06 18:26:54: 5,167,429,477 states generated (29,662,981 s/min), 488,726,479 distinct states found (2,522,034 ds/min), 114,265,425 states left on queue.
+Progress(38) at 2024-11-06 18:27:54: 5,197,227,715 states generated (29,798,238 s/min), 491,213,848 distinct states found (2,487,369 ds/min), 114,645,624 states left on queue.
+Progress(38) at 2024-11-06 18:28:54: 5,226,883,420 states generated (29,655,705 s/min), 493,480,968 distinct states found (2,267,120 ds/min), 114,901,786 states left on queue.
+Progress(38) at 2024-11-06 18:29:54: 5,256,355,905 states generated (29,472,485 s/min), 495,866,549 distinct states found (2,385,581 ds/min), 115,277,276 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-06 18:30:55)
+Progress(38) at 2024-11-06 18:30:55: 5,286,035,252 states generated (29,679,347 s/min), 498,324,679 distinct states found (2,458,130 ds/min), 115,663,015 states left on queue.
+Progress(38) at 2024-11-06 18:31:55: 5,315,467,724 states generated (29,432,472 s/min), 500,723,577 distinct states found (2,398,898 ds/min), 116,023,619 states left on queue.
+Progress(38) at 2024-11-06 18:32:55: 5,344,728,453 states generated (29,260,729 s/min), 503,156,876 distinct states found (2,433,299 ds/min), 116,384,801 states left on queue.
+Progress(38) at 2024-11-06 18:33:55: 5,374,055,231 states generated (29,326,778 s/min), 505,588,957 distinct states found (2,432,081 ds/min), 116,786,679 states left on queue.
+Progress(38) at 2024-11-06 18:34:55: 5,403,566,278 states generated (29,511,047 s/min), 508,096,703 distinct states found (2,507,746 ds/min), 117,258,425 states left on queue.
+Progress(38) at 2024-11-06 18:35:55: 5,432,770,932 states generated (29,204,654 s/min), 510,765,370 distinct states found (2,668,667 ds/min), 117,821,443 states left on queue.
+Progress(38) at 2024-11-06 18:36:55: 5,462,325,607 states generated (29,554,675 s/min), 513,306,027 distinct states found (2,540,657 ds/min), 118,252,946 states left on queue.
+Progress(38) at 2024-11-06 18:37:55: 5,491,531,381 states generated (29,205,774 s/min), 516,017,383 distinct states found (2,711,356 ds/min), 118,857,035 states left on queue.
+Progress(38) at 2024-11-06 18:38:55: 5,520,744,572 states generated (29,213,191 s/min), 518,696,783 distinct states found (2,679,400 ds/min), 119,445,954 states left on queue.
+Progress(38) at 2024-11-06 18:39:55: 5,549,903,819 states generated (29,159,247 s/min), 521,329,662 distinct states found (2,632,879 ds/min), 119,977,569 states left on queue.
+Progress(38) at 2024-11-06 18:40:55: 5,579,474,839 states generated (29,571,020 s/min), 523,702,578 distinct states found (2,372,916 ds/min), 120,289,041 states left on queue.
+Progress(38) at 2024-11-06 18:41:55: 5,608,757,550 states generated (29,282,711 s/min), 526,191,629 distinct states found (2,489,051 ds/min), 120,719,632 states left on queue.
+Progress(38) at 2024-11-06 18:42:55: 5,638,085,090 states generated (29,327,540 s/min), 528,478,505 distinct states found (2,286,876 ds/min), 120,990,568 states left on queue.
+Progress(38) at 2024-11-06 18:43:55: 5,667,141,833 states generated (29,056,743 s/min), 531,035,593 distinct states found (2,557,088 ds/min), 121,480,763 states left on queue.
+Progress(38) at 2024-11-06 18:44:55: 5,696,139,104 states generated (28,997,271 s/min), 533,684,330 distinct states found (2,648,737 ds/min), 122,027,516 states left on queue.
+Progress(38) at 2024-11-06 18:45:55: 5,724,868,902 states generated (28,729,798 s/min), 536,316,715 distinct states found (2,632,385 ds/min), 122,548,317 states left on queue.
+Progress(38) at 2024-11-06 18:46:55: 5,753,438,871 states generated (28,569,969 s/min), 539,001,028 distinct states found (2,684,313 ds/min), 123,041,578 states left on queue.
+Progress(38) at 2024-11-06 18:47:55: 5,782,391,778 states generated (28,952,907 s/min), 541,537,259 distinct states found (2,536,231 ds/min), 123,436,184 states left on queue.
+Progress(38) at 2024-11-06 18:48:55: 5,811,823,996 states generated (29,432,218 s/min), 543,896,432 distinct states found (2,359,173 ds/min), 123,698,698 states left on queue.
+Progress(38) at 2024-11-06 18:49:55: 5,841,258,941 states generated (29,434,945 s/min), 546,273,191 distinct states found (2,376,759 ds/min), 124,012,754 states left on queue.
+Progress(38) at 2024-11-06 18:50:55: 5,870,667,995 states generated (29,409,054 s/min), 548,835,686 distinct states found (2,562,495 ds/min), 124,450,482 states left on queue.
+Progress(38) at 2024-11-06 18:51:55: 5,900,038,718 states generated (29,370,723 s/min), 551,304,457 distinct states found (2,468,771 ds/min), 124,805,220 states left on queue.
+Progress(38) at 2024-11-06 18:52:55: 5,929,442,421 states generated (29,403,703 s/min), 553,776,296 distinct states found (2,471,839 ds/min), 125,178,608 states left on queue.
+Progress(38) at 2024-11-06 18:53:55: 5,958,838,496 states generated (29,396,075 s/min), 556,289,762 distinct states found (2,513,466 ds/min), 125,588,158 states left on queue.
+Progress(38) at 2024-11-06 18:54:55: 5,988,187,325 states generated (29,348,829 s/min), 558,898,224 distinct states found (2,608,462 ds/min), 126,074,377 states left on queue.
+Progress(38) at 2024-11-06 18:55:55: 6,017,546,111 states generated (29,358,786 s/min), 561,530,468 distinct states found (2,632,244 ds/min), 126,579,784 states left on queue.
+Progress(38) at 2024-11-06 18:56:55: 6,046,777,143 states generated (29,231,032 s/min), 564,182,546 distinct states found (2,652,078 ds/min), 127,037,883 states left on queue.
+Progress(39) at 2024-11-06 18:57:55: 6,076,111,479 states generated (29,334,336 s/min), 566,509,898 distinct states found (2,327,352 ds/min), 127,319,036 states left on queue.
+Progress(39) at 2024-11-06 18:58:55: 6,105,215,668 states generated (29,104,189 s/min), 569,000,954 distinct states found (2,491,056 ds/min), 127,724,185 states left on queue.
+Progress(39) at 2024-11-06 18:59:55: 6,134,619,650 states generated (29,403,982 s/min), 571,444,199 distinct states found (2,443,245 ds/min), 128,083,849 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-06 19:00:55)
+Progress(39) at 2024-11-06 19:00:55: 6,164,303,226 states generated (29,683,576 s/min), 574,046,920 distinct states found (2,602,721 ds/min), 128,537,330 states left on queue.
+Progress(39) at 2024-11-06 19:01:55: 6,193,710,515 states generated (29,407,289 s/min), 576,294,161 distinct states found (2,247,241 ds/min), 128,749,186 states left on queue.
+Progress(39) at 2024-11-06 19:02:55: 6,223,050,437 states generated (29,339,922 s/min), 578,840,811 distinct states found (2,546,650 ds/min), 129,198,375 states left on queue.
+Progress(39) at 2024-11-06 19:03:55: 6,252,273,339 states generated (29,222,902 s/min), 581,530,481 distinct states found (2,689,670 ds/min), 129,745,195 states left on queue.
+Progress(39) at 2024-11-06 19:04:55: 6,281,535,213 states generated (29,261,874 s/min), 584,206,969 distinct states found (2,676,488 ds/min), 130,306,182 states left on queue.
+Progress(39) at 2024-11-06 19:05:55: 6,310,569,147 states generated (29,033,934 s/min), 587,031,959 distinct states found (2,824,990 ds/min), 130,922,629 states left on queue.
+Progress(39) at 2024-11-06 19:06:55: 6,339,951,741 states generated (29,382,594 s/min), 589,709,668 distinct states found (2,677,709 ds/min), 131,483,555 states left on queue.
+Progress(39) at 2024-11-06 19:07:55: 6,369,354,481 states generated (29,402,740 s/min), 591,964,654 distinct states found (2,254,986 ds/min), 131,688,532 states left on queue.
+Progress(39) at 2024-11-06 19:08:55: 6,398,254,591 states generated (28,900,110 s/min), 594,604,924 distinct states found (2,640,270 ds/min), 132,195,069 states left on queue.
+Progress(39) at 2024-11-06 19:09:55: 6,427,422,756 states generated (29,168,165 s/min), 597,059,083 distinct states found (2,454,159 ds/min), 132,571,626 states left on queue.
+Progress(39) at 2024-11-06 19:10:55: 6,456,469,721 states generated (29,046,965 s/min), 599,400,317 distinct states found (2,341,234 ds/min), 132,826,474 states left on queue.
+Progress(39) at 2024-11-06 19:11:55: 6,485,733,442 states generated (29,263,721 s/min), 602,040,336 distinct states found (2,640,019 ds/min), 133,286,664 states left on queue.
+Progress(39) at 2024-11-06 19:12:55: 6,515,001,998 states generated (29,268,556 s/min), 604,003,958 distinct states found (1,963,622 ds/min), 133,255,252 states left on queue.
+Progress(39) at 2024-11-06 19:13:55: 6,544,172,146 states generated (29,170,148 s/min), 606,473,164 distinct states found (2,469,206 ds/min), 133,627,323 states left on queue.
+Progress(39) at 2024-11-06 19:14:55: 6,572,975,355 states generated (28,803,209 s/min), 609,043,606 distinct states found (2,570,442 ds/min), 134,023,262 states left on queue.
+Progress(39) at 2024-11-06 19:15:55: 6,602,534,934 states generated (29,559,579 s/min), 611,212,652 distinct states found (2,169,046 ds/min), 134,205,070 states left on queue.
+Progress(39) at 2024-11-06 19:16:55: 6,632,044,851 states generated (29,509,917 s/min), 613,377,378 distinct states found (2,164,726 ds/min), 134,360,577 states left on queue.
+Progress(39) at 2024-11-06 19:17:55: 6,661,465,356 states generated (29,420,505 s/min), 615,729,605 distinct states found (2,352,227 ds/min), 134,679,148 states left on queue.
+Progress(39) at 2024-11-06 19:18:55: 6,690,848,776 states generated (29,383,420 s/min), 618,034,126 distinct states found (2,304,521 ds/min), 134,989,999 states left on queue.
+Progress(39) at 2024-11-06 19:19:55: 6,720,362,641 states generated (29,513,865 s/min), 620,264,990 distinct states found (2,230,864 ds/min), 135,213,527 states left on queue.
+Progress(39) at 2024-11-06 19:20:55: 6,749,995,972 states generated (29,633,331 s/min), 622,424,423 distinct states found (2,159,433 ds/min), 135,336,269 states left on queue.
+Progress(39) at 2024-11-06 19:21:55: 6,779,641,479 states generated (29,645,507 s/min), 624,953,002 distinct states found (2,528,579 ds/min), 135,781,717 states left on queue.
+Progress(39) at 2024-11-06 19:22:55: 6,809,496,805 states generated (29,855,326 s/min), 627,297,563 distinct states found (2,344,561 ds/min), 136,040,988 states left on queue.
+Progress(39) at 2024-11-06 19:23:55: 6,839,096,708 states generated (29,599,903 s/min), 629,464,688 distinct states found (2,167,125 ds/min), 136,210,971 states left on queue.
+Progress(39) at 2024-11-06 19:24:55: 6,868,614,311 states generated (29,517,603 s/min), 631,704,627 distinct states found (2,239,939 ds/min), 136,469,731 states left on queue.
+Progress(39) at 2024-11-06 19:25:55: 6,897,932,930 states generated (29,318,619 s/min), 633,961,042 distinct states found (2,256,415 ds/min), 136,714,912 states left on queue.
+Progress(39) at 2024-11-06 19:26:55: 6,927,200,602 states generated (29,267,672 s/min), 636,414,800 distinct states found (2,453,758 ds/min), 137,101,547 states left on queue.
+Progress(39) at 2024-11-06 19:27:55: 6,956,755,074 states generated (29,554,472 s/min), 638,616,489 distinct states found (2,201,689 ds/min), 137,285,238 states left on queue.
+Progress(39) at 2024-11-06 19:28:55: 6,985,926,285 states generated (29,171,211 s/min), 640,970,274 distinct states found (2,353,785 ds/min), 137,592,586 states left on queue.
+Progress(39) at 2024-11-06 19:29:55: 7,015,240,294 states generated (29,314,009 s/min), 643,310,280 distinct states found (2,340,006 ds/min), 137,914,322 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-06 19:30:56)
+Progress(39) at 2024-11-06 19:30:56: 7,045,112,039 states generated (29,871,745 s/min), 645,650,251 distinct states found (2,339,971 ds/min), 138,248,533 states left on queue.
+Progress(39) at 2024-11-06 19:31:56: 7,074,347,122 states generated (29,235,083 s/min), 648,286,341 distinct states found (2,636,090 ds/min), 138,800,606 states left on queue.
+Progress(39) at 2024-11-06 19:32:56: 7,103,701,427 states generated (29,354,305 s/min), 650,776,754 distinct states found (2,490,413 ds/min), 139,200,935 states left on queue.
+Progress(39) at 2024-11-06 19:33:56: 7,133,125,574 states generated (29,424,147 s/min), 653,222,778 distinct states found (2,446,024 ds/min), 139,553,972 states left on queue.
+Progress(39) at 2024-11-06 19:34:56: 7,162,393,954 states generated (29,268,380 s/min), 655,812,815 distinct states found (2,590,037 ds/min), 140,051,736 states left on queue.
+Progress(39) at 2024-11-06 19:35:56: 7,191,614,309 states generated (29,220,355 s/min), 658,388,779 distinct states found (2,575,964 ds/min), 140,550,430 states left on queue.
+Progress(39) at 2024-11-06 19:36:56: 7,220,841,977 states generated (29,227,668 s/min), 660,885,901 distinct states found (2,497,122 ds/min), 140,973,038 states left on queue.
+Progress(39) at 2024-11-06 19:37:56: 7,250,020,241 states generated (29,178,264 s/min), 663,335,701 distinct states found (2,449,800 ds/min), 141,327,800 states left on queue.
+Progress(39) at 2024-11-06 19:38:56: 7,279,545,923 states generated (29,525,682 s/min), 665,706,252 distinct states found (2,370,551 ds/min), 141,666,628 states left on queue.
+Progress(39) at 2024-11-06 19:39:56: 7,308,806,585 states generated (29,260,662 s/min), 668,059,763 distinct states found (2,353,511 ds/min), 141,985,139 states left on queue.
+Progress(39) at 2024-11-06 19:40:56: 7,338,028,888 states generated (29,222,303 s/min), 670,241,848 distinct states found (2,182,085 ds/min), 142,169,842 states left on queue.
+Progress(39) at 2024-11-06 19:41:56: 7,367,241,753 states generated (29,212,865 s/min), 672,613,255 distinct states found (2,371,407 ds/min), 142,507,724 states left on queue.
+Progress(39) at 2024-11-06 19:42:56: 7,396,269,434 states generated (29,027,681 s/min), 675,112,517 distinct states found (2,499,262 ds/min), 142,941,967 states left on queue.
+Progress(39) at 2024-11-06 19:43:56: 7,425,237,701 states generated (28,968,267 s/min), 677,646,850 distinct states found (2,534,333 ds/min), 143,388,301 states left on queue.
+Progress(39) at 2024-11-06 19:44:56: 7,453,929,312 states generated (28,691,611 s/min), 680,183,486 distinct states found (2,536,636 ds/min), 143,823,998 states left on queue.
+Progress(39) at 2024-11-06 19:45:56: 7,482,605,282 states generated (28,675,970 s/min), 682,751,269 distinct states found (2,567,783 ds/min), 144,211,694 states left on queue.
+Progress(39) at 2024-11-06 19:46:56: 7,511,402,194 states generated (28,796,912 s/min), 685,177,338 distinct states found (2,426,069 ds/min), 144,502,576 states left on queue.
+Progress(39) at 2024-11-06 19:47:56: 7,540,667,315 states generated (29,265,121 s/min), 687,470,422 distinct states found (2,293,084 ds/min), 144,717,485 states left on queue.
+Progress(39) at 2024-11-06 19:48:56: 7,570,065,371 states generated (29,398,056 s/min), 689,724,172 distinct states found (2,253,750 ds/min), 144,895,541 states left on queue.
+Progress(39) at 2024-11-06 19:49:56: 7,599,596,791 states generated (29,531,420 s/min), 692,064,101 distinct states found (2,339,929 ds/min), 145,171,911 states left on queue.
+Progress(39) at 2024-11-06 19:50:56: 7,629,011,363 states generated (29,414,572 s/min), 694,540,161 distinct states found (2,476,060 ds/min), 145,540,423 states left on queue.
+Progress(39) at 2024-11-06 19:51:56: 7,658,453,965 states generated (29,442,602 s/min), 696,912,122 distinct states found (2,371,961 ds/min), 145,809,567 states left on queue.
+Progress(39) at 2024-11-06 19:52:56: 7,687,913,137 states generated (29,459,172 s/min), 699,240,630 distinct states found (2,328,508 ds/min), 146,098,273 states left on queue.
+Progress(39) at 2024-11-06 19:53:56: 7,717,161,254 states generated (29,248,117 s/min), 701,789,915 distinct states found (2,549,285 ds/min), 146,502,121 states left on queue.
+Progress(39) at 2024-11-06 19:54:56: 7,746,587,948 states generated (29,426,694 s/min), 704,037,014 distinct states found (2,247,099 ds/min), 146,684,369 states left on queue.
+Progress(39) at 2024-11-06 19:55:56: 7,775,767,241 states generated (29,179,293 s/min), 706,750,225 distinct states found (2,713,211 ds/min), 147,270,858 states left on queue.
+Progress(39) at 2024-11-06 19:56:56: 7,805,143,313 states generated (29,376,072 s/min), 709,214,940 distinct states found (2,464,715 ds/min), 147,627,166 states left on queue.
+Progress(39) at 2024-11-06 19:57:56: 7,834,403,478 states generated (29,260,165 s/min), 711,759,633 distinct states found (2,544,693 ds/min), 147,996,842 states left on queue.
+Progress(40) at 2024-11-06 19:58:56: 7,863,785,909 states generated (29,382,431 s/min), 713,915,903 distinct states found (2,156,270 ds/min), 148,107,480 states left on queue.
+Progress(40) at 2024-11-06 19:59:56: 7,892,661,923 states generated (28,876,014 s/min), 716,529,052 distinct states found (2,613,149 ds/min), 148,615,346 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-06 20:00:57)
+Progress(40) at 2024-11-06 20:00:57: 7,922,354,868 states generated (29,692,945 s/min), 718,724,840 distinct states found (2,195,788 ds/min), 148,760,464 states left on queue.
+Progress(40) at 2024-11-06 20:01:57: 7,951,821,345 states generated (29,466,477 s/min), 721,199,790 distinct states found (2,474,950 ds/min), 149,133,458 states left on queue.
+Progress(40) at 2024-11-06 20:02:57: 7,981,212,562 states generated (29,391,217 s/min), 723,637,084 distinct states found (2,437,294 ds/min), 149,453,388 states left on queue.
+Progress(40) at 2024-11-06 20:03:57: 8,010,639,344 states generated (29,426,782 s/min), 725,776,597 distinct states found (2,139,513 ds/min), 149,580,205 states left on queue.
+Progress(40) at 2024-11-06 20:04:57: 8,039,970,078 states generated (29,330,734 s/min), 728,145,896 distinct states found (2,369,299 ds/min), 149,873,787 states left on queue.
+Progress(40) at 2024-11-06 20:05:57: 8,069,221,501 states generated (29,251,423 s/min), 730,835,980 distinct states found (2,690,084 ds/min), 150,431,663 states left on queue.
+Progress(40) at 2024-11-06 20:06:57: 8,098,568,645 states generated (29,347,144 s/min), 733,266,238 distinct states found (2,430,258 ds/min), 150,772,190 states left on queue.
+Progress(40) at 2024-11-06 20:07:57: 8,127,646,970 states generated (29,078,325 s/min), 736,001,441 distinct states found (2,735,203 ds/min), 151,368,297 states left on queue.
+Progress(40) at 2024-11-06 20:08:57: 8,156,755,007 states generated (29,108,037 s/min), 738,759,675 distinct states found (2,758,234 ds/min), 151,912,929 states left on queue.
+Progress(40) at 2024-11-06 20:09:57: 8,186,234,810 states generated (29,479,803 s/min), 741,336,146 distinct states found (2,576,471 ds/min), 152,376,828 states left on queue.
+Progress(40) at 2024-11-06 20:10:57: 8,215,641,994 states generated (29,407,184 s/min), 743,647,353 distinct states found (2,311,207 ds/min), 152,617,899 states left on queue.
+Progress(40) at 2024-11-06 20:11:57: 8,244,746,445 states generated (29,104,451 s/min), 746,080,007 distinct states found (2,432,654 ds/min), 152,939,104 states left on queue.
+Progress(40) at 2024-11-06 20:12:57: 8,273,514,095 states generated (28,767,650 s/min), 748,726,701 distinct states found (2,646,694 ds/min), 153,445,645 states left on queue.
+Progress(40) at 2024-11-06 20:13:57: 8,302,647,011 states generated (29,132,916 s/min), 751,041,420 distinct states found (2,314,719 ds/min), 153,711,631 states left on queue.
+Progress(40) at 2024-11-06 20:14:57: 8,331,785,512 states generated (29,138,501 s/min), 753,262,324 distinct states found (2,220,904 ds/min), 153,861,206 states left on queue.
+Progress(40) at 2024-11-06 20:15:57: 8,361,058,813 states generated (29,273,301 s/min), 755,881,803 distinct states found (2,619,479 ds/min), 154,293,451 states left on queue.
+Progress(40) at 2024-11-06 20:16:57: 8,390,323,842 states generated (29,265,029 s/min), 757,769,813 distinct states found (1,888,010 ds/min), 154,184,183 states left on queue.
+Progress(40) at 2024-11-06 20:17:57: 8,419,579,524 states generated (29,255,682 s/min), 760,009,795 distinct states found (2,239,982 ds/min), 154,382,656 states left on queue.
+Progress(40) at 2024-11-06 20:18:57: 8,448,394,343 states generated (28,814,819 s/min), 762,597,225 distinct states found (2,587,430 ds/min), 154,795,314 states left on queue.
+Progress(40) at 2024-11-06 20:19:57: 8,477,530,142 states generated (29,135,799 s/min), 764,903,184 distinct states found (2,305,959 ds/min), 154,997,361 states left on queue.
+Progress(40) at 2024-11-06 20:20:57: 8,507,035,930 states generated (29,505,788 s/min), 766,887,142 distinct states found (1,983,958 ds/min), 155,034,831 states left on queue.
+Progress(40) at 2024-11-06 20:21:57: 8,536,505,703 states generated (29,469,773 s/min), 769,048,483 distinct states found (2,161,341 ds/min), 155,183,742 states left on queue.
+Progress(40) at 2024-11-06 20:22:57: 8,565,867,584 states generated (29,361,881 s/min), 771,258,076 distinct states found (2,209,593 ds/min), 155,385,262 states left on queue.
+Progress(40) at 2024-11-06 20:23:57: 8,595,185,764 states generated (29,318,180 s/min), 773,454,985 distinct states found (2,196,909 ds/min), 155,614,111 states left on queue.
+Progress(40) at 2024-11-06 20:24:57: 8,624,496,269 states generated (29,310,505 s/min), 775,619,630 distinct states found (2,164,645 ds/min), 155,798,174 states left on queue.
+Progress(40) at 2024-11-06 20:25:57: 8,654,080,073 states generated (29,583,804 s/min), 777,637,410 distinct states found (2,017,780 ds/min), 155,782,045 states left on queue.
+Progress(40) at 2024-11-06 20:26:57: 8,683,722,009 states generated (29,641,936 s/min), 779,940,399 distinct states found (2,302,989 ds/min), 156,073,330 states left on queue.
+Progress(40) at 2024-11-06 20:27:57: 8,713,410,725 states generated (29,688,716 s/min), 782,406,987 distinct states found (2,466,588 ds/min), 156,445,902 states left on queue.
+Progress(40) at 2024-11-06 20:28:57: 8,743,158,002 states generated (29,747,277 s/min), 784,542,609 distinct states found (2,135,622 ds/min), 156,539,841 states left on queue.
+Progress(40) at 2024-11-06 20:29:57: 8,772,688,809 states generated (29,530,807 s/min), 786,583,608 distinct states found (2,040,999 ds/min), 156,630,041 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-06 20:30:57)
+Progress(40) at 2024-11-06 20:30:57: 8,802,299,219 states generated (29,610,410 s/min), 788,709,007 distinct states found (2,125,399 ds/min), 156,780,966 states left on queue.
+Progress(40) at 2024-11-06 20:31:57: 8,831,545,663 states generated (29,246,444 s/min), 790,874,634 distinct states found (2,165,627 ds/min), 156,943,688 states left on queue.
+Progress(40) at 2024-11-06 20:32:57: 8,860,742,526 states generated (29,196,863 s/min), 793,218,612 distinct states found (2,343,978 ds/min), 157,247,738 states left on queue.
+Progress(40) at 2024-11-06 20:33:57: 8,890,145,689 states generated (29,403,163 s/min), 795,347,746 distinct states found (2,129,134 ds/min), 157,376,715 states left on queue.
+Progress(40) at 2024-11-06 20:34:57: 8,919,277,440 states generated (29,131,751 s/min), 797,557,991 distinct states found (2,210,245 ds/min), 157,566,508 states left on queue.
+Progress(40) at 2024-11-06 20:35:57: 8,948,368,355 states generated (29,090,915 s/min), 799,870,441 distinct states found (2,312,450 ds/min), 157,825,337 states left on queue.
+Progress(40) at 2024-11-06 20:36:57: 8,977,811,769 states generated (29,443,414 s/min), 801,992,418 distinct states found (2,121,977 ds/min), 158,015,008 states left on queue.
+Progress(40) at 2024-11-06 20:37:57: 9,007,285,675 states generated (29,473,906 s/min), 804,250,024 distinct states found (2,257,606 ds/min), 158,295,507 states left on queue.
+Progress(40) at 2024-11-06 20:38:57: 9,036,450,953 states generated (29,165,278 s/min), 806,795,860 distinct states found (2,545,836 ds/min), 158,767,907 states left on queue.
+Progress(40) at 2024-11-06 20:39:57: 9,065,704,268 states generated (29,253,315 s/min), 809,198,438 distinct states found (2,402,578 ds/min), 159,105,121 states left on queue.
+Progress(40) at 2024-11-06 20:40:57: 9,095,165,427 states generated (29,461,159 s/min), 811,512,584 distinct states found (2,314,146 ds/min), 159,345,117 states left on queue.
+Progress(40) at 2024-11-06 20:41:57: 9,124,541,297 states generated (29,375,870 s/min), 813,905,920 distinct states found (2,393,336 ds/min), 159,672,325 states left on queue.
+Progress(40) at 2024-11-06 20:42:57: 9,153,712,591 states generated (29,171,294 s/min), 816,392,570 distinct states found (2,486,650 ds/min), 160,082,547 states left on queue.
+Progress(40) at 2024-11-06 20:43:57: 9,182,920,866 states generated (29,208,275 s/min), 818,845,538 distinct states found (2,452,968 ds/min), 160,476,056 states left on queue.
+Progress(40) at 2024-11-06 20:44:57: 9,212,093,614 states generated (29,172,748 s/min), 821,212,595 distinct states found (2,367,057 ds/min), 160,787,698 states left on queue.
+Progress(40) at 2024-11-06 20:45:57: 9,241,177,362 states generated (29,083,748 s/min), 823,731,111 distinct states found (2,518,516 ds/min), 161,227,975 states left on queue.
+Progress(40) at 2024-11-06 20:46:57: 9,270,666,448 states generated (29,489,086 s/min), 825,877,262 distinct states found (2,146,151 ds/min), 161,339,209 states left on queue.
+Progress(40) at 2024-11-06 20:47:57: 9,299,985,513 states generated (29,319,065 s/min), 828,195,512 distinct states found (2,318,250 ds/min), 161,644,069 states left on queue.
+Progress(40) at 2024-11-06 20:48:57: 9,329,155,005 states generated (29,169,492 s/min), 830,386,518 distinct states found (2,191,006 ds/min), 161,807,802 states left on queue.
+Progress(40) at 2024-11-06 20:49:57: 9,358,433,771 states generated (29,278,766 s/min), 832,419,931 distinct states found (2,033,413 ds/min), 161,882,018 states left on queue.
+Progress(40) at 2024-11-06 20:50:57: 9,387,665,287 states generated (29,231,516 s/min), 834,751,267 distinct states found (2,331,336 ds/min), 162,183,217 states left on queue.
+Progress(40) at 2024-11-06 20:51:57: 9,416,697,647 states generated (29,032,360 s/min), 837,127,657 distinct states found (2,376,390 ds/min), 162,511,558 states left on queue.
+Progress(40) at 2024-11-06 20:52:57: 9,445,747,666 states generated (29,050,019 s/min), 839,556,372 distinct states found (2,428,715 ds/min), 162,873,418 states left on queue.
+Progress(40) at 2024-11-06 20:53:57: 9,474,599,613 states generated (28,851,947 s/min), 841,985,780 distinct states found (2,429,408 ds/min), 163,231,531 states left on queue.
+Progress(40) at 2024-11-06 20:54:57: 9,503,408,525 states generated (28,808,912 s/min), 844,368,680 distinct states found (2,382,900 ds/min), 163,533,407 states left on queue.
+Progress(40) at 2024-11-06 20:55:57: 9,532,128,492 states generated (28,719,967 s/min), 846,804,519 distinct states found (2,435,839 ds/min), 163,787,695 states left on queue.
+Progress(40) at 2024-11-06 20:56:57: 9,560,935,598 states generated (28,807,106 s/min), 849,075,143 distinct states found (2,270,624 ds/min), 163,946,240 states left on queue.
+Progress(40) at 2024-11-06 20:57:57: 9,590,127,374 states generated (29,191,776 s/min), 851,260,378 distinct states found (2,185,235 ds/min), 164,077,372 states left on queue.
+Progress(40) at 2024-11-06 20:58:57: 9,619,514,341 states generated (29,386,967 s/min), 853,352,738 distinct states found (2,092,360 ds/min), 164,118,186 states left on queue.
+Progress(40) at 2024-11-06 20:59:57: 9,648,985,302 states generated (29,470,961 s/min), 855,543,408 distinct states found (2,190,670 ds/min), 164,279,076 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-06 21:00:58)
+Progress(40) at 2024-11-06 21:00:58: 9,678,677,722 states generated (29,692,420 s/min), 857,894,775 distinct states found (2,351,367 ds/min), 164,516,395 states left on queue.
+Progress(40) at 2024-11-06 21:01:58: 9,708,095,509 states generated (29,417,787 s/min), 860,383,155 distinct states found (2,488,380 ds/min), 164,898,153 states left on queue.
+Progress(40) at 2024-11-06 21:02:58: 9,737,488,378 states generated (29,392,869 s/min), 862,497,194 distinct states found (2,114,039 ds/min), 164,966,010 states left on queue.
+Progress(40) at 2024-11-06 21:03:58: 9,766,895,552 states generated (29,407,174 s/min), 864,819,525 distinct states found (2,322,331 ds/min), 165,232,701 states left on queue.
+Progress(40) at 2024-11-06 21:04:58: 9,796,208,300 states generated (29,312,748 s/min), 867,276,841 distinct states found (2,457,316 ds/min), 165,568,613 states left on queue.
+Progress(40) at 2024-11-06 21:05:58: 9,825,603,726 states generated (29,395,426 s/min), 869,434,526 distinct states found (2,157,685 ds/min), 165,685,610 states left on queue.
+Progress(40) at 2024-11-06 21:06:58: 9,854,789,772 states generated (29,186,046 s/min), 871,934,034 distinct states found (2,499,508 ds/min), 166,084,000 states left on queue.
+Progress(40) at 2024-11-06 21:07:58: 9,884,028,390 states generated (29,238,618 s/min), 874,443,659 distinct states found (2,509,625 ds/min), 166,483,652 states left on queue.
+Progress(40) at 2024-11-06 21:08:58: 9,913,377,669 states generated (29,349,279 s/min), 876,803,913 distinct states found (2,360,254 ds/min), 166,740,702 states left on queue.
+Progress(40) at 2024-11-06 21:09:58: 9,942,721,749 states generated (29,344,080 s/min), 879,187,270 distinct states found (2,383,357 ds/min), 166,953,562 states left on queue.
+Progress(41) at 2024-11-06 21:10:58: 9,972,078,704 states generated (29,356,955 s/min), 881,233,361 distinct states found (2,046,091 ds/min), 166,999,841 states left on queue.
+Progress(41) at 2024-11-06 21:11:58: 10,000,914,792 states generated (28,836,088 s/min), 883,811,441 distinct states found (2,578,080 ds/min), 167,466,583 states left on queue.
+Progress(41) at 2024-11-06 21:12:58: 10,030,210,434 states generated (29,295,642 s/min), 885,899,950 distinct states found (2,088,509 ds/min), 167,531,826 states left on queue.
+Progress(41) at 2024-11-06 21:13:58: 10,059,587,070 states generated (29,376,636 s/min), 888,188,669 distinct states found (2,288,719 ds/min), 167,753,242 states left on queue.
+Progress(41) at 2024-11-06 21:14:58: 10,089,078,901 states generated (29,491,831 s/min), 890,649,997 distinct states found (2,461,328 ds/min), 168,098,890 states left on queue.
+Progress(41) at 2024-11-06 21:15:58: 10,118,348,352 states generated (29,269,451 s/min), 892,695,892 distinct states found (2,045,895 ds/min), 168,141,532 states left on queue.
+Progress(41) at 2024-11-06 21:16:58: 10,147,644,676 states generated (29,296,324 s/min), 894,823,997 distinct states found (2,128,105 ds/min), 168,231,032 states left on queue.
+Progress(41) at 2024-11-06 21:17:58: 10,176,967,773 states generated (29,323,097 s/min), 897,225,523 distinct states found (2,401,526 ds/min), 168,555,740 states left on queue.
+Progress(41) at 2024-11-06 21:18:58: 10,206,275,174 states generated (29,307,401 s/min), 899,814,626 distinct states found (2,589,103 ds/min), 169,020,971 states left on queue.
+Progress(41) at 2024-11-06 21:19:58: 10,235,593,993 states generated (29,318,819 s/min), 902,141,356 distinct states found (2,326,730 ds/min), 169,267,251 states left on queue.
+Progress(41) at 2024-11-06 21:20:58: 10,264,799,049 states generated (29,205,056 s/min), 904,746,333 distinct states found (2,604,977 ds/min), 169,758,459 states left on queue.
+Progress(41) at 2024-11-06 21:21:58: 10,293,910,586 states generated (29,111,537 s/min), 907,433,182 distinct states found (2,686,849 ds/min), 170,277,176 states left on queue.
+Progress(41) at 2024-11-06 21:22:58: 10,323,190,750 states generated (29,280,164 s/min), 910,052,108 distinct states found (2,618,926 ds/min), 170,695,212 states left on queue.
+Progress(41) at 2024-11-06 21:23:58: 10,352,580,182 states generated (29,389,432 s/min), 912,516,064 distinct states found (2,463,956 ds/min), 171,083,771 states left on queue.
+Progress(41) at 2024-11-06 21:24:58: 10,381,951,479 states generated (29,371,297 s/min), 914,781,443 distinct states found (2,265,379 ds/min), 171,281,545 states left on queue.
+Progress(41) at 2024-11-06 21:25:58: 10,411,026,945 states generated (29,075,466 s/min), 917,078,052 distinct states found (2,296,609 ds/min), 171,498,613 states left on queue.
+Progress(41) at 2024-11-06 21:26:58: 10,439,904,441 states generated (28,877,496 s/min), 919,547,808 distinct states found (2,469,756 ds/min), 171,860,589 states left on queue.
+Progress(41) at 2024-11-06 21:27:58: 10,469,008,600 states generated (29,104,159 s/min), 921,912,547 distinct states found (2,364,739 ds/min), 172,121,551 states left on queue.
+Progress(41) at 2024-11-06 21:28:58: 10,497,834,986 states generated (28,826,386 s/min), 924,235,840 distinct states found (2,323,293 ds/min), 172,353,661 states left on queue.
+Progress(41) at 2024-11-06 21:29:58: 10,527,064,696 states generated (29,229,710 s/min), 926,456,744 distinct states found (2,220,904 ds/min), 172,508,439 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-06 21:30:59)
+Progress(41) at 2024-11-06 21:30:59: 10,556,579,142 states generated (29,514,446 s/min), 928,988,872 distinct states found (2,532,128 ds/min), 172,833,183 states left on queue.
+Progress(41) at 2024-11-06 21:31:59: 10,585,719,909 states generated (29,140,767 s/min), 930,745,149 distinct states found (1,756,277 ds/min), 172,622,496 states left on queue.
+Progress(41) at 2024-11-06 21:32:59: 10,614,881,115 states generated (29,161,206 s/min), 932,948,083 distinct states found (2,202,934 ds/min), 172,792,818 states left on queue.
+Progress(41) at 2024-11-06 21:33:59: 10,643,693,909 states generated (28,812,794 s/min), 935,441,862 distinct states found (2,493,779 ds/min), 173,119,721 states left on queue.
+Progress(41) at 2024-11-06 21:34:59: 10,672,671,166 states generated (28,977,257 s/min), 937,653,961 distinct states found (2,212,099 ds/min), 173,216,843 states left on queue.
+Progress(41) at 2024-11-06 21:35:59: 10,702,072,440 states generated (29,401,274 s/min), 939,638,920 distinct states found (1,984,959 ds/min), 173,254,076 states left on queue.
+Progress(41) at 2024-11-06 21:36:59: 10,731,415,292 states generated (29,342,852 s/min), 941,583,653 distinct states found (1,944,733 ds/min), 173,229,968 states left on queue.
+Progress(41) at 2024-11-06 21:37:59: 10,760,802,656 states generated (29,387,364 s/min), 943,770,610 distinct states found (2,186,957 ds/min), 173,412,799 states left on queue.
+Progress(41) at 2024-11-06 21:38:59: 10,789,961,996 states generated (29,159,340 s/min), 945,790,519 distinct states found (2,019,909 ds/min), 173,482,204 states left on queue.
+Progress(41) at 2024-11-06 21:39:59: 10,819,303,972 states generated (29,341,976 s/min), 947,902,156 distinct states found (2,111,637 ds/min), 173,640,941 states left on queue.
+Progress(41) at 2024-11-06 21:40:59: 10,848,636,471 states generated (29,332,499 s/min), 949,908,145 distinct states found (2,005,989 ds/min), 173,684,074 states left on queue.
+Progress(41) at 2024-11-06 21:41:59: 10,878,207,345 states generated (29,570,874 s/min), 951,870,784 distinct states found (1,962,639 ds/min), 173,648,255 states left on queue.
+Progress(41) at 2024-11-06 21:42:59: 10,907,777,091 states generated (29,569,746 s/min), 954,123,321 distinct states found (2,252,537 ds/min), 173,881,583 states left on queue.
+Progress(41) at 2024-11-06 21:43:59: 10,937,383,465 states generated (29,606,374 s/min), 956,486,701 distinct states found (2,363,380 ds/min), 174,173,694 states left on queue.
+Progress(41) at 2024-11-06 21:44:59: 10,967,070,713 states generated (29,687,248 s/min), 958,539,717 distinct states found (2,053,016 ds/min), 174,194,592 states left on queue.
+Progress(41) at 2024-11-06 21:45:59: 10,996,524,132 states generated (29,453,419 s/min), 960,439,766 distinct states found (1,900,049 ds/min), 174,165,777 states left on queue.
+Progress(41) at 2024-11-06 21:46:59: 11,025,919,452 states generated (29,395,320 s/min), 962,518,661 distinct states found (2,078,895 ds/min), 174,284,642 states left on queue.
+Progress(41) at 2024-11-06 21:47:59: 11,055,087,136 states generated (29,167,684 s/min), 964,440,130 distinct states found (1,921,469 ds/min), 174,253,951 states left on queue.
+Progress(41) at 2024-11-06 21:48:59: 11,084,346,164 states generated (29,259,028 s/min), 966,652,841 distinct states found (2,212,711 ds/min), 174,452,762 states left on queue.
+Progress(41) at 2024-11-06 21:49:59: 11,113,503,996 states generated (29,157,832 s/min), 968,786,590 distinct states found (2,133,749 ds/min), 174,578,147 states left on queue.
+Progress(41) at 2024-11-06 21:50:59: 11,142,862,327 states generated (29,358,331 s/min), 970,780,918 distinct states found (1,994,328 ds/min), 174,585,050 states left on queue.
+Progress(41) at 2024-11-06 21:51:59: 11,171,907,560 states generated (29,045,233 s/min), 972,924,432 distinct states found (2,143,514 ds/min), 174,718,189 states left on queue.
+Progress(41) at 2024-11-06 21:52:59: 11,201,055,602 states generated (29,148,042 s/min), 975,106,131 distinct states found (2,181,699 ds/min), 174,874,035 states left on queue.
+Progress(41) at 2024-11-06 21:53:59: 11,230,576,268 states generated (29,520,666 s/min), 977,176,048 distinct states found (2,069,917 ds/min), 175,042,666 states left on queue.
+Progress(41) at 2024-11-06 21:54:59: 11,259,928,257 states generated (29,351,989 s/min), 979,337,351 distinct states found (2,161,303 ds/min), 175,248,665 states left on queue.
+Progress(41) at 2024-11-06 21:55:59: 11,289,190,366 states generated (29,262,109 s/min), 981,837,130 distinct states found (2,499,779 ds/min), 175,680,736 states left on queue.
+Progress(41) at 2024-11-06 21:56:59: 11,318,399,828 states generated (29,209,462 s/min), 984,112,195 distinct states found (2,275,065 ds/min), 175,913,580 states left on queue.
+Progress(41) at 2024-11-06 21:57:59: 11,347,862,845 states generated (29,463,017 s/min), 986,368,069 distinct states found (2,255,874 ds/min), 176,126,523 states left on queue.
+Progress(41) at 2024-11-06 21:58:59: 11,377,318,937 states generated (29,456,092 s/min), 988,548,686 distinct states found (2,180,617 ds/min), 176,253,552 states left on queue.
+Progress(41) at 2024-11-06 21:59:59: 11,406,551,913 states generated (29,232,976 s/min), 990,875,071 distinct states found (2,326,385 ds/min), 176,528,465 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-06 22:00:59)
+Progress(41) at 2024-11-06 22:00:59: 11,436,006,666 states generated (29,454,753 s/min), 993,234,999 distinct states found (2,359,928 ds/min), 176,816,755 states left on queue.
+Progress(41) at 2024-11-06 22:01:59: 11,465,207,151 states generated (29,200,485 s/min), 995,557,179 distinct states found (2,322,180 ds/min), 177,094,397 states left on queue.
+Progress(41) at 2024-11-06 22:02:59: 11,494,298,575 states generated (29,091,424 s/min), 997,927,812 distinct states found (2,370,633 ds/min), 177,411,890 states left on queue.
+Progress(41) at 2024-11-06 22:03:59: 11,523,576,632 states generated (29,278,057 s/min), 1,000,196,030 distinct states found (2,268,218 ds/min), 177,640,656 states left on queue.
+Progress(41) at 2024-11-06 22:04:59: 11,552,734,483 states generated (29,157,851 s/min), 1,002,452,277 distinct states found (2,256,247 ds/min), 177,827,247 states left on queue.
+Progress(41) at 2024-11-06 22:05:59: 11,582,200,298 states generated (29,465,815 s/min), 1,004,593,818 distinct states found (2,141,541 ds/min), 177,983,707 states left on queue.
+Progress(41) at 2024-11-06 22:06:59: 11,611,484,149 states generated (29,283,851 s/min), 1,006,774,383 distinct states found (2,180,565 ds/min), 178,161,577 states left on queue.
+Progress(41) at 2024-11-06 22:07:59: 11,640,449,232 states generated (28,965,083 s/min), 1,008,870,356 distinct states found (2,095,973 ds/min), 178,245,657 states left on queue.
+Progress(41) at 2024-11-06 22:08:59: 11,669,695,402 states generated (29,246,170 s/min), 1,010,743,262 distinct states found (1,872,906 ds/min), 178,199,630 states left on queue.
+Progress(41) at 2024-11-06 22:09:59: 11,698,855,657 states generated (29,160,255 s/min), 1,012,993,163 distinct states found (2,249,901 ds/min), 178,433,806 states left on queue.
+Progress(41) at 2024-11-06 22:10:59: 11,727,873,536 states generated (29,017,879 s/min), 1,015,222,628 distinct states found (2,229,465 ds/min), 178,645,315 states left on queue.
+Progress(41) at 2024-11-06 22:11:59: 11,756,910,696 states generated (29,037,160 s/min), 1,017,493,811 distinct states found (2,271,183 ds/min), 178,885,854 states left on queue.
+Progress(41) at 2024-11-06 22:12:59: 11,785,841,957 states generated (28,931,261 s/min), 1,019,798,730 distinct states found (2,304,919 ds/min), 179,138,831 states left on queue.
+Progress(41) at 2024-11-06 22:13:59: 11,814,627,351 states generated (28,785,394 s/min), 1,022,115,935 distinct states found (2,317,205 ds/min), 179,401,355 states left on queue.
+Progress(41) at 2024-11-06 22:14:59: 11,843,482,288 states generated (28,854,937 s/min), 1,024,372,991 distinct states found (2,257,056 ds/min), 179,570,167 states left on queue.
+Progress(41) at 2024-11-06 22:15:59: 11,872,232,503 states generated (28,750,215 s/min), 1,026,655,919 distinct states found (2,282,928 ds/min), 179,704,400 states left on queue.
+Progress(41) at 2024-11-06 22:16:59: 11,901,011,327 states generated (28,778,824 s/min), 1,028,780,151 distinct states found (2,124,232 ds/min), 179,744,822 states left on queue.
+Progress(41) at 2024-11-06 22:17:59: 11,930,078,061 states generated (29,066,734 s/min), 1,030,863,673 distinct states found (2,083,522 ds/min), 179,790,662 states left on queue.
+Progress(41) at 2024-11-06 22:18:59: 11,959,463,901 states generated (29,385,840 s/min), 1,032,840,344 distinct states found (1,976,671 ds/min), 179,738,442 states left on queue.
+Progress(41) at 2024-11-06 22:19:59: 11,988,811,132 states generated (29,347,231 s/min), 1,034,897,049 distinct states found (2,056,705 ds/min), 179,788,782 states left on queue.
+Progress(41) at 2024-11-06 22:20:59: 12,018,335,911 states generated (29,524,779 s/min), 1,037,158,579 distinct states found (2,261,530 ds/min), 179,978,226 states left on queue.
+Progress(41) at 2024-11-06 22:21:59: 12,047,755,593 states generated (29,419,682 s/min), 1,039,437,623 distinct states found (2,279,044 ds/min), 180,177,371 states left on queue.
+Progress(41) at 2024-11-06 22:22:59: 12,077,111,001 states generated (29,355,408 s/min), 1,041,672,961 distinct states found (2,235,338 ds/min), 180,336,777 states left on queue.
+Progress(41) at 2024-11-06 22:23:59: 12,106,556,177 states generated (29,445,176 s/min), 1,043,675,880 distinct states found (2,002,919 ds/min), 180,345,759 states left on queue.
+Progress(41) at 2024-11-06 22:24:59: 12,135,797,446 states generated (29,241,269 s/min), 1,045,966,606 distinct states found (2,290,726 ds/min), 180,552,887 states left on queue.
+Progress(41) at 2024-11-06 22:25:59: 12,165,143,756 states generated (29,346,310 s/min), 1,048,373,643 distinct states found (2,407,037 ds/min), 180,860,142 states left on queue.
+Progress(41) at 2024-11-06 22:26:59: 12,194,478,236 states generated (29,334,480 s/min), 1,050,403,560 distinct states found (2,029,917 ds/min), 180,873,811 states left on queue.
+Progress(41) at 2024-11-06 22:27:59: 12,223,653,080 states generated (29,174,844 s/min), 1,052,798,502 distinct states found (2,394,942 ds/min), 181,184,025 states left on queue.
+Progress(41) at 2024-11-06 22:28:59: 12,252,926,784 states generated (29,273,704 s/min), 1,055,243,990 distinct states found (2,445,488 ds/min), 181,542,525 states left on queue.
+Progress(41) at 2024-11-06 22:29:59: 12,282,176,071 states generated (29,249,287 s/min), 1,057,488,489 distinct states found (2,244,499 ds/min), 181,704,266 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-06 22:31:00)
+Progress(41) at 2024-11-06 22:31:00: 12,311,654,529 states generated (29,478,458 s/min), 1,059,789,296 distinct states found (2,300,807 ds/min), 181,875,392 states left on queue.
+Progress(41) at 2024-11-06 22:32:00: 12,340,837,903 states generated (29,183,374 s/min), 1,061,857,294 distinct states found (2,067,998 ds/min), 181,860,690 states left on queue.
+Progress(41) at 2024-11-06 22:33:00: 12,369,978,352 states generated (29,140,449 s/min), 1,063,943,173 distinct states found (2,085,879 ds/min), 181,951,091 states left on queue.
+Progress(41) at 2024-11-06 22:34:00: 12,398,820,660 states generated (28,842,308 s/min), 1,066,384,327 distinct states found (2,441,154 ds/min), 182,284,376 states left on queue.
+Progress(41) at 2024-11-06 22:35:00: 12,427,966,245 states generated (29,145,585 s/min), 1,068,376,116 distinct states found (1,991,789 ds/min), 182,275,982 states left on queue.
+Progress(41) at 2024-11-06 22:36:00: 12,457,300,671 states generated (29,334,426 s/min), 1,070,596,949 distinct states found (2,220,833 ds/min), 182,442,278 states left on queue.
+Progress(41) at 2024-11-06 22:37:00: 12,486,769,483 states generated (29,468,812 s/min), 1,072,968,640 distinct states found (2,371,691 ds/min), 182,718,485 states left on queue.
+Progress(41) at 2024-11-06 22:38:00: 12,516,031,360 states generated (29,261,877 s/min), 1,075,001,378 distinct states found (2,032,738 ds/min), 182,729,966 states left on queue.
+Progress(41) at 2024-11-06 22:39:00: 12,545,265,331 states generated (29,233,971 s/min), 1,076,880,794 distinct states found (1,879,416 ds/min), 182,634,798 states left on queue.
+Progress(41) at 2024-11-06 22:40:00: 12,574,495,559 states generated (29,230,228 s/min), 1,079,123,856 distinct states found (2,243,062 ds/min), 182,812,322 states left on queue.
+Progress(41) at 2024-11-06 22:41:00: 12,603,757,387 states generated (29,261,828 s/min), 1,081,610,769 distinct states found (2,486,913 ds/min), 183,219,247 states left on queue.
+Progress(41) at 2024-11-06 22:42:00: 12,632,909,026 states generated (29,151,639 s/min), 1,083,967,637 distinct states found (2,356,868 ds/min), 183,478,879 states left on queue.
+Progress(41) at 2024-11-06 22:43:00: 12,662,254,981 states generated (29,345,955 s/min), 1,086,272,935 distinct states found (2,305,298 ds/min), 183,726,701 states left on queue.
+Progress(41) at 2024-11-06 22:44:00: 12,691,400,218 states generated (29,145,237 s/min), 1,088,778,928 distinct states found (2,505,993 ds/min), 184,128,274 states left on queue.
+Progress(41) at 2024-11-06 22:45:00: 12,720,528,098 states generated (29,127,880 s/min), 1,091,335,929 distinct states found (2,557,001 ds/min), 184,556,078 states left on queue.
+Progress(41) at 2024-11-06 22:46:00: 12,749,701,886 states generated (29,173,788 s/min), 1,093,889,510 distinct states found (2,553,581 ds/min), 184,916,391 states left on queue.
+Progress(41) at 2024-11-06 22:47:00: 12,779,153,937 states generated (29,452,051 s/min), 1,096,185,973 distinct states found (2,296,463 ds/min), 185,115,877 states left on queue.
+Progress(41) at 2024-11-06 22:48:00: 12,808,440,971 states generated (29,287,034 s/min), 1,098,733,865 distinct states found (2,547,892 ds/min), 185,564,617 states left on queue.
+Progress(41) at 2024-11-06 22:49:00: 12,837,695,256 states generated (29,254,285 s/min), 1,100,705,460 distinct states found (1,971,595 ds/min), 185,532,558 states left on queue.
+Progress(41) at 2024-11-06 22:50:00: 12,866,801,129 states generated (29,105,873 s/min), 1,103,074,603 distinct states found (2,369,143 ds/min), 185,770,427 states left on queue.
+Progress(41) at 2024-11-06 22:51:00: 12,895,682,870 states generated (28,881,741 s/min), 1,105,437,747 distinct states found (2,363,144 ds/min), 186,049,274 states left on queue.
+Progress(41) at 2024-11-06 22:52:00: 12,924,655,990 states generated (28,973,120 s/min), 1,107,853,554 distinct states found (2,415,807 ds/min), 186,325,129 states left on queue.
+Progress(41) at 2024-11-06 22:53:00: 12,953,616,826 states generated (28,960,836 s/min), 1,110,097,321 distinct states found (2,243,767 ds/min), 186,509,276 states left on queue.
+Progress(41) at 2024-11-06 22:54:00: 12,982,711,068 states generated (29,094,242 s/min), 1,112,146,097 distinct states found (2,048,776 ds/min), 186,507,356 states left on queue.
+Progress(41) at 2024-11-06 22:55:00: 13,011,962,667 states generated (29,251,599 s/min), 1,114,530,785 distinct states found (2,384,688 ds/min), 186,758,016 states left on queue.
+Progress(41) at 2024-11-06 22:56:00: 13,041,163,382 states generated (29,200,715 s/min), 1,116,566,038 distinct states found (2,035,253 ds/min), 186,702,453 states left on queue.
+Progress(41) at 2024-11-06 22:57:00: 13,070,416,604 states generated (29,253,222 s/min), 1,118,433,735 distinct states found (1,867,697 ds/min), 186,595,926 states left on queue.
+Progress(41) at 2024-11-06 22:58:00: 13,099,393,765 states generated (28,977,161 s/min), 1,120,727,626 distinct states found (2,293,891 ds/min), 186,785,521 states left on queue.
+Progress(41) at 2024-11-06 22:59:00: 13,128,309,003 states generated (28,915,238 s/min), 1,123,075,278 distinct states found (2,347,652 ds/min), 186,977,496 states left on queue.
+Progress(42) at 2024-11-06 23:00:00: 13,157,492,254 states generated (29,183,251 s/min), 1,125,164,050 distinct states found (2,088,772 ds/min), 186,994,591 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-06 23:01:01)
+Progress(42) at 2024-11-06 23:01:01: 13,187,099,442 states generated (29,607,188 s/min), 1,126,955,828 distinct states found (1,791,778 ds/min), 186,860,457 states left on queue.
+Progress(42) at 2024-11-06 23:02:01: 13,216,408,249 states generated (29,308,807 s/min), 1,128,852,586 distinct states found (1,896,758 ds/min), 186,800,218 states left on queue.
+Progress(42) at 2024-11-06 23:03:01: 13,245,736,139 states generated (29,327,890 s/min), 1,130,960,381 distinct states found (2,107,795 ds/min), 186,911,118 states left on queue.
+Progress(42) at 2024-11-06 23:04:01: 13,274,893,464 states generated (29,157,325 s/min), 1,132,863,930 distinct states found (1,903,549 ds/min), 186,892,129 states left on queue.
+Progress(42) at 2024-11-06 23:05:01: 13,304,183,990 states generated (29,290,526 s/min), 1,134,876,306 distinct states found (2,012,376 ds/min), 186,965,153 states left on queue.
+Progress(42) at 2024-11-06 23:06:01: 13,333,457,770 states generated (29,273,780 s/min), 1,136,812,237 distinct states found (1,935,931 ds/min), 186,957,506 states left on queue.
+Progress(42) at 2024-11-06 23:07:01: 13,362,984,994 states generated (29,527,224 s/min), 1,138,649,876 distinct states found (1,837,639 ds/min), 186,823,887 states left on queue.
+Progress(42) at 2024-11-06 23:08:01: 13,392,550,733 states generated (29,565,739 s/min), 1,140,795,722 distinct states found (2,145,846 ds/min), 186,974,795 states left on queue.
+Progress(42) at 2024-11-06 23:09:01: 13,422,111,300 states generated (29,560,567 s/min), 1,143,038,611 distinct states found (2,242,889 ds/min), 187,179,197 states left on queue.
+Progress(42) at 2024-11-06 23:10:01: 13,451,822,496 states generated (29,711,196 s/min), 1,145,071,502 distinct states found (2,032,891 ds/min), 187,190,480 states left on queue.
+Progress(42) at 2024-11-06 23:11:01: 13,481,293,484 states generated (29,470,988 s/min), 1,146,905,806 distinct states found (1,834,304 ds/min), 187,079,661 states left on queue.
+Progress(42) at 2024-11-06 23:12:01: 13,510,659,679 states generated (29,366,195 s/min), 1,148,841,643 distinct states found (1,935,837 ds/min), 187,082,815 states left on queue.
+Progress(42) at 2024-11-06 23:13:01: 13,539,730,883 states generated (29,071,204 s/min), 1,150,715,436 distinct states found (1,873,793 ds/min), 187,013,975 states left on queue.
+Progress(42) at 2024-11-06 23:14:01: 13,568,973,308 states generated (29,242,425 s/min), 1,152,689,735 distinct states found (1,974,299 ds/min), 187,016,208 states left on queue.
+Progress(42) at 2024-11-06 23:15:01: 13,598,106,627 states generated (29,133,319 s/min), 1,154,829,869 distinct states found (2,140,134 ds/min), 187,147,884 states left on queue.
+Progress(42) at 2024-11-06 23:16:01: 13,627,319,459 states generated (29,212,832 s/min), 1,156,740,070 distinct states found (1,910,201 ds/min), 187,086,942 states left on queue.
+Progress(42) at 2024-11-06 23:17:01: 13,656,462,121 states generated (29,142,662 s/min), 1,158,698,307 distinct states found (1,958,237 ds/min), 187,072,201 states left on queue.
+Progress(42) at 2024-11-06 23:18:01: 13,685,545,941 states generated (29,083,820 s/min), 1,160,688,939 distinct states found (1,990,632 ds/min), 187,078,553 states left on queue.
+Progress(42) at 2024-11-06 23:19:01: 13,714,652,628 states generated (29,106,687 s/min), 1,162,748,633 distinct states found (2,059,694 ds/min), 187,157,229 states left on queue.
+Progress(42) at 2024-11-06 23:20:01: 13,744,105,986 states generated (29,453,358 s/min), 1,164,748,782 distinct states found (2,000,149 ds/min), 187,275,480 states left on queue.
+Progress(42) at 2024-11-06 23:21:01: 13,773,414,393 states generated (29,308,407 s/min), 1,166,804,740 distinct states found (2,055,958 ds/min), 187,393,312 states left on queue.
+Progress(42) at 2024-11-06 23:22:01: 13,802,600,069 states generated (29,185,676 s/min), 1,169,251,493 distinct states found (2,446,753 ds/min), 187,781,298 states left on queue.
+Progress(42) at 2024-11-06 23:23:01: 13,831,830,649 states generated (29,230,580 s/min), 1,171,412,176 distinct states found (2,160,683 ds/min), 187,932,991 states left on queue.
+Progress(42) at 2024-11-06 23:24:01: 13,861,152,221 states generated (29,321,572 s/min), 1,173,582,994 distinct states found (2,170,818 ds/min), 188,078,037 states left on queue.
+Progress(42) at 2024-11-06 23:25:01: 13,890,538,756 states generated (29,386,535 s/min), 1,175,642,901 distinct states found (2,059,907 ds/min), 188,116,794 states left on queue.
+Progress(42) at 2024-11-06 23:26:01: 13,919,812,820 states generated (29,274,064 s/min), 1,177,743,048 distinct states found (2,100,147 ds/min), 188,189,399 states left on queue.
+Progress(42) at 2024-11-06 23:27:01: 13,948,903,585 states generated (29,090,765 s/min), 1,179,980,470 distinct states found (2,237,422 ds/min), 188,388,309 states left on queue.
+Progress(42) at 2024-11-06 23:28:01: 13,978,138,385 states generated (29,234,800 s/min), 1,182,134,981 distinct states found (2,154,511 ds/min), 188,526,735 states left on queue.
+Progress(42) at 2024-11-06 23:29:01: 14,007,310,151 states generated (29,171,766 s/min), 1,184,360,360 distinct states found (2,225,379 ds/min), 188,718,575 states left on queue.
+Progress(42) at 2024-11-06 23:30:01: 14,036,411,110 states generated (29,100,959 s/min), 1,186,617,835 distinct states found (2,257,475 ds/min), 188,941,068 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-06 23:31:01)
+Progress(42) at 2024-11-06 23:31:01: 14,065,894,113 states generated (29,483,003 s/min), 1,188,743,048 distinct states found (2,125,213 ds/min), 189,035,636 states left on queue.
+Progress(42) at 2024-11-06 23:32:01: 14,094,909,096 states generated (29,014,983 s/min), 1,191,096,961 distinct states found (2,353,913 ds/min), 189,332,174 states left on queue.
+Progress(42) at 2024-11-06 23:33:01: 14,124,212,567 states generated (29,303,471 s/min), 1,193,012,997 distinct states found (1,916,036 ds/min), 189,266,016 states left on queue.
+Progress(42) at 2024-11-06 23:34:01: 14,153,428,768 states generated (29,216,201 s/min), 1,195,170,448 distinct states found (2,157,451 ds/min), 189,430,881 states left on queue.
+Progress(42) at 2024-11-06 23:35:01: 14,182,568,290 states generated (29,139,522 s/min), 1,197,127,126 distinct states found (1,956,678 ds/min), 189,423,769 states left on queue.
+Progress(42) at 2024-11-06 23:36:01: 14,211,602,024 states generated (29,033,734 s/min), 1,199,044,612 distinct states found (1,917,486 ds/min), 189,380,199 states left on queue.
+Progress(42) at 2024-11-06 23:37:01: 14,240,593,845 states generated (28,991,821 s/min), 1,200,900,028 distinct states found (1,855,416 ds/min), 189,324,925 states left on queue.
+Progress(42) at 2024-11-06 23:38:01: 14,269,687,808 states generated (29,093,963 s/min), 1,203,034,598 distinct states found (2,134,570 ds/min), 189,466,947 states left on queue.
+Progress(42) at 2024-11-06 23:39:01: 14,298,626,140 states generated (28,938,332 s/min), 1,205,190,806 distinct states found (2,156,208 ds/min), 189,608,794 states left on queue.
+Progress(42) at 2024-11-06 23:40:01: 14,327,587,116 states generated (28,960,976 s/min), 1,207,339,559 distinct states found (2,148,753 ds/min), 189,750,359 states left on queue.
+Progress(42) at 2024-11-06 23:41:01: 14,356,469,494 states generated (28,882,378 s/min), 1,209,518,146 distinct states found (2,178,587 ds/min), 189,892,036 states left on queue.
+Progress(42) at 2024-11-06 23:42:01: 14,385,314,696 states generated (28,845,202 s/min), 1,211,701,473 distinct states found (2,183,327 ds/min), 190,050,090 states left on queue.
+Progress(42) at 2024-11-06 23:43:01: 14,414,142,550 states generated (28,827,854 s/min), 1,213,859,919 distinct states found (2,158,446 ds/min), 190,161,804 states left on queue.
+Progress(42) at 2024-11-06 23:44:01: 14,442,945,644 states generated (28,803,094 s/min), 1,216,005,127 distinct states found (2,145,208 ds/min), 190,173,898 states left on queue.
+Progress(42) at 2024-11-06 23:45:01: 14,471,693,798 states generated (28,748,154 s/min), 1,218,030,292 distinct states found (2,025,165 ds/min), 190,127,864 states left on queue.
+Progress(42) at 2024-11-06 23:46:01: 14,500,599,025 states generated (28,905,227 s/min), 1,219,996,243 distinct states found (1,965,951 ds/min), 190,069,034 states left on queue.
+Progress(42) at 2024-11-06 23:47:01: 14,529,770,118 states generated (29,171,093 s/min), 1,221,890,284 distinct states found (1,894,041 ds/min), 189,948,701 states left on queue.
+Progress(42) at 2024-11-06 23:48:01: 14,559,044,399 states generated (29,274,281 s/min), 1,223,772,100 distinct states found (1,881,816 ds/min), 189,844,417 states left on queue.
+Progress(42) at 2024-11-06 23:49:01: 14,588,505,088 states generated (29,460,689 s/min), 1,225,870,790 distinct states found (2,098,690 ds/min), 189,921,025 states left on queue.
+Progress(42) at 2024-11-06 23:50:01: 14,618,007,797 states generated (29,502,709 s/min), 1,227,944,381 distinct states found (2,073,591 ds/min), 189,947,590 states left on queue.
+Progress(42) at 2024-11-06 23:51:01: 14,647,405,532 states generated (29,397,735 s/min), 1,230,287,712 distinct states found (2,343,331 ds/min), 190,200,223 states left on queue.
+Progress(42) at 2024-11-06 23:52:01: 14,676,733,478 states generated (29,327,946 s/min), 1,232,303,440 distinct states found (2,015,728 ds/min), 190,178,290 states left on queue.
+Progress(42) at 2024-11-06 23:53:01: 14,706,089,483 states generated (29,356,005 s/min), 1,234,269,055 distinct states found (1,965,615 ds/min), 190,175,215 states left on queue.
+Progress(42) at 2024-11-06 23:54:01: 14,735,226,809 states generated (29,137,326 s/min), 1,236,451,189 distinct states found (2,182,134 ds/min), 190,293,853 states left on queue.
+Progress(42) at 2024-11-06 23:55:01: 14,764,611,146 states generated (29,384,337 s/min), 1,238,780,557 distinct states found (2,329,368 ds/min), 190,528,991 states left on queue.
+Progress(42) at 2024-11-06 23:56:01: 14,793,911,038 states generated (29,299,892 s/min), 1,240,745,156 distinct states found (1,964,599 ds/min), 190,493,881 states left on queue.
+Progress(42) at 2024-11-06 23:57:01: 14,823,113,635 states generated (29,202,597 s/min), 1,242,984,781 distinct states found (2,239,625 ds/min), 190,675,723 states left on queue.
+Progress(42) at 2024-11-06 23:58:01: 14,852,208,056 states generated (29,094,421 s/min), 1,245,341,804 distinct states found (2,357,023 ds/min), 190,959,027 states left on queue.
+Progress(42) at 2024-11-06 23:59:01: 14,881,390,523 states generated (29,182,467 s/min), 1,247,530,823 distinct states found (2,189,019 ds/min), 191,085,175 states left on queue.
+Progress(42) at 2024-11-07 00:00:01: 14,910,709,837 states generated (29,319,314 s/min), 1,249,665,632 distinct states found (2,134,809 ds/min), 191,148,911 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-07 00:01:02)
+Progress(42) at 2024-11-07 00:01:02: 14,940,301,722 states generated (29,591,885 s/min), 1,251,820,098 distinct states found (2,154,466 ds/min), 191,164,099 states left on queue.
+Progress(42) at 2024-11-07 00:02:02: 14,969,468,946 states generated (29,167,224 s/min), 1,253,608,374 distinct states found (1,788,276 ds/min), 190,977,899 states left on queue.
+Progress(42) at 2024-11-07 00:03:02: 14,998,469,861 states generated (29,000,915 s/min), 1,255,846,206 distinct states found (2,237,832 ds/min), 191,179,932 states left on queue.
+Progress(42) at 2024-11-07 00:04:02: 15,027,424,344 states generated (28,954,483 s/min), 1,258,012,253 distinct states found (2,166,047 ds/min), 191,269,006 states left on queue.
+Progress(42) at 2024-11-07 00:05:02: 15,056,595,053 states generated (29,170,709 s/min), 1,259,974,817 distinct states found (1,962,564 ds/min), 191,232,379 states left on queue.
+Progress(42) at 2024-11-07 00:06:02: 15,085,857,792 states generated (29,262,739 s/min), 1,262,139,752 distinct states found (2,164,935 ds/min), 191,351,326 states left on queue.
+Progress(42) at 2024-11-07 00:07:02: 15,115,386,019 states generated (29,528,227 s/min), 1,264,425,723 distinct states found (2,285,971 ds/min), 191,549,077 states left on queue.
+Progress(42) at 2024-11-07 00:08:02: 15,144,705,784 states generated (29,319,765 s/min), 1,266,390,816 distinct states found (1,965,093 ds/min), 191,495,454 states left on queue.
+Progress(42) at 2024-11-07 00:09:02: 15,173,877,454 states generated (29,171,670 s/min), 1,268,144,487 distinct states found (1,753,671 ds/min), 191,300,959 states left on queue.
+Progress(42) at 2024-11-07 00:10:02: 15,203,080,845 states generated (29,203,391 s/min), 1,270,256,870 distinct states found (2,112,383 ds/min), 191,363,085 states left on queue.
+Progress(42) at 2024-11-07 00:11:02: 15,232,426,418 states generated (29,345,573 s/min), 1,272,624,413 distinct states found (2,367,543 ds/min), 191,673,032 states left on queue.
+Progress(42) at 2024-11-07 00:12:02: 15,261,677,209 states generated (29,250,791 s/min), 1,274,995,857 distinct states found (2,371,444 ds/min), 191,960,618 states left on queue.
+Progress(42) at 2024-11-07 00:13:02: 15,290,882,314 states generated (29,205,105 s/min), 1,277,269,501 distinct states found (2,273,644 ds/min), 192,155,220 states left on queue.
+Progress(42) at 2024-11-07 00:14:02: 15,320,166,816 states generated (29,284,502 s/min), 1,279,524,897 distinct states found (2,255,396 ds/min), 192,367,797 states left on queue.
+Progress(42) at 2024-11-07 00:15:02: 15,349,391,017 states generated (29,224,201 s/min), 1,281,912,896 distinct states found (2,387,999 ds/min), 192,657,361 states left on queue.
+Progress(42) at 2024-11-07 00:16:02: 15,378,510,873 states generated (29,119,856 s/min), 1,284,352,819 distinct states found (2,439,923 ds/min), 192,982,001 states left on queue.
+Progress(42) at 2024-11-07 00:17:02: 15,407,729,690 states generated (29,218,817 s/min), 1,286,798,116 distinct states found (2,445,297 ds/min), 193,251,888 states left on queue.
+Progress(42) at 2024-11-07 00:18:02: 15,437,122,682 states generated (29,392,992 s/min), 1,289,060,398 distinct states found (2,262,282 ds/min), 193,393,686 states left on queue.
+Progress(42) at 2024-11-07 00:19:02: 15,466,437,919 states generated (29,315,237 s/min), 1,291,390,007 distinct states found (2,329,609 ds/min), 193,674,611 states left on queue.
+Progress(42) at 2024-11-07 00:20:02: 15,495,795,434 states generated (29,357,515 s/min), 1,293,625,999 distinct states found (2,235,992 ds/min), 193,855,148 states left on queue.
+Progress(42) at 2024-11-07 00:21:02: 15,524,856,146 states generated (29,060,712 s/min), 1,295,675,220 distinct states found (2,049,221 ds/min), 193,858,347 states left on queue.
+Progress(42) at 2024-11-07 00:22:02: 15,553,951,279 states generated (29,095,133 s/min), 1,297,806,219 distinct states found (2,130,999 ds/min), 193,910,330 states left on queue.
+Progress(42) at 2024-11-07 00:23:02: 15,582,781,229 states generated (28,829,950 s/min), 1,300,215,254 distinct states found (2,409,035 ds/min), 194,211,020 states left on queue.
+Progress(42) at 2024-11-07 00:24:02: 15,611,889,872 states generated (29,108,643 s/min), 1,302,431,347 distinct states found (2,216,093 ds/min), 194,324,070 states left on queue.
+Progress(42) at 2024-11-07 00:25:02: 15,640,778,210 states generated (28,888,338 s/min), 1,304,674,839 distinct states found (2,243,492 ds/min), 194,483,563 states left on queue.
+Progress(42) at 2024-11-07 00:26:02: 15,669,830,004 states generated (29,051,794 s/min), 1,306,661,103 distinct states found (1,986,264 ds/min), 194,429,101 states left on queue.
+Progress(42) at 2024-11-07 00:27:02: 15,699,049,213 states generated (29,219,209 s/min), 1,308,920,712 distinct states found (2,259,609 ds/min), 194,577,576 states left on queue.
+Progress(42) at 2024-11-07 00:28:02: 15,728,283,982 states generated (29,234,769 s/min), 1,310,924,780 distinct states found (2,004,068 ds/min), 194,488,601 states left on queue.
+Progress(42) at 2024-11-07 00:29:02: 15,757,507,793 states generated (29,223,811 s/min), 1,312,729,390 distinct states found (1,804,610 ds/min), 194,321,454 states left on queue.
+Progress(42) at 2024-11-07 00:30:02: 15,786,513,733 states generated (29,005,940 s/min), 1,314,926,573 distinct states found (2,197,183 ds/min), 194,422,995 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-07 00:31:03)
+Progress(42) at 2024-11-07 00:31:03: 15,815,683,048 states generated (29,169,315 s/min), 1,317,135,461 distinct states found (2,208,888 ds/min), 194,492,192 states left on queue.
+Progress(42) at 2024-11-07 00:32:03: 15,844,758,678 states generated (29,075,630 s/min), 1,319,144,875 distinct states found (2,009,414 ds/min), 194,413,387 states left on queue.
+Progress(42) at 2024-11-07 00:33:03: 15,873,998,157 states generated (29,239,479 s/min), 1,320,932,025 distinct states found (1,787,150 ds/min), 194,281,981 states left on queue.
+Progress(42) at 2024-11-07 00:34:03: 15,903,205,479 states generated (29,207,322 s/min), 1,322,654,400 distinct states found (1,722,375 ds/min), 194,091,121 states left on queue.
+Progress(42) at 2024-11-07 00:35:03: 15,932,501,264 states generated (29,295,785 s/min), 1,324,682,430 distinct states found (2,028,030 ds/min), 194,137,494 states left on queue.
+Progress(42) at 2024-11-07 00:36:03: 15,961,589,919 states generated (29,088,655 s/min), 1,326,509,334 distinct states found (1,826,904 ds/min), 194,051,639 states left on queue.
+Progress(42) at 2024-11-07 00:37:03: 15,990,668,327 states generated (29,078,408 s/min), 1,328,357,672 distinct states found (1,848,338 ds/min), 193,989,585 states left on queue.
+Progress(42) at 2024-11-07 00:38:03: 16,019,782,313 states generated (29,113,986 s/min), 1,330,232,446 distinct states found (1,874,774 ds/min), 193,949,446 states left on queue.
+Progress(42) at 2024-11-07 00:39:03: 16,049,252,200 states generated (29,469,887 s/min), 1,331,987,412 distinct states found (1,754,966 ds/min), 193,747,896 states left on queue.
+Progress(42) at 2024-11-07 00:40:03: 16,078,692,514 states generated (29,440,314 s/min), 1,333,894,185 distinct states found (1,906,773 ds/min), 193,729,942 states left on queue.
+Progress(42) at 2024-11-07 00:41:03: 16,108,160,136 states generated (29,467,622 s/min), 1,336,102,661 distinct states found (2,208,476 ds/min), 193,914,624 states left on queue.
+Progress(42) at 2024-11-07 00:42:03: 16,137,813,382 states generated (29,653,246 s/min), 1,338,180,836 distinct states found (2,078,175 ds/min), 193,976,996 states left on queue.
+Progress(43) at 2024-11-07 00:43:03: 16,167,357,885 states generated (29,544,503 s/min), 1,339,957,139 distinct states found (1,776,303 ds/min), 193,787,392 states left on queue.
+Progress(43) at 2024-11-07 00:44:03: 16,196,650,450 states generated (29,292,565 s/min), 1,341,719,088 distinct states found (1,761,949 ds/min), 193,648,551 states left on queue.
+Progress(43) at 2024-11-07 00:45:03: 16,225,735,286 states generated (29,084,836 s/min), 1,343,468,127 distinct states found (1,749,039 ds/min), 193,497,590 states left on queue.
+Progress(43) at 2024-11-07 00:46:03: 16,254,805,612 states generated (29,070,326 s/min), 1,345,280,226 distinct states found (1,812,099 ds/min), 193,364,788 states left on queue.
+Progress(43) at 2024-11-07 00:47:03: 16,283,933,423 states generated (29,127,811 s/min), 1,347,294,879 distinct states found (2,014,653 ds/min), 193,397,713 states left on queue.
+Progress(43) at 2024-11-07 00:48:03: 16,312,911,730 states generated (28,978,307 s/min), 1,349,192,377 distinct states found (1,897,498 ds/min), 193,321,503 states left on queue.
+Progress(43) at 2024-11-07 00:49:03: 16,342,115,657 states generated (29,203,927 s/min), 1,350,961,684 distinct states found (1,769,307 ds/min), 193,144,596 states left on queue.
+Progress(43) at 2024-11-07 00:50:03: 16,370,988,391 states generated (28,872,734 s/min), 1,352,868,904 distinct states found (1,907,220 ds/min), 193,089,969 states left on queue.
+Progress(43) at 2024-11-07 00:51:03: 16,400,089,208 states generated (29,100,817 s/min), 1,354,864,448 distinct states found (1,995,544 ds/min), 193,098,377 states left on queue.
+Progress(43) at 2024-11-07 00:52:03: 16,429,331,456 states generated (29,242,248 s/min), 1,356,734,632 distinct states found (1,870,184 ds/min), 193,093,615 states left on queue.
+Progress(43) at 2024-11-07 00:53:03: 16,458,648,761 states generated (29,317,305 s/min), 1,358,622,917 distinct states found (1,888,285 ds/min), 193,098,172 states left on queue.
+Progress(43) at 2024-11-07 00:54:03: 16,487,874,773 states generated (29,226,012 s/min), 1,360,737,908 distinct states found (2,114,991 ds/min), 193,250,949 states left on queue.
+Progress(43) at 2024-11-07 00:55:03: 16,517,101,401 states generated (29,226,628 s/min), 1,363,024,072 distinct states found (2,286,164 ds/min), 193,508,719 states left on queue.
+Progress(43) at 2024-11-07 00:56:03: 16,546,231,362 states generated (29,129,961 s/min), 1,365,056,771 distinct states found (2,032,699 ds/min), 193,558,441 states left on queue.
+Progress(43) at 2024-11-07 00:57:03: 16,575,532,837 states generated (29,301,475 s/min), 1,367,107,709 distinct states found (2,050,938 ds/min), 193,609,354 states left on queue.
+Progress(43) at 2024-11-07 00:58:03: 16,604,872,137 states generated (29,339,300 s/min), 1,369,059,417 distinct states found (1,951,708 ds/min), 193,561,420 states left on queue.
+Progress(43) at 2024-11-07 00:59:03: 16,634,070,732 states generated (29,198,595 s/min), 1,371,016,928 distinct states found (1,957,511 ds/min), 193,513,278 states left on queue.
+Progress(43) at 2024-11-07 01:00:03: 16,663,158,113 states generated (29,087,381 s/min), 1,373,092,542 distinct states found (2,075,614 ds/min), 193,582,661 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-07 01:01:03)
+Progress(43) at 2024-11-07 01:01:03: 16,692,576,110 states generated (29,417,997 s/min), 1,375,200,108 distinct states found (2,107,566 ds/min), 193,664,621 states left on queue.
+Progress(43) at 2024-11-07 01:02:03: 16,721,716,479 states generated (29,140,369 s/min), 1,377,247,529 distinct states found (2,047,421 ds/min), 193,708,538 states left on queue.
+Progress(43) at 2024-11-07 01:03:03: 16,750,779,523 states generated (29,063,044 s/min), 1,379,368,087 distinct states found (2,120,558 ds/min), 193,813,065 states left on queue.
+Progress(43) at 2024-11-07 01:04:03: 16,779,794,524 states generated (29,015,001 s/min), 1,381,371,287 distinct states found (2,003,200 ds/min), 193,825,465 states left on queue.
+Progress(43) at 2024-11-07 01:05:03: 16,808,907,203 states generated (29,112,679 s/min), 1,383,515,008 distinct states found (2,143,721 ds/min), 193,953,821 states left on queue.
+Progress(43) at 2024-11-07 01:06:03: 16,838,029,628 states generated (29,122,425 s/min), 1,385,629,882 distinct states found (2,114,874 ds/min), 194,038,163 states left on queue.
+Progress(43) at 2024-11-07 01:07:03: 16,867,418,111 states generated (29,388,483 s/min), 1,387,561,049 distinct states found (1,931,167 ds/min), 194,004,058 states left on queue.
+Progress(43) at 2024-11-07 01:08:03: 16,896,555,416 states generated (29,137,305 s/min), 1,389,592,238 distinct states found (2,031,189 ds/min), 194,058,208 states left on queue.
+Progress(43) at 2024-11-07 01:09:03: 16,925,642,685 states generated (29,087,269 s/min), 1,391,404,896 distinct states found (1,812,658 ds/min), 193,924,951 states left on queue.
+Progress(43) at 2024-11-07 01:10:03: 16,954,638,533 states generated (28,995,848 s/min), 1,393,186,525 distinct states found (1,781,629 ds/min), 193,784,358 states left on queue.
+Progress(43) at 2024-11-07 01:11:03: 16,983,710,894 states generated (29,072,361 s/min), 1,395,018,264 distinct states found (1,831,739 ds/min), 193,697,690 states left on queue.
+Progress(43) at 2024-11-07 01:12:03: 17,012,741,316 states generated (29,030,422 s/min), 1,397,039,325 distinct states found (2,021,061 ds/min), 193,755,919 states left on queue.
+Progress(43) at 2024-11-07 01:13:03: 17,041,674,538 states generated (28,933,222 s/min), 1,399,086,352 distinct states found (2,047,027 ds/min), 193,799,420 states left on queue.
+Progress(43) at 2024-11-07 01:14:03: 17,070,653,912 states generated (28,979,374 s/min), 1,401,092,312 distinct states found (2,005,960 ds/min), 193,820,018 states left on queue.
+Progress(43) at 2024-11-07 01:15:03: 17,099,536,446 states generated (28,882,534 s/min), 1,403,159,743 distinct states found (2,067,431 ds/min), 193,867,947 states left on queue.
+Progress(43) at 2024-11-07 01:16:03: 17,128,396,670 states generated (28,860,224 s/min), 1,405,244,280 distinct states found (2,084,537 ds/min), 193,945,380 states left on queue.
+Progress(43) at 2024-11-07 01:17:03: 17,157,276,177 states generated (28,879,507 s/min), 1,407,274,748 distinct states found (2,030,468 ds/min), 193,944,077 states left on queue.
+Progress(43) at 2024-11-07 01:18:03: 17,186,149,639 states generated (28,873,462 s/min), 1,409,283,088 distinct states found (2,008,340 ds/min), 193,881,792 states left on queue.
+Progress(43) at 2024-11-07 01:19:03: 17,214,923,206 states generated (28,773,567 s/min), 1,411,167,065 distinct states found (1,883,977 ds/min), 193,711,394 states left on queue.
+Progress(43) at 2024-11-07 01:20:03: 17,243,730,245 states generated (28,807,039 s/min), 1,413,023,763 distinct states found (1,856,698 ds/min), 193,546,054 states left on queue.
+Progress(43) at 2024-11-07 01:21:03: 17,272,650,525 states generated (28,920,280 s/min), 1,414,802,171 distinct states found (1,778,408 ds/min), 193,345,308 states left on queue.
+Progress(43) at 2024-11-07 01:22:03: 17,301,943,589 states generated (29,293,064 s/min), 1,416,599,440 distinct states found (1,797,269 ds/min), 193,158,676 states left on queue.
+Progress(43) at 2024-11-07 01:23:03: 17,331,337,313 states generated (29,393,724 s/min), 1,418,547,450 distinct states found (1,948,010 ds/min), 193,112,883 states left on queue.
+Progress(43) at 2024-11-07 01:24:03: 17,360,793,100 states generated (29,455,787 s/min), 1,420,576,018 distinct states found (2,028,568 ds/min), 193,100,476 states left on queue.
+Progress(43) at 2024-11-07 01:25:03: 17,390,123,392 states generated (29,330,292 s/min), 1,422,693,479 distinct states found (2,117,461 ds/min), 193,171,748 states left on queue.
+Progress(43) at 2024-11-07 01:26:03: 17,419,468,515 states generated (29,345,123 s/min), 1,424,783,244 distinct states found (2,089,765 ds/min), 193,228,274 states left on queue.
+Progress(43) at 2024-11-07 01:27:03: 17,448,810,016 states generated (29,341,501 s/min), 1,426,560,811 distinct states found (1,777,567 ds/min), 193,036,459 states left on queue.
+Progress(43) at 2024-11-07 01:28:03: 17,478,034,472 states generated (29,224,456 s/min), 1,428,663,374 distinct states found (2,102,563 ds/min), 193,125,616 states left on queue.
+Progress(43) at 2024-11-07 01:29:03: 17,507,201,835 states generated (29,167,363 s/min), 1,430,735,910 distinct states found (2,072,536 ds/min), 193,146,850 states left on queue.
+Progress(43) at 2024-11-07 01:30:03: 17,536,546,498 states generated (29,344,663 s/min), 1,432,877,950 distinct states found (2,142,040 ds/min), 193,230,645 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-07 01:31:04)
+Progress(43) at 2024-11-07 01:31:04: 17,566,061,546 states generated (29,515,048 s/min), 1,434,839,708 distinct states found (1,961,758 ds/min), 193,176,951 states left on queue.
+Progress(43) at 2024-11-07 01:32:04: 17,595,015,993 states generated (28,954,447 s/min), 1,436,986,257 distinct states found (2,146,549 ds/min), 193,289,254 states left on queue.
+Progress(43) at 2024-11-07 01:33:04: 17,624,137,153 states generated (29,121,160 s/min), 1,439,279,150 distinct states found (2,292,893 ds/min), 193,525,973 states left on queue.
+Progress(43) at 2024-11-07 01:34:04: 17,653,328,248 states generated (29,191,095 s/min), 1,441,299,767 distinct states found (2,020,617 ds/min), 193,504,947 states left on queue.
+Progress(43) at 2024-11-07 01:35:04: 17,682,562,562 states generated (29,234,314 s/min), 1,443,317,413 distinct states found (2,017,646 ds/min), 193,471,905 states left on queue.
+Progress(43) at 2024-11-07 01:36:04: 17,711,829,397 states generated (29,266,835 s/min), 1,445,304,310 distinct states found (1,986,897 ds/min), 193,370,899 states left on queue.
+Progress(43) at 2024-11-07 01:37:04: 17,740,910,347 states generated (29,080,950 s/min), 1,447,009,563 distinct states found (1,705,253 ds/min), 193,129,235 states left on queue.
+Progress(43) at 2024-11-07 01:38:04: 17,769,836,321 states generated (28,925,974 s/min), 1,449,139,496 distinct states found (2,129,933 ds/min), 193,234,511 states left on queue.
+Progress(43) at 2024-11-07 01:39:04: 17,798,713,067 states generated (28,876,746 s/min), 1,451,211,612 distinct states found (2,072,116 ds/min), 193,241,362 states left on queue.
+Progress(43) at 2024-11-07 01:40:04: 17,827,794,691 states generated (29,081,624 s/min), 1,453,062,046 distinct states found (1,850,434 ds/min), 193,114,753 states left on queue.
+Progress(43) at 2024-11-07 01:41:04: 17,856,974,014 states generated (29,179,323 s/min), 1,455,151,187 distinct states found (2,089,141 ds/min), 193,169,579 states left on queue.
+Progress(43) at 2024-11-07 01:42:04: 17,886,446,666 states generated (29,472,652 s/min), 1,457,303,171 distinct states found (2,151,984 ds/min), 193,263,708 states left on queue.
+Progress(43) at 2024-11-07 01:43:04: 17,915,744,840 states generated (29,298,174 s/min), 1,459,261,460 distinct states found (1,958,289 ds/min), 193,194,468 states left on queue.
+Progress(43) at 2024-11-07 01:44:04: 17,944,793,057 states generated (29,048,217 s/min), 1,460,885,305 distinct states found (1,623,845 ds/min), 192,905,330 states left on queue.
+Progress(43) at 2024-11-07 01:45:04: 17,973,952,967 states generated (29,159,910 s/min), 1,462,880,642 distinct states found (1,995,337 ds/min), 192,871,348 states left on queue.
+Progress(43) at 2024-11-07 01:46:04: 18,003,158,344 states generated (29,205,377 s/min), 1,465,077,846 distinct states found (2,197,204 ds/min), 193,039,702 states left on queue.
+Progress(43) at 2024-11-07 01:47:04: 18,032,464,087 states generated (29,305,743 s/min), 1,467,361,120 distinct states found (2,283,274 ds/min), 193,271,051 states left on queue.
+Progress(43) at 2024-11-07 01:48:04: 18,061,597,682 states generated (29,133,595 s/min), 1,469,505,688 distinct states found (2,144,568 ds/min), 193,354,360 states left on queue.
+Progress(43) at 2024-11-07 01:49:04: 18,090,888,515 states generated (29,290,833 s/min), 1,471,655,035 distinct states found (2,149,347 ds/min), 193,472,080 states left on queue.
+Progress(43) at 2024-11-07 01:50:04: 18,119,855,749 states generated (28,967,234 s/min), 1,473,959,147 distinct states found (2,304,112 ds/min), 193,714,821 states left on queue.
+Progress(43) at 2024-11-07 01:51:04: 18,149,035,954 states generated (29,180,205 s/min), 1,476,253,894 distinct states found (2,294,747 ds/min), 193,939,051 states left on queue.
+Progress(43) at 2024-11-07 01:52:04: 18,178,210,402 states generated (29,174,448 s/min), 1,478,557,699 distinct states found (2,303,805 ds/min), 194,141,809 states left on queue.
+Progress(43) at 2024-11-07 01:53:04: 18,207,377,534 states generated (29,167,132 s/min), 1,480,870,404 distinct states found (2,312,705 ds/min), 194,307,877 states left on queue.
+Progress(43) at 2024-11-07 01:54:04: 18,236,577,989 states generated (29,200,455 s/min), 1,483,070,823 distinct states found (2,200,419 ds/min), 194,387,223 states left on queue.
+Progress(43) at 2024-11-07 01:55:04: 18,265,859,163 states generated (29,281,174 s/min), 1,485,222,154 distinct states found (2,151,331 ds/min), 194,522,233 states left on queue.
+Progress(43) at 2024-11-07 01:56:04: 18,295,148,797 states generated (29,289,634 s/min), 1,487,521,283 distinct states found (2,299,129 ds/min), 194,755,427 states left on queue.
+Progress(43) at 2024-11-07 01:57:04: 18,324,289,175 states generated (29,140,378 s/min), 1,489,367,193 distinct states found (1,845,910 ds/min), 194,604,366 states left on queue.
+Progress(43) at 2024-11-07 01:58:04: 18,353,385,770 states generated (29,096,595 s/min), 1,491,503,782 distinct states found (2,136,589 ds/min), 194,651,670 states left on queue.
+Progress(43) at 2024-11-07 01:59:04: 18,382,277,307 states generated (28,891,537 s/min), 1,493,659,362 distinct states found (2,155,580 ds/min), 194,761,640 states left on queue.
+Progress(43) at 2024-11-07 02:00:04: 18,411,146,853 states generated (28,869,546 s/min), 1,495,935,237 distinct states found (2,275,875 ds/min), 194,908,896 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-07 02:01:05)
+Progress(43) at 2024-11-07 02:01:05: 18,440,532,837 states generated (29,385,984 s/min), 1,497,937,249 distinct states found (2,002,012 ds/min), 194,874,096 states left on queue.
+Progress(43) at 2024-11-07 02:02:05: 18,469,385,511 states generated (28,852,674 s/min), 1,500,062,084 distinct states found (2,124,835 ds/min), 194,900,930 states left on queue.
+Progress(43) at 2024-11-07 02:03:05: 18,498,509,160 states generated (29,123,649 s/min), 1,502,077,387 distinct states found (2,015,303 ds/min), 194,871,250 states left on queue.
+Progress(43) at 2024-11-07 02:04:05: 18,527,694,520 states generated (29,185,360 s/min), 1,504,242,136 distinct states found (2,164,749 ds/min), 194,924,748 states left on queue.
+Progress(43) at 2024-11-07 02:05:05: 18,556,901,350 states generated (29,206,830 s/min), 1,506,034,687 distinct states found (1,792,551 ds/min), 194,670,609 states left on queue.
+Progress(43) at 2024-11-07 02:06:05: 18,586,004,706 states generated (29,103,356 s/min), 1,507,879,191 distinct states found (1,844,504 ds/min), 194,551,782 states left on queue.
+Progress(43) at 2024-11-07 02:07:05: 18,614,881,319 states generated (28,876,613 s/min), 1,510,019,997 distinct states found (2,140,806 ds/min), 194,594,957 states left on queue.
+Progress(43) at 2024-11-07 02:08:05: 18,643,854,322 states generated (28,973,003 s/min), 1,512,074,165 distinct states found (2,054,168 ds/min), 194,532,832 states left on queue.
+Progress(43) at 2024-11-07 02:09:05: 18,672,998,550 states generated (29,144,228 s/min), 1,513,943,120 distinct states found (1,868,955 ds/min), 194,368,599 states left on queue.
+Progress(43) at 2024-11-07 02:10:05: 18,702,201,308 states generated (29,202,758 s/min), 1,515,546,068 distinct states found (1,602,948 ds/min), 194,090,755 states left on queue.
+Progress(43) at 2024-11-07 02:11:05: 18,731,481,011 states generated (29,279,703 s/min), 1,517,343,788 distinct states found (1,797,720 ds/min), 193,942,961 states left on queue.
+Progress(43) at 2024-11-07 02:12:05: 18,760,609,986 states generated (29,128,975 s/min), 1,519,160,050 distinct states found (1,816,262 ds/min), 193,815,502 states left on queue.
+Progress(43) at 2024-11-07 02:13:05: 18,789,628,202 states generated (29,018,216 s/min), 1,520,860,123 distinct states found (1,700,073 ds/min), 193,642,399 states left on queue.
+Progress(43) at 2024-11-07 02:14:05: 18,818,770,407 states generated (29,142,205 s/min), 1,522,616,126 distinct states found (1,756,003 ds/min), 193,516,180 states left on queue.
+Progress(43) at 2024-11-07 02:15:05: 18,847,943,521 states generated (29,173,114 s/min), 1,524,373,878 distinct states found (1,757,752 ds/min), 193,352,389 states left on queue.
+Progress(43) at 2024-11-07 02:16:05: 18,877,338,814 states generated (29,395,293 s/min), 1,526,022,199 distinct states found (1,648,321 ds/min), 193,099,089 states left on queue.
+Progress(43) at 2024-11-07 02:17:05: 18,906,854,907 states generated (29,516,093 s/min), 1,528,057,287 distinct states found (2,035,088 ds/min), 193,164,007 states left on queue.
+Progress(43) at 2024-11-07 02:18:05: 18,936,272,714 states generated (29,417,807 s/min), 1,530,070,868 distinct states found (2,013,581 ds/min), 193,195,191 states left on queue.
+Progress(43) at 2024-11-07 02:19:05: 18,965,845,291 states generated (29,572,577 s/min), 1,531,953,514 distinct states found (1,882,646 ds/min), 193,094,610 states left on queue.
+Progress(44) at 2024-11-07 02:20:05: 18,995,225,711 states generated (29,380,420 s/min), 1,533,586,486 distinct states found (1,632,972 ds/min), 192,813,292 states left on queue.
+Progress(44) at 2024-11-07 02:21:05: 19,024,424,249 states generated (29,198,538 s/min), 1,535,341,846 distinct states found (1,755,360 ds/min), 192,665,431 states left on queue.
+Progress(44) at 2024-11-07 02:22:05: 19,053,319,611 states generated (28,895,362 s/min), 1,536,913,652 distinct states found (1,571,806 ds/min), 192,336,687 states left on queue.
+Progress(44) at 2024-11-07 02:23:05: 19,082,456,366 states generated (29,136,755 s/min), 1,538,781,638 distinct states found (1,867,986 ds/min), 192,258,068 states left on queue.
+Progress(44) at 2024-11-07 02:24:05: 19,111,445,941 states generated (28,989,575 s/min), 1,540,696,734 distinct states found (1,915,096 ds/min), 192,193,602 states left on queue.
+Progress(44) at 2024-11-07 02:25:05: 19,140,498,683 states generated (29,052,742 s/min), 1,542,368,994 distinct states found (1,672,260 ds/min), 191,938,239 states left on queue.
+Progress(44) at 2024-11-07 02:26:05: 19,169,386,645 states generated (28,887,962 s/min), 1,544,099,236 distinct states found (1,730,242 ds/min), 191,741,059 states left on queue.
+Progress(44) at 2024-11-07 02:27:05: 19,198,354,957 states generated (28,968,312 s/min), 1,545,891,836 distinct states found (1,792,600 ds/min), 191,577,211 states left on queue.
+Progress(44) at 2024-11-07 02:28:05: 19,227,551,398 states generated (29,196,441 s/min), 1,547,751,807 distinct states found (1,859,971 ds/min), 191,530,291 states left on queue.
+Progress(44) at 2024-11-07 02:29:05: 19,256,905,544 states generated (29,354,146 s/min), 1,549,562,753 distinct states found (1,810,946 ds/min), 191,492,536 states left on queue.
+Progress(44) at 2024-11-07 02:30:05: 19,286,043,009 states generated (29,137,465 s/min), 1,551,387,062 distinct states found (1,824,309 ds/min), 191,432,131 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-07 02:31:05)
+Progress(44) at 2024-11-07 02:31:05: 19,315,478,636 states generated (29,435,627 s/min), 1,553,684,416 distinct states found (2,297,354 ds/min), 191,685,387 states left on queue.
+Progress(44) at 2024-11-07 02:32:05: 19,344,574,433 states generated (29,095,797 s/min), 1,555,642,251 distinct states found (1,957,835 ds/min), 191,687,380 states left on queue.
+Progress(44) at 2024-11-07 02:33:05: 19,373,560,321 states generated (28,985,888 s/min), 1,557,576,032 distinct states found (1,933,781 ds/min), 191,644,771 states left on queue.
+Progress(44) at 2024-11-07 02:34:05: 19,402,882,849 states generated (29,322,528 s/min), 1,559,483,211 distinct states found (1,907,179 ds/min), 191,584,351 states left on queue.
+Progress(44) at 2024-11-07 02:35:05: 19,432,084,827 states generated (29,201,978 s/min), 1,561,305,888 distinct states found (1,822,677 ds/min), 191,432,596 states left on queue.
+Progress(44) at 2024-11-07 02:36:05: 19,461,112,335 states generated (29,027,508 s/min), 1,563,180,797 distinct states found (1,874,909 ds/min), 191,330,553 states left on queue.
+Progress(44) at 2024-11-07 02:37:05: 19,490,043,498 states generated (28,931,163 s/min), 1,565,137,368 distinct states found (1,956,571 ds/min), 191,292,333 states left on queue.
+Progress(44) at 2024-11-07 02:38:05: 19,519,153,014 states generated (29,109,516 s/min), 1,567,034,954 distinct states found (1,897,586 ds/min), 191,229,524 states left on queue.
+Progress(44) at 2024-11-07 02:39:05: 19,548,204,678 states generated (29,051,664 s/min), 1,568,989,443 distinct states found (1,954,489 ds/min), 191,191,752 states left on queue.
+Progress(44) at 2024-11-07 02:40:05: 19,577,227,470 states generated (29,022,792 s/min), 1,570,981,495 distinct states found (1,992,052 ds/min), 191,200,024 states left on queue.
+Progress(44) at 2024-11-07 02:41:05: 19,606,172,601 states generated (28,945,131 s/min), 1,572,870,324 distinct states found (1,888,829 ds/min), 191,115,956 states left on queue.
+Progress(44) at 2024-11-07 02:42:05: 19,635,167,481 states generated (28,994,880 s/min), 1,574,894,468 distinct states found (2,024,144 ds/min), 191,139,869 states left on queue.
+Progress(44) at 2024-11-07 02:43:05: 19,664,339,049 states generated (29,171,568 s/min), 1,576,906,348 distinct states found (2,011,880 ds/min), 191,137,521 states left on queue.
+Progress(44) at 2024-11-07 02:44:05: 19,693,639,689 states generated (29,300,640 s/min), 1,578,748,425 distinct states found (1,842,077 ds/min), 191,040,518 states left on queue.
+Progress(44) at 2024-11-07 02:45:05: 19,722,704,536 states generated (29,064,847 s/min), 1,580,671,538 distinct states found (1,923,113 ds/min), 191,001,469 states left on queue.
+Progress(44) at 2024-11-07 02:46:05: 19,751,627,669 states generated (28,923,133 s/min), 1,582,340,762 distinct states found (1,669,224 ds/min), 190,750,504 states left on queue.
+Progress(44) at 2024-11-07 02:47:05: 19,780,532,535 states generated (28,904,866 s/min), 1,583,965,049 distinct states found (1,624,287 ds/min), 190,492,540 states left on queue.
+Progress(44) at 2024-11-07 02:48:05: 19,809,548,743 states generated (29,016,208 s/min), 1,585,820,774 distinct states found (1,855,725 ds/min), 190,422,454 states left on queue.
+Progress(44) at 2024-11-07 02:49:05: 19,838,541,075 states generated (28,992,332 s/min), 1,587,731,649 distinct states found (1,910,875 ds/min), 190,386,932 states left on queue.
+Progress(44) at 2024-11-07 02:50:05: 19,867,458,320 states generated (28,917,245 s/min), 1,589,622,141 distinct states found (1,890,492 ds/min), 190,310,460 states left on queue.
+Progress(44) at 2024-11-07 02:51:05: 19,896,287,158 states generated (28,828,838 s/min), 1,591,517,151 distinct states found (1,895,010 ds/min), 190,235,561 states left on queue.
+Progress(44) at 2024-11-07 02:52:05: 19,925,117,820 states generated (28,830,662 s/min), 1,593,453,289 distinct states found (1,936,138 ds/min), 190,176,789 states left on queue.
+Progress(44) at 2024-11-07 02:53:05: 19,953,949,651 states generated (28,831,831 s/min), 1,595,392,832 distinct states found (1,939,543 ds/min), 190,137,713 states left on queue.
+Progress(44) at 2024-11-07 02:54:05: 19,982,791,590 states generated (28,841,939 s/min), 1,597,295,182 distinct states found (1,902,350 ds/min), 190,030,864 states left on queue.
+Progress(44) at 2024-11-07 02:55:05: 20,011,631,796 states generated (28,840,206 s/min), 1,599,162,388 distinct states found (1,867,206 ds/min), 189,857,155 states left on queue.
+Progress(44) at 2024-11-07 02:56:05: 20,040,350,017 states generated (28,718,221 s/min), 1,600,882,747 distinct states found (1,720,359 ds/min), 189,556,504 states left on queue.
+Progress(44) at 2024-11-07 02:57:05: 20,069,048,267 states generated (28,698,250 s/min), 1,602,583,945 distinct states found (1,701,198 ds/min), 189,276,085 states left on queue.
+Progress(44) at 2024-11-07 02:58:05: 20,098,037,079 states generated (28,988,812 s/min), 1,604,245,937 distinct states found (1,661,992 ds/min), 188,968,070 states left on queue.
+Progress(44) at 2024-11-07 02:59:05: 20,127,216,730 states generated (29,179,651 s/min), 1,605,916,753 distinct states found (1,670,816 ds/min), 188,703,437 states left on queue.
+Progress(44) at 2024-11-07 03:00:05: 20,156,712,917 states generated (29,496,187 s/min), 1,607,868,866 distinct states found (1,952,113 ds/min), 188,640,553 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-07 03:01:06)
+Progress(44) at 2024-11-07 03:01:06: 20,186,396,044 states generated (29,683,127 s/min), 1,609,765,772 distinct states found (1,896,906 ds/min), 188,510,848 states left on queue.
+Progress(44) at 2024-11-07 03:02:06: 20,215,754,864 states generated (29,358,820 s/min), 1,611,924,139 distinct states found (2,158,367 ds/min), 188,607,723 states left on queue.
+Progress(44) at 2024-11-07 03:03:06: 20,245,041,982 states generated (29,287,118 s/min), 1,613,794,702 distinct states found (1,870,563 ds/min), 188,472,700 states left on queue.
+Progress(44) at 2024-11-07 03:04:06: 20,274,294,374 states generated (29,252,392 s/min), 1,615,566,733 distinct states found (1,772,031 ds/min), 188,311,061 states left on queue.
+Progress(44) at 2024-11-07 03:05:06: 20,303,317,537 states generated (29,023,163 s/min), 1,617,541,966 distinct states found (1,975,233 ds/min), 188,275,227 states left on queue.
+Progress(44) at 2024-11-07 03:06:06: 20,332,555,917 states generated (29,238,380 s/min), 1,619,626,477 distinct states found (2,084,511 ds/min), 188,311,129 states left on queue.
+Progress(44) at 2024-11-07 03:07:06: 20,361,814,948 states generated (29,259,031 s/min), 1,621,498,944 distinct states found (1,872,467 ds/min), 188,187,982 states left on queue.
+Progress(44) at 2024-11-07 03:08:06: 20,391,066,062 states generated (29,251,114 s/min), 1,623,499,145 distinct states found (2,000,201 ds/min), 188,184,372 states left on queue.
+Progress(44) at 2024-11-07 03:09:06: 20,420,013,539 states generated (28,947,477 s/min), 1,625,534,256 distinct states found (2,035,111 ds/min), 188,202,174 states left on queue.
+Progress(44) at 2024-11-07 03:10:06: 20,449,116,787 states generated (29,103,248 s/min), 1,627,670,135 distinct states found (2,135,879 ds/min), 188,303,061 states left on queue.
+Progress(44) at 2024-11-07 03:11:06: 20,478,265,224 states generated (29,148,437 s/min), 1,629,558,947 distinct states found (1,888,812 ds/min), 188,171,995 states left on queue.
+Progress(44) at 2024-11-07 03:12:06: 20,507,459,785 states generated (29,194,561 s/min), 1,631,460,915 distinct states found (1,901,968 ds/min), 188,044,516 states left on queue.
+Progress(44) at 2024-11-07 03:13:06: 20,536,655,025 states generated (29,195,240 s/min), 1,633,292,515 distinct states found (1,831,600 ds/min), 187,823,678 states left on queue.
+Progress(44) at 2024-11-07 03:14:06: 20,565,699,198 states generated (29,044,173 s/min), 1,634,967,122 distinct states found (1,674,607 ds/min), 187,564,357 states left on queue.
+Progress(44) at 2024-11-07 03:15:06: 20,594,568,781 states generated (28,869,583 s/min), 1,636,996,440 distinct states found (2,029,318 ds/min), 187,577,506 states left on queue.
+Progress(44) at 2024-11-07 03:16:06: 20,623,463,526 states generated (28,894,745 s/min), 1,638,870,718 distinct states found (1,874,278 ds/min), 187,429,057 states left on queue.
+Progress(44) at 2024-11-07 03:17:06: 20,652,517,975 states generated (29,054,449 s/min), 1,640,608,054 distinct states found (1,737,336 ds/min), 187,198,996 states left on queue.
+Progress(44) at 2024-11-07 03:18:06: 20,681,729,377 states generated (29,211,402 s/min), 1,642,682,611 distinct states found (2,074,557 ds/min), 187,238,673 states left on queue.
+Progress(44) at 2024-11-07 03:19:06: 20,711,226,363 states generated (29,496,986 s/min), 1,644,764,480 distinct states found (2,081,869 ds/min), 187,269,746 states left on queue.
+Progress(44) at 2024-11-07 03:20:06: 20,740,520,876 states generated (29,294,513 s/min), 1,646,565,948 distinct states found (1,801,468 ds/min), 187,085,841 states left on queue.
+Progress(44) at 2024-11-07 03:21:06: 20,769,532,066 states generated (29,011,190 s/min), 1,648,139,570 distinct states found (1,573,622 ds/min), 186,737,971 states left on queue.
+Progress(44) at 2024-11-07 03:22:06: 20,798,731,555 states generated (29,199,489 s/min), 1,650,061,318 distinct states found (1,921,748 ds/min), 186,652,080 states left on queue.
+Progress(44) at 2024-11-07 03:23:06: 20,827,864,871 states generated (29,133,316 s/min), 1,652,217,368 distinct states found (2,156,050 ds/min), 186,786,338 states left on queue.
+Progress(44) at 2024-11-07 03:24:06: 20,857,114,542 states generated (29,249,671 s/min), 1,654,404,059 distinct states found (2,186,691 ds/min), 186,937,127 states left on queue.
+Progress(44) at 2024-11-07 03:25:06: 20,886,216,235 states generated (29,101,693 s/min), 1,656,424,687 distinct states found (2,020,628 ds/min), 186,925,384 states left on queue.
+Progress(44) at 2024-11-07 03:26:06: 20,915,415,138 states generated (29,198,903 s/min), 1,658,503,968 distinct states found (2,079,281 ds/min), 186,988,200 states left on queue.
+Progress(44) at 2024-11-07 03:27:06: 20,944,436,117 states generated (29,020,979 s/min), 1,660,708,925 distinct states found (2,204,957 ds/min), 187,151,771 states left on queue.
+Progress(44) at 2024-11-07 03:28:06: 20,973,637,986 states generated (29,201,869 s/min), 1,662,812,161 distinct states found (2,103,236 ds/min), 187,208,363 states left on queue.
+Progress(44) at 2024-11-07 03:29:06: 21,002,664,654 states generated (29,026,668 s/min), 1,665,077,078 distinct states found (2,264,917 ds/min), 187,398,168 states left on queue.
+Progress(44) at 2024-11-07 03:30:06: 21,031,900,683 states generated (29,236,029 s/min), 1,667,241,517 distinct states found (2,164,439 ds/min), 187,444,342 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-07 03:31:07)
+Progress(44) at 2024-11-07 03:31:07: 21,061,190,967 states generated (29,290,284 s/min), 1,669,337,346 distinct states found (2,095,829 ds/min), 187,431,388 states left on queue.
+Progress(44) at 2024-11-07 03:32:07: 21,090,368,622 states generated (29,177,655 s/min), 1,671,292,120 distinct states found (1,954,774 ds/min), 187,370,395 states left on queue.
+Progress(44) at 2024-11-07 03:33:07: 21,119,546,588 states generated (29,177,966 s/min), 1,673,505,061 distinct states found (2,212,941 ds/min), 187,548,275 states left on queue.
+Progress(44) at 2024-11-07 03:34:07: 21,148,770,544 states generated (29,223,956 s/min), 1,675,679,331 distinct states found (2,174,270 ds/min), 187,681,477 states left on queue.
+Progress(44) at 2024-11-07 03:35:07: 21,177,752,842 states generated (28,982,298 s/min), 1,677,512,003 distinct states found (1,832,672 ds/min), 187,502,663 states left on queue.
+Progress(44) at 2024-11-07 03:36:07: 21,206,777,989 states generated (29,025,147 s/min), 1,679,391,120 distinct states found (1,879,117 ds/min), 187,345,876 states left on queue.
+Progress(44) at 2024-11-07 03:37:07: 21,235,634,562 states generated (28,856,573 s/min), 1,681,551,461 distinct states found (2,160,341 ds/min), 187,464,887 states left on queue.
+Progress(44) at 2024-11-07 03:38:07: 21,264,448,690 states generated (28,814,128 s/min), 1,683,690,836 distinct states found (2,139,375 ds/min), 187,491,922 states left on queue.
+Progress(44) at 2024-11-07 03:39:07: 21,293,469,454 states generated (29,020,764 s/min), 1,685,615,643 distinct states found (1,924,807 ds/min), 187,416,199 states left on queue.
+Progress(44) at 2024-11-07 03:40:07: 21,322,287,082 states generated (28,817,628 s/min), 1,687,574,723 distinct states found (1,959,080 ds/min), 187,310,981 states left on queue.
+Progress(44) at 2024-11-07 03:41:07: 21,351,396,680 states generated (29,109,598 s/min), 1,689,546,445 distinct states found (1,971,722 ds/min), 187,236,923 states left on queue.
+Progress(44) at 2024-11-07 03:42:07: 21,380,557,165 states generated (29,160,485 s/min), 1,691,587,169 distinct states found (2,040,724 ds/min), 187,186,480 states left on queue.
+Progress(44) at 2024-11-07 03:43:07: 21,409,627,333 states generated (29,070,168 s/min), 1,693,246,645 distinct states found (1,659,476 ds/min), 186,839,410 states left on queue.
+Progress(44) at 2024-11-07 03:44:07: 21,438,692,500 states generated (29,065,167 s/min), 1,695,162,088 distinct states found (1,915,443 ds/min), 186,763,843 states left on queue.
+Progress(44) at 2024-11-07 03:45:07: 21,467,558,980 states generated (28,866,480 s/min), 1,697,105,328 distinct states found (1,943,240 ds/min), 186,647,091 states left on queue.
+Progress(44) at 2024-11-07 03:46:07: 21,496,459,596 states generated (28,900,616 s/min), 1,698,987,134 distinct states found (1,881,806 ds/min), 186,428,411 states left on queue.
+Progress(44) at 2024-11-07 03:47:07: 21,525,539,564 states generated (29,079,968 s/min), 1,700,685,335 distinct states found (1,698,201 ds/min), 186,176,831 states left on queue.
+Progress(44) at 2024-11-07 03:48:07: 21,554,716,115 states generated (29,176,551 s/min), 1,702,193,633 distinct states found (1,508,298 ds/min), 185,811,852 states left on queue.
+Progress(44) at 2024-11-07 03:49:07: 21,583,930,332 states generated (29,214,217 s/min), 1,703,965,186 distinct states found (1,771,553 ds/min), 185,645,122 states left on queue.
+Progress(44) at 2024-11-07 03:50:07: 21,612,870,304 states generated (28,939,972 s/min), 1,705,581,017 distinct states found (1,615,831 ds/min), 185,385,482 states left on queue.
+Progress(44) at 2024-11-07 03:51:07: 21,641,828,993 states generated (28,958,689 s/min), 1,707,209,695 distinct states found (1,628,678 ds/min), 185,147,878 states left on queue.
+Progress(44) at 2024-11-07 03:52:07: 21,670,879,227 states generated (29,050,234 s/min), 1,708,891,056 distinct states found (1,681,361 ds/min), 184,967,950 states left on queue.
+Progress(44) at 2024-11-07 03:53:07: 21,700,175,853 states generated (29,296,626 s/min), 1,710,442,845 distinct states found (1,551,789 ds/min), 184,628,950 states left on queue.
+Progress(44) at 2024-11-07 03:54:07: 21,729,661,920 states generated (29,486,067 s/min), 1,712,360,375 distinct states found (1,917,530 ds/min), 184,602,047 states left on queue.
+Progress(44) at 2024-11-07 03:55:07: 21,759,015,470 states generated (29,353,550 s/min), 1,714,259,170 distinct states found (1,898,795 ds/min), 184,554,564 states left on queue.
+Progress(44) at 2024-11-07 03:56:07: 21,788,534,088 states generated (29,518,618 s/min), 1,716,081,999 distinct states found (1,822,829 ds/min), 184,406,994 states left on queue.
+Progress(44) at 2024-11-07 03:57:07: 21,817,875,474 states generated (29,341,386 s/min), 1,717,634,611 distinct states found (1,552,612 ds/min), 184,057,660 states left on queue.
+Progress(44) at 2024-11-07 03:58:07: 21,847,006,510 states generated (29,131,036 s/min), 1,719,299,741 distinct states found (1,665,130 ds/min), 183,828,258 states left on queue.
+Progress(44) at 2024-11-07 03:59:07: 21,875,869,357 states generated (28,862,847 s/min), 1,720,801,722 distinct states found (1,501,981 ds/min), 183,443,083 states left on queue.
+Progress(44) at 2024-11-07 04:00:07: 21,904,922,732 states generated (29,053,375 s/min), 1,722,588,504 distinct states found (1,786,782 ds/min), 183,289,094 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-07 04:01:07)
+Progress(44) at 2024-11-07 04:01:07: 21,933,965,695 states generated (29,042,963 s/min), 1,724,285,279 distinct states found (1,696,775 ds/min), 183,029,310 states left on queue.
+Progress(44) at 2024-11-07 04:02:07: 21,962,959,341 states generated (28,993,646 s/min), 1,725,868,213 distinct states found (1,582,934 ds/min), 182,699,155 states left on queue.
+Progress(44) at 2024-11-07 04:03:07: 21,991,777,816 states generated (28,818,475 s/min), 1,727,519,032 distinct states found (1,650,819 ds/min), 182,433,701 states left on queue.
+Progress(44) at 2024-11-07 04:04:07: 22,020,733,433 states generated (28,955,617 s/min), 1,729,219,503 distinct states found (1,700,471 ds/min), 182,216,615 states left on queue.
+Progress(44) at 2024-11-07 04:05:07: 22,049,984,634 states generated (29,251,201 s/min), 1,730,967,606 distinct states found (1,748,103 ds/min), 182,140,987 states left on queue.
+Progress(44) at 2024-11-07 04:06:07: 22,079,112,674 states generated (29,128,040 s/min), 1,732,648,368 distinct states found (1,680,762 ds/min), 181,963,576 states left on queue.
+Progress(44) at 2024-11-07 04:07:07: 22,108,329,917 states generated (29,217,243 s/min), 1,734,711,160 distinct states found (2,062,792 ds/min), 182,074,434 states left on queue.
+Progress(44) at 2024-11-07 04:08:07: 22,137,402,322 states generated (29,072,405 s/min), 1,736,773,111 distinct states found (2,061,951 ds/min), 182,163,318 states left on queue.
+Progress(44) at 2024-11-07 04:09:07: 22,166,402,243 states generated (28,999,921 s/min), 1,738,573,615 distinct states found (1,800,504 ds/min), 182,034,194 states left on queue.
+Progress(44) at 2024-11-07 04:10:07: 22,195,545,763 states generated (29,143,520 s/min), 1,740,349,901 distinct states found (1,776,286 ds/min), 181,869,339 states left on queue.
+Progress(44) at 2024-11-07 04:11:07: 22,224,766,309 states generated (29,220,546 s/min), 1,742,110,577 distinct states found (1,760,676 ds/min), 181,671,885 states left on queue.
+Progress(44) at 2024-11-07 04:12:07: 22,253,807,692 states generated (29,041,383 s/min), 1,743,796,752 distinct states found (1,686,175 ds/min), 181,407,584 states left on queue.
+Progress(44) at 2024-11-07 04:13:07: 22,282,790,947 states generated (28,983,255 s/min), 1,745,617,175 distinct states found (1,820,423 ds/min), 181,265,096 states left on queue.
+Progress(44) at 2024-11-07 04:14:07: 22,311,840,917 states generated (29,049,970 s/min), 1,747,424,658 distinct states found (1,807,483 ds/min), 181,110,335 states left on queue.
+Progress(44) at 2024-11-07 04:15:07: 22,340,851,116 states generated (29,010,199 s/min), 1,749,204,899 distinct states found (1,780,241 ds/min), 180,933,264 states left on queue.
+Progress(44) at 2024-11-07 04:16:07: 22,369,820,191 states generated (28,969,075 s/min), 1,751,058,290 distinct states found (1,853,391 ds/min), 180,819,450 states left on queue.
+Progress(44) at 2024-11-07 04:17:07: 22,398,637,854 states generated (28,817,663 s/min), 1,752,838,012 distinct states found (1,779,722 ds/min), 180,641,066 states left on queue.
+Progress(44) at 2024-11-07 04:18:07: 22,427,736,775 states generated (29,098,921 s/min), 1,754,678,716 distinct states found (1,840,704 ds/min), 180,523,907 states left on queue.
+Progress(44) at 2024-11-07 04:19:07: 22,456,749,604 states generated (29,012,829 s/min), 1,756,653,204 distinct states found (1,974,488 ds/min), 180,502,441 states left on queue.
+Progress(44) at 2024-11-07 04:20:07: 22,485,995,309 states generated (29,245,705 s/min), 1,758,406,219 distinct states found (1,753,015 ds/min), 180,303,710 states left on queue.
+Progress(44) at 2024-11-07 04:21:07: 22,515,059,607 states generated (29,064,298 s/min), 1,760,239,858 distinct states found (1,833,639 ds/min), 180,203,277 states left on queue.
+Progress(44) at 2024-11-07 04:22:07: 22,544,007,885 states generated (28,948,278 s/min), 1,761,871,023 distinct states found (1,631,165 ds/min), 179,919,396 states left on queue.
+Progress(44) at 2024-11-07 04:23:07: 22,572,858,704 states generated (28,850,819 s/min), 1,763,420,170 distinct states found (1,549,147 ds/min), 179,579,696 states left on queue.
+Progress(44) at 2024-11-07 04:24:07: 22,601,850,297 states generated (28,991,593 s/min), 1,765,118,103 distinct states found (1,697,933 ds/min), 179,386,571 states left on queue.
+Progress(44) at 2024-11-07 04:25:07: 22,630,832,111 states generated (28,981,814 s/min), 1,766,934,802 distinct states found (1,816,699 ds/min), 179,271,264 states left on queue.
+Progress(44) at 2024-11-07 04:26:07: 22,659,674,047 states generated (28,841,936 s/min), 1,768,697,425 distinct states found (1,762,623 ds/min), 179,093,059 states left on queue.
+Progress(44) at 2024-11-07 04:27:07: 22,688,427,580 states generated (28,753,533 s/min), 1,770,450,184 distinct states found (1,752,759 ds/min), 178,899,489 states left on queue.
+Progress(44) at 2024-11-07 04:28:07: 22,717,189,869 states generated (28,762,289 s/min), 1,772,256,239 distinct states found (1,806,055 ds/min), 178,731,640 states left on queue.
+Progress(44) at 2024-11-07 04:29:07: 22,746,022,343 states generated (28,832,474 s/min), 1,774,044,050 distinct states found (1,787,811 ds/min), 178,570,129 states left on queue.
+Progress(44) at 2024-11-07 04:30:07: 22,774,887,995 states generated (28,865,652 s/min), 1,775,840,059 distinct states found (1,796,009 ds/min), 178,368,886 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-07 04:31:08)
+Progress(44) at 2024-11-07 04:31:08: 22,803,877,345 states generated (28,989,350 s/min), 1,777,539,486 distinct states found (1,699,427 ds/min), 178,036,764 states left on queue.
+Progress(44) at 2024-11-07 04:32:08: 22,832,344,161 states generated (28,466,816 s/min), 1,779,071,939 distinct states found (1,532,453 ds/min), 177,609,496 states left on queue.
+Progress(44) at 2024-11-07 04:33:08: 22,860,965,708 states generated (28,621,547 s/min), 1,780,632,191 distinct states found (1,560,252 ds/min), 177,226,645 states left on queue.
+Progress(44) at 2024-11-07 04:34:08: 22,890,116,212 states generated (29,150,504 s/min), 1,782,192,671 distinct states found (1,560,480 ds/min), 176,856,967 states left on queue.
+Progress(44) at 2024-11-07 04:35:08: 22,919,394,798 states generated (29,278,586 s/min), 1,783,989,997 distinct states found (1,797,326 ds/min), 176,677,020 states left on queue.
+Progress(44) at 2024-11-07 04:36:08: 22,948,717,272 states generated (29,322,474 s/min), 1,785,769,628 distinct states found (1,779,631 ds/min), 176,466,304 states left on queue.
+Progress(44) at 2024-11-07 04:37:08: 22,978,008,874 states generated (29,291,602 s/min), 1,787,768,546 distinct states found (1,998,918 ds/min), 176,439,057 states left on queue.
+Progress(45) at 2024-11-07 04:38:08: 23,007,259,342 states generated (29,250,468 s/min), 1,789,652,180 distinct states found (1,883,634 ds/min), 176,335,868 states left on queue.
+Progress(45) at 2024-11-07 04:39:08: 23,036,414,234 states generated (29,154,892 s/min), 1,791,293,395 distinct states found (1,641,215 ds/min), 176,048,834 states left on queue.
+Progress(45) at 2024-11-07 04:40:08: 23,065,467,218 states generated (29,052,984 s/min), 1,793,176,180 distinct states found (1,882,785 ds/min), 175,945,068 states left on queue.
+Progress(45) at 2024-11-07 04:41:08: 23,094,601,413 states generated (29,134,195 s/min), 1,795,085,201 distinct states found (1,909,021 ds/min), 175,844,471 states left on queue.
+Progress(45) at 2024-11-07 04:42:08: 23,123,835,299 states generated (29,233,886 s/min), 1,796,998,629 distinct states found (1,913,428 ds/min), 175,751,026 states left on queue.
+Progress(45) at 2024-11-07 04:43:08: 23,153,014,383 states generated (29,179,084 s/min), 1,798,830,917 distinct states found (1,832,288 ds/min), 175,609,899 states left on queue.
+Progress(45) at 2024-11-07 04:44:08: 23,181,848,791 states generated (28,834,408 s/min), 1,800,688,969 distinct states found (1,858,052 ds/min), 175,482,089 states left on queue.
+Progress(45) at 2024-11-07 04:45:08: 23,210,960,242 states generated (29,111,451 s/min), 1,802,681,838 distinct states found (1,992,869 ds/min), 175,468,259 states left on queue.
+Progress(45) at 2024-11-07 04:46:08: 23,239,931,898 states generated (28,971,656 s/min), 1,804,527,297 distinct states found (1,845,459 ds/min), 175,314,676 states left on queue.
+Progress(45) at 2024-11-07 04:47:08: 23,269,110,236 states generated (29,178,338 s/min), 1,806,324,412 distinct states found (1,797,115 ds/min), 175,104,294 states left on queue.
+Progress(45) at 2024-11-07 04:48:08: 23,298,261,893 states generated (29,151,657 s/min), 1,808,026,372 distinct states found (1,701,960 ds/min), 174,789,761 states left on queue.
+Progress(45) at 2024-11-07 04:49:08: 23,327,194,301 states generated (28,932,408 s/min), 1,809,635,143 distinct states found (1,608,771 ds/min), 174,475,327 states left on queue.
+Progress(45) at 2024-11-07 04:50:08: 23,356,033,807 states generated (28,839,506 s/min), 1,811,533,685 distinct states found (1,898,542 ds/min), 174,375,697 states left on queue.
+Progress(45) at 2024-11-07 04:51:08: 23,384,783,950 states generated (28,750,143 s/min), 1,813,242,773 distinct states found (1,709,088 ds/min), 174,093,638 states left on queue.
+Progress(45) at 2024-11-07 04:52:08: 23,413,868,078 states generated (29,084,128 s/min), 1,814,921,217 distinct states found (1,678,444 ds/min), 173,816,375 states left on queue.
+Progress(45) at 2024-11-07 04:53:08: 23,443,072,326 states generated (29,204,248 s/min), 1,816,887,463 distinct states found (1,966,246 ds/min), 173,768,064 states left on queue.
+Progress(45) at 2024-11-07 04:54:08: 23,472,531,302 states generated (29,458,976 s/min), 1,818,893,389 distinct states found (2,005,926 ds/min), 173,736,986 states left on queue.
+Progress(45) at 2024-11-07 04:55:08: 23,501,670,169 states generated (29,138,867 s/min), 1,820,467,013 distinct states found (1,573,624 ds/min), 173,393,980 states left on queue.
+Progress(45) at 2024-11-07 04:56:08: 23,530,619,816 states generated (28,949,647 s/min), 1,822,153,389 distinct states found (1,686,376 ds/min), 173,102,476 states left on queue.
+Progress(45) at 2024-11-07 04:57:08: 23,559,730,839 states generated (29,111,023 s/min), 1,824,067,840 distinct states found (1,914,451 ds/min), 173,045,910 states left on queue.
+Progress(45) at 2024-11-07 04:58:08: 23,588,956,543 states generated (29,225,704 s/min), 1,826,128,132 distinct states found (2,060,292 ds/min), 173,097,456 states left on queue.
+Progress(45) at 2024-11-07 04:59:08: 23,617,943,385 states generated (28,986,842 s/min), 1,828,156,857 distinct states found (2,028,725 ds/min), 173,115,797 states left on queue.
+Progress(45) at 2024-11-07 05:00:08: 23,647,052,247 states generated (29,108,862 s/min), 1,830,116,296 distinct states found (1,959,439 ds/min), 173,061,677 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-07 05:01:09)
+Progress(45) at 2024-11-07 05:01:09: 23,676,540,644 states generated (29,488,397 s/min), 1,832,081,172 distinct states found (1,964,876 ds/min), 173,019,523 states left on queue.
+Progress(45) at 2024-11-07 05:02:09: 23,705,447,239 states generated (28,906,595 s/min), 1,834,157,962 distinct states found (2,076,790 ds/min), 173,069,444 states left on queue.
+Progress(45) at 2024-11-07 05:03:09: 23,734,590,381 states generated (29,143,142 s/min), 1,836,148,599 distinct states found (1,990,637 ds/min), 173,037,041 states left on queue.
+Progress(45) at 2024-11-07 05:04:09: 23,763,605,229 states generated (29,014,848 s/min), 1,838,302,051 distinct states found (2,153,452 ds/min), 173,135,339 states left on queue.
+Progress(45) at 2024-11-07 05:05:09: 23,792,794,847 states generated (29,189,618 s/min), 1,840,318,078 distinct states found (2,016,027 ds/min), 173,064,676 states left on queue.
+Progress(45) at 2024-11-07 05:06:09: 23,821,711,411 states generated (28,916,564 s/min), 1,842,248,819 distinct states found (1,930,741 ds/min), 172,938,116 states left on queue.
+Progress(45) at 2024-11-07 05:07:09: 23,850,829,522 states generated (29,118,111 s/min), 1,844,084,520 distinct states found (1,835,701 ds/min), 172,779,569 states left on queue.
+Progress(45) at 2024-11-07 05:08:09: 23,880,027,055 states generated (29,197,533 s/min), 1,846,207,907 distinct states found (2,123,387 ds/min), 172,876,875 states left on queue.
+Progress(45) at 2024-11-07 05:09:09: 23,909,238,654 states generated (29,211,599 s/min), 1,848,275,162 distinct states found (2,067,255 ds/min), 172,917,710 states left on queue.
+Progress(45) at 2024-11-07 05:10:09: 23,938,254,527 states generated (29,015,873 s/min), 1,850,062,508 distinct states found (1,787,346 ds/min), 172,709,939 states left on queue.
+Progress(45) at 2024-11-07 05:11:09: 23,967,280,908 states generated (29,026,381 s/min), 1,851,840,844 distinct states found (1,778,336 ds/min), 172,472,809 states left on queue.
+Progress(45) at 2024-11-07 05:12:09: 23,996,137,153 states generated (28,856,245 s/min), 1,853,907,711 distinct states found (2,066,867 ds/min), 172,514,422 states left on queue.
+Progress(45) at 2024-11-07 05:13:09: 24,025,003,271 states generated (28,866,118 s/min), 1,855,881,596 distinct states found (1,973,885 ds/min), 172,410,581 states left on queue.
+Progress(45) at 2024-11-07 05:14:09: 24,053,998,968 states generated (28,995,697 s/min), 1,857,730,142 distinct states found (1,848,546 ds/min), 172,259,468 states left on queue.
+Progress(45) at 2024-11-07 05:15:09: 24,082,780,775 states generated (28,781,807 s/min), 1,859,612,879 distinct states found (1,882,737 ds/min), 172,097,889 states left on queue.
+Progress(45) at 2024-11-07 05:16:09: 24,111,843,462 states generated (29,062,687 s/min), 1,861,479,353 distinct states found (1,866,474 ds/min), 171,938,834 states left on queue.
+Progress(45) at 2024-11-07 05:17:09: 24,140,987,153 states generated (29,143,691 s/min), 1,863,390,493 distinct states found (1,911,140 ds/min), 171,786,752 states left on queue.
+Progress(45) at 2024-11-07 05:18:09: 24,170,023,897 states generated (29,036,744 s/min), 1,864,965,603 distinct states found (1,575,110 ds/min), 171,386,848 states left on queue.
+Progress(45) at 2024-11-07 05:19:09: 24,198,987,772 states generated (28,963,875 s/min), 1,866,820,638 distinct states found (1,855,035 ds/min), 171,238,575 states left on queue.
+Progress(45) at 2024-11-07 05:20:09: 24,227,820,740 states generated (28,832,968 s/min), 1,868,623,853 distinct states found (1,803,215 ds/min), 171,005,974 states left on queue.
+Progress(45) at 2024-11-07 05:21:09: 24,256,712,636 states generated (28,891,896 s/min), 1,870,265,139 distinct states found (1,641,286 ds/min), 170,619,838 states left on queue.
+Progress(45) at 2024-11-07 05:22:09: 24,285,792,587 states generated (29,079,951 s/min), 1,871,770,548 distinct states found (1,505,409 ds/min), 170,247,019 states left on queue.
+Progress(45) at 2024-11-07 05:23:09: 24,315,021,618 states generated (29,229,031 s/min), 1,873,433,426 distinct states found (1,662,878 ds/min), 169,986,497 states left on queue.
+Progress(45) at 2024-11-07 05:24:09: 24,343,972,976 states generated (28,951,358 s/min), 1,874,958,509 distinct states found (1,525,083 ds/min), 169,639,357 states left on queue.
+Progress(45) at 2024-11-07 05:25:09: 24,372,818,044 states generated (28,845,068 s/min), 1,876,461,909 distinct states found (1,503,400 ds/min), 169,298,313 states left on queue.
+Progress(45) at 2024-11-07 05:26:09: 24,401,879,839 states generated (29,061,795 s/min), 1,878,043,093 distinct states found (1,581,184 ds/min), 169,034,999 states left on queue.
+Progress(45) at 2024-11-07 05:27:09: 24,431,117,440 states generated (29,237,601 s/min), 1,879,528,913 distinct states found (1,485,820 ds/min), 168,669,766 states left on queue.
+Progress(45) at 2024-11-07 05:28:09: 24,460,565,564 states generated (29,448,124 s/min), 1,881,382,841 distinct states found (1,853,928 ds/min), 168,585,549 states left on queue.
+Progress(45) at 2024-11-07 05:29:09: 24,489,842,320 states generated (29,276,756 s/min), 1,883,163,526 distinct states found (1,780,685 ds/min), 168,440,866 states left on queue.
+Progress(45) at 2024-11-07 05:30:09: 24,519,309,785 states generated (29,467,465 s/min), 1,884,840,978 distinct states found (1,677,452 ds/min), 168,176,100 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-07 05:31:09)
+Progress(45) at 2024-11-07 05:31:09: 24,548,699,426 states generated (29,389,641 s/min), 1,886,346,733 distinct states found (1,505,755 ds/min), 167,794,030 states left on queue.
+Progress(45) at 2024-11-07 05:32:09: 24,577,454,860 states generated (28,755,434 s/min), 1,887,761,288 distinct states found (1,414,555 ds/min), 167,342,409 states left on queue.
+Progress(45) at 2024-11-07 05:33:09: 24,606,401,929 states generated (28,947,069 s/min), 1,889,451,503 distinct states found (1,690,215 ds/min), 167,115,718 states left on queue.
+Progress(45) at 2024-11-07 05:34:09: 24,635,080,181 states generated (28,678,252 s/min), 1,891,013,080 distinct states found (1,561,577 ds/min), 166,760,395 states left on queue.
+Progress(45) at 2024-11-07 05:35:09: 24,663,912,233 states generated (28,832,052 s/min), 1,892,486,967 distinct states found (1,473,887 ds/min), 166,347,547 states left on queue.
+Progress(45) at 2024-11-07 05:36:09: 24,692,601,003 states generated (28,688,770 s/min), 1,894,014,661 distinct states found (1,527,694 ds/min), 165,980,327 states left on queue.
+Progress(45) at 2024-11-07 05:37:09: 24,721,596,280 states generated (28,995,277 s/min), 1,895,667,269 distinct states found (1,652,608 ds/min), 165,766,132 states left on queue.
+Progress(45) at 2024-11-07 05:38:09: 24,750,737,270 states generated (29,140,990 s/min), 1,897,304,588 distinct states found (1,637,319 ds/min), 165,602,331 states left on queue.
+Progress(45) at 2024-11-07 05:39:09: 24,779,762,621 states generated (29,025,351 s/min), 1,898,944,557 distinct states found (1,639,969 ds/min), 165,399,097 states left on queue.
+Progress(45) at 2024-11-07 05:40:09: 24,808,890,636 states generated (29,128,015 s/min), 1,901,039,200 distinct states found (2,094,643 ds/min), 165,505,866 states left on queue.
+Progress(45) at 2024-11-07 05:41:09: 24,837,834,330 states generated (28,943,694 s/min), 1,902,825,947 distinct states found (1,786,747 ds/min), 165,385,690 states left on queue.
+Progress(45) at 2024-11-07 05:42:09: 24,866,749,194 states generated (28,914,864 s/min), 1,904,509,048 distinct states found (1,683,101 ds/min), 165,143,394 states left on queue.
+Progress(45) at 2024-11-07 05:43:09: 24,895,891,462 states generated (29,142,268 s/min), 1,906,186,633 distinct states found (1,677,585 ds/min), 164,907,199 states left on queue.
+Progress(45) at 2024-11-07 05:44:09: 24,924,929,592 states generated (29,038,130 s/min), 1,907,774,010 distinct states found (1,587,377 ds/min), 164,567,256 states left on queue.
+Progress(45) at 2024-11-07 05:45:09: 24,953,854,731 states generated (28,925,139 s/min), 1,909,438,393 distinct states found (1,664,383 ds/min), 164,297,435 states left on queue.
+Progress(45) at 2024-11-07 05:46:09: 24,982,773,173 states generated (28,918,442 s/min), 1,911,115,370 distinct states found (1,676,977 ds/min), 164,029,981 states left on queue.
+Progress(45) at 2024-11-07 05:47:09: 25,011,681,639 states generated (28,908,466 s/min), 1,912,739,102 distinct states found (1,623,732 ds/min), 163,722,709 states left on queue.
+Progress(45) at 2024-11-07 05:48:09: 25,040,624,886 states generated (28,943,247 s/min), 1,914,465,220 distinct states found (1,726,118 ds/min), 163,504,979 states left on queue.
+Progress(45) at 2024-11-07 05:49:09: 25,069,369,631 states generated (28,744,745 s/min), 1,916,123,524 distinct states found (1,658,304 ds/min), 163,227,016 states left on queue.
+Progress(45) at 2024-11-07 05:50:09: 25,098,381,973 states generated (29,012,342 s/min), 1,917,856,454 distinct states found (1,732,930 ds/min), 163,020,213 states left on queue.
+Progress(45) at 2024-11-07 05:51:09: 25,127,432,010 states generated (29,050,037 s/min), 1,919,715,623 distinct states found (1,859,169 ds/min), 162,903,211 states left on queue.
+Progress(45) at 2024-11-07 05:52:09: 25,156,554,852 states generated (29,122,842 s/min), 1,921,381,482 distinct states found (1,665,859 ds/min), 162,640,342 states left on queue.
+Progress(45) at 2024-11-07 05:53:09: 25,185,439,752 states generated (28,884,900 s/min), 1,923,074,493 distinct states found (1,693,011 ds/min), 162,418,419 states left on queue.
+Progress(45) at 2024-11-07 05:54:09: 25,214,250,620 states generated (28,810,868 s/min), 1,924,599,166 distinct states found (1,524,673 ds/min), 162,035,736 states left on queue.
+Progress(45) at 2024-11-07 05:55:09: 25,243,065,684 states generated (28,815,064 s/min), 1,926,028,590 distinct states found (1,429,424 ds/min), 161,647,928 states left on queue.
+Progress(45) at 2024-11-07 05:56:09: 25,272,074,106 states generated (29,008,422 s/min), 1,927,788,924 distinct states found (1,760,334 ds/min), 161,469,066 states left on queue.
+Progress(45) at 2024-11-07 05:57:09: 25,300,916,527 states generated (28,842,421 s/min), 1,929,427,503 distinct states found (1,638,579 ds/min), 161,203,063 states left on queue.
+Progress(45) at 2024-11-07 05:58:09: 25,329,617,957 states generated (28,701,430 s/min), 1,931,016,200 distinct states found (1,588,697 ds/min), 160,883,828 states left on queue.
+Progress(45) at 2024-11-07 05:59:09: 25,358,305,874 states generated (28,687,917 s/min), 1,932,700,683 distinct states found (1,684,483 ds/min), 160,613,534 states left on queue.
+Progress(45) at 2024-11-07 06:00:09: 25,387,060,807 states generated (28,754,933 s/min), 1,934,352,908 distinct states found (1,652,225 ds/min), 160,340,594 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-07 06:01:10)
+Progress(45) at 2024-11-07 06:01:10: 25,416,167,383 states generated (29,106,576 s/min), 1,936,031,185 distinct states found (1,678,277 ds/min), 160,024,096 states left on queue.
+Progress(45) at 2024-11-07 06:02:10: 25,444,775,068 states generated (28,607,685 s/min), 1,937,531,864 distinct states found (1,500,679 ds/min), 159,558,759 states left on queue.
+Progress(45) at 2024-11-07 06:03:10: 25,473,218,014 states generated (28,442,946 s/min), 1,938,932,593 distinct states found (1,400,729 ds/min), 159,031,186 states left on queue.
+Progress(45) at 2024-11-07 06:04:10: 25,502,153,601 states generated (28,935,587 s/min), 1,940,366,906 distinct states found (1,434,313 ds/min), 158,550,067 states left on queue.
+Progress(45) at 2024-11-07 06:05:10: 25,531,409,924 states generated (29,256,323 s/min), 1,942,031,081 distinct states found (1,664,175 ds/min), 158,260,393 states left on queue.
+Progress(45) at 2024-11-07 06:06:10: 25,560,798,500 states generated (29,388,576 s/min), 1,943,755,697 distinct states found (1,724,616 ds/min), 158,001,838 states left on queue.
+Progress(45) at 2024-11-07 06:07:10: 25,590,101,236 states generated (29,302,736 s/min), 1,945,659,191 distinct states found (1,903,494 ds/min), 157,894,541 states left on queue.
+Progress(45) at 2024-11-07 06:08:10: 25,619,347,006 states generated (29,245,770 s/min), 1,947,436,584 distinct states found (1,777,393 ds/min), 157,703,839 states left on queue.
+Progress(45) at 2024-11-07 06:09:10: 25,648,466,795 states generated (29,119,789 s/min), 1,949,039,117 distinct states found (1,602,533 ds/min), 157,391,298 states left on queue.
+Progress(45) at 2024-11-07 06:10:10: 25,677,360,883 states generated (28,894,088 s/min), 1,950,787,656 distinct states found (1,748,539 ds/min), 157,176,854 states left on queue.
+Progress(45) at 2024-11-07 06:11:10: 25,706,625,655 states generated (29,264,772 s/min), 1,952,700,166 distinct states found (1,912,510 ds/min), 157,069,408 states left on queue.
+Progress(46) at 2024-11-07 06:12:10: 25,735,830,172 states generated (29,204,517 s/min), 1,954,444,069 distinct states found (1,743,903 ds/min), 156,852,227 states left on queue.
+Progress(46) at 2024-11-07 06:13:10: 25,764,811,792 states generated (28,981,620 s/min), 1,956,165,433 distinct states found (1,721,364 ds/min), 156,618,900 states left on queue.
+Progress(46) at 2024-11-07 06:14:10: 25,793,740,486 states generated (28,928,694 s/min), 1,957,961,862 distinct states found (1,796,429 ds/min), 156,441,787 states left on queue.
+Progress(46) at 2024-11-07 06:15:10: 25,822,741,831 states generated (29,001,345 s/min), 1,959,749,416 distinct states found (1,787,554 ds/min), 156,253,838 states left on queue.
+Progress(46) at 2024-11-07 06:16:10: 25,851,804,688 states generated (29,062,857 s/min), 1,961,466,422 distinct states found (1,717,006 ds/min), 155,977,351 states left on queue.
+Progress(46) at 2024-11-07 06:17:10: 25,880,868,584 states generated (29,063,896 s/min), 1,963,090,742 distinct states found (1,624,320 ds/min), 155,628,145 states left on queue.
+Progress(46) at 2024-11-07 06:18:10: 25,909,824,307 states generated (28,955,723 s/min), 1,964,570,100 distinct states found (1,479,358 ds/min), 155,182,107 states left on queue.
+Progress(46) at 2024-11-07 06:19:10: 25,938,584,425 states generated (28,760,118 s/min), 1,966,303,642 distinct states found (1,733,542 ds/min), 154,946,766 states left on queue.
+Progress(46) at 2024-11-07 06:20:10: 25,967,304,223 states generated (28,719,798 s/min), 1,967,883,207 distinct states found (1,579,565 ds/min), 154,558,935 states left on queue.
+Progress(46) at 2024-11-07 06:21:10: 25,996,402,469 states generated (29,098,246 s/min), 1,969,591,000 distinct states found (1,707,793 ds/min), 154,302,069 states left on queue.
+Progress(46) at 2024-11-07 06:22:10: 26,025,623,943 states generated (29,221,474 s/min), 1,971,434,403 distinct states found (1,843,403 ds/min), 154,157,059 states left on queue.
+Progress(46) at 2024-11-07 06:23:10: 26,055,038,054 states generated (29,414,111 s/min), 1,973,261,720 distinct states found (1,827,317 ds/min), 153,981,317 states left on queue.
+Progress(46) at 2024-11-07 06:24:10: 26,083,986,220 states generated (28,948,166 s/min), 1,974,670,648 distinct states found (1,408,928 ds/min), 153,508,388 states left on queue.
+Progress(46) at 2024-11-07 06:25:10: 26,113,067,907 states generated (29,081,687 s/min), 1,976,391,547 distinct states found (1,720,899 ds/min), 153,263,845 states left on queue.
+Progress(46) at 2024-11-07 06:26:10: 26,142,186,839 states generated (29,118,932 s/min), 1,978,379,881 distinct states found (1,988,334 ds/min), 153,253,200 states left on queue.
+Progress(46) at 2024-11-07 06:27:10: 26,171,338,068 states generated (29,151,229 s/min), 1,980,293,569 distinct states found (1,913,688 ds/min), 153,185,559 states left on queue.
+Progress(46) at 2024-11-07 06:28:10: 26,200,319,869 states generated (28,981,801 s/min), 1,982,130,034 distinct states found (1,836,465 ds/min), 153,039,826 states left on queue.
+Progress(46) at 2024-11-07 06:29:10: 26,229,451,237 states generated (29,131,368 s/min), 1,984,117,981 distinct states found (1,987,947 ds/min), 153,030,792 states left on queue.
+Progress(46) at 2024-11-07 06:30:10: 26,258,476,767 states generated (29,025,530 s/min), 1,985,981,073 distinct states found (1,863,092 ds/min), 152,917,939 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-07 06:31:11)
+Progress(46) at 2024-11-07 06:31:11: 26,287,657,848 states generated (29,181,081 s/min), 1,987,875,178 distinct states found (1,894,105 ds/min), 152,784,901 states left on queue.
+Progress(46) at 2024-11-07 06:32:11: 26,316,549,803 states generated (28,891,955 s/min), 1,989,821,141 distinct states found (1,945,963 ds/min), 152,728,813 states left on queue.
+Progress(46) at 2024-11-07 06:33:11: 26,345,570,902 states generated (29,021,099 s/min), 1,991,762,973 distinct states found (1,941,832 ds/min), 152,648,662 states left on queue.
+Progress(46) at 2024-11-07 06:34:11: 26,374,519,051 states generated (28,948,149 s/min), 1,993,605,958 distinct states found (1,842,985 ds/min), 152,446,201 states left on queue.
+Progress(46) at 2024-11-07 06:35:11: 26,403,403,284 states generated (28,884,233 s/min), 1,995,379,328 distinct states found (1,773,370 ds/min), 152,189,032 states left on queue.
+Progress(46) at 2024-11-07 06:36:11: 26,432,512,518 states generated (29,109,234 s/min), 1,997,205,848 distinct states found (1,826,520 ds/min), 152,060,823 states left on queue.
+Progress(46) at 2024-11-07 06:37:11: 26,461,635,963 states generated (29,123,445 s/min), 1,999,221,288 distinct states found (2,015,440 ds/min), 152,052,317 states left on queue.
+Progress(46) at 2024-11-07 06:38:11: 26,490,692,408 states generated (29,056,445 s/min), 2,001,003,940 distinct states found (1,782,652 ds/min), 151,869,333 states left on queue.
+Progress(46) at 2024-11-07 06:39:11: 26,519,611,691 states generated (28,919,283 s/min), 2,002,772,264 distinct states found (1,768,324 ds/min), 151,637,576 states left on queue.
+Progress(46) at 2024-11-07 06:40:11: 26,548,405,773 states generated (28,794,082 s/min), 2,004,530,832 distinct states found (1,758,568 ds/min), 151,415,653 states left on queue.
+Progress(46) at 2024-11-07 06:41:11: 26,577,168,173 states generated (28,762,400 s/min), 2,006,431,383 distinct states found (1,900,551 ds/min), 151,293,655 states left on queue.
+Progress(46) at 2024-11-07 06:42:11: 26,606,013,565 states generated (28,845,392 s/min), 2,008,118,930 distinct states found (1,687,547 ds/min), 150,979,607 states left on queue.
+Progress(46) at 2024-11-07 06:43:11: 26,634,840,454 states generated (28,826,889 s/min), 2,010,033,233 distinct states found (1,914,303 ds/min), 150,859,233 states left on queue.
+Progress(46) at 2024-11-07 06:44:11: 26,663,791,564 states generated (28,951,110 s/min), 2,011,764,506 distinct states found (1,731,273 ds/min), 150,592,176 states left on queue.
+Progress(46) at 2024-11-07 06:45:11: 26,692,845,560 states generated (29,053,996 s/min), 2,013,541,948 distinct states found (1,777,442 ds/min), 150,346,125 states left on queue.
+Progress(46) at 2024-11-07 06:46:11: 26,721,838,462 states generated (28,992,902 s/min), 2,015,055,311 distinct states found (1,513,363 ds/min), 149,898,025 states left on queue.
+Progress(46) at 2024-11-07 06:47:11: 26,750,784,724 states generated (28,946,262 s/min), 2,016,795,791 distinct states found (1,740,480 ds/min), 149,636,143 states left on queue.
+Progress(46) at 2024-11-07 06:48:11: 26,779,537,729 states generated (28,753,005 s/min), 2,018,420,817 distinct states found (1,625,026 ds/min), 149,264,338 states left on queue.
+Progress(46) at 2024-11-07 06:49:11: 26,808,414,064 states generated (28,876,335 s/min), 2,019,941,133 distinct states found (1,520,316 ds/min), 148,833,851 states left on queue.
+Progress(46) at 2024-11-07 06:50:11: 26,837,552,895 states generated (29,138,831 s/min), 2,021,402,334 distinct states found (1,461,201 ds/min), 148,423,082 states left on queue.
+Progress(46) at 2024-11-07 06:51:11: 26,866,488,521 states generated (28,935,626 s/min), 2,022,896,299 distinct states found (1,493,965 ds/min), 148,037,640 states left on queue.
+Progress(46) at 2024-11-07 06:52:11: 26,895,259,654 states generated (28,771,133 s/min), 2,024,306,180 distinct states found (1,409,881 ds/min), 147,623,626 states left on queue.
+Progress(46) at 2024-11-07 06:53:11: 26,924,324,639 states generated (29,064,985 s/min), 2,025,751,691 distinct states found (1,445,511 ds/min), 147,237,191 states left on queue.
+Progress(46) at 2024-11-07 06:54:11: 26,953,575,306 states generated (29,250,667 s/min), 2,027,292,041 distinct states found (1,540,350 ds/min), 146,929,253 states left on queue.
+Progress(46) at 2024-11-07 06:55:11: 26,982,863,734 states generated (29,288,428 s/min), 2,029,056,116 distinct states found (1,764,075 ds/min), 146,774,179 states left on queue.
+Progress(46) at 2024-11-07 06:56:11: 27,012,217,899 states generated (29,354,165 s/min), 2,030,705,091 distinct states found (1,648,975 ds/min), 146,523,776 states left on queue.
+Progress(46) at 2024-11-07 06:57:11: 27,041,431,406 states generated (29,213,507 s/min), 2,032,122,917 distinct states found (1,417,826 ds/min), 146,066,712 states left on queue.
+Progress(46) at 2024-11-07 06:58:11: 27,070,230,233 states generated (28,798,827 s/min), 2,033,502,867 distinct states found (1,379,950 ds/min), 145,580,465 states left on queue.
+Progress(46) at 2024-11-07 06:59:11: 27,099,119,410 states generated (28,889,177 s/min), 2,035,080,295 distinct states found (1,577,428 ds/min), 145,255,429 states left on queue.
+Progress(46) at 2024-11-07 07:00:11: 27,127,802,546 states generated (28,683,136 s/min), 2,036,480,069 distinct states found (1,399,774 ds/min), 144,763,326 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-07 07:01:11)
+Progress(46) at 2024-11-07 07:01:11: 27,156,729,000 states generated (28,926,454 s/min), 2,037,888,171 distinct states found (1,408,102 ds/min), 144,282,188 states left on queue.
+Progress(46) at 2024-11-07 07:02:11: 27,185,673,878 states generated (28,944,878 s/min), 2,039,404,499 distinct states found (1,516,328 ds/min), 143,933,899 states left on queue.
+Progress(46) at 2024-11-07 07:03:11: 27,214,800,380 states generated (29,126,502 s/min), 2,040,991,907 distinct states found (1,587,408 ds/min), 143,736,528 states left on queue.
+Progress(46) at 2024-11-07 07:04:11: 27,243,805,336 states generated (29,004,956 s/min), 2,042,560,493 distinct states found (1,568,586 ds/min), 143,474,607 states left on queue.
+Progress(46) at 2024-11-07 07:05:11: 27,272,912,902 states generated (29,107,566 s/min), 2,044,549,687 distinct states found (1,989,194 ds/min), 143,501,934 states left on queue.
+Progress(46) at 2024-11-07 07:06:11: 27,301,850,628 states generated (28,937,726 s/min), 2,046,213,816 distinct states found (1,664,129 ds/min), 143,280,971 states left on queue.
+Progress(46) at 2024-11-07 07:07:11: 27,330,744,799 states generated (28,894,171 s/min), 2,047,777,121 distinct states found (1,563,305 ds/min), 142,943,602 states left on queue.
+Progress(46) at 2024-11-07 07:08:11: 27,359,855,477 states generated (29,110,678 s/min), 2,049,356,015 distinct states found (1,578,894 ds/min), 142,631,188 states left on queue.
+Progress(46) at 2024-11-07 07:09:11: 27,388,745,464 states generated (28,889,987 s/min), 2,050,822,496 distinct states found (1,466,481 ds/min), 142,190,439 states left on queue.
+Progress(46) at 2024-11-07 07:10:11: 27,417,576,550 states generated (28,831,086 s/min), 2,052,379,523 distinct states found (1,557,027 ds/min), 141,821,153 states left on queue.
+Progress(46) at 2024-11-07 07:11:11: 27,446,546,405 states generated (28,969,855 s/min), 2,053,934,499 distinct states found (1,554,976 ds/min), 141,462,097 states left on queue.
+Progress(46) at 2024-11-07 07:12:11: 27,475,398,683 states generated (28,852,278 s/min), 2,055,510,649 distinct states found (1,576,150 ds/min), 141,116,110 states left on queue.
+Progress(46) at 2024-11-07 07:13:11: 27,504,113,194 states generated (28,714,511 s/min), 2,057,051,677 distinct states found (1,541,028 ds/min), 140,743,906 states left on queue.
+Progress(46) at 2024-11-07 07:14:11: 27,532,983,174 states generated (28,869,980 s/min), 2,058,669,649 distinct states found (1,617,972 ds/min), 140,436,853 states left on queue.
+Progress(46) at 2024-11-07 07:15:11: 27,562,088,285 states generated (29,105,111 s/min), 2,060,404,146 distinct states found (1,734,497 ds/min), 140,213,296 states left on queue.
+Progress(46) at 2024-11-07 07:16:11: 27,591,079,273 states generated (28,990,988 s/min), 2,061,979,907 distinct states found (1,575,761 ds/min), 139,895,056 states left on queue.
+Progress(46) at 2024-11-07 07:17:11: 27,619,876,413 states generated (28,797,140 s/min), 2,063,482,225 distinct states found (1,502,318 ds/min), 139,506,174 states left on queue.
+Progress(46) at 2024-11-07 07:18:11: 27,648,595,649 states generated (28,719,236 s/min), 2,064,847,355 distinct states found (1,365,130 ds/min), 139,035,783 states left on queue.
+Progress(46) at 2024-11-07 07:19:11: 27,677,544,192 states generated (28,948,543 s/min), 2,066,507,355 distinct states found (1,660,000 ds/min), 138,783,592 states left on queue.
+Progress(46) at 2024-11-07 07:20:11: 27,706,306,461 states generated (28,762,269 s/min), 2,068,019,192 distinct states found (1,511,837 ds/min), 138,418,256 states left on queue.
+Progress(46) at 2024-11-07 07:21:11: 27,734,873,733 states generated (28,567,272 s/min), 2,069,467,142 distinct states found (1,447,950 ds/min), 137,977,630 states left on queue.
+Progress(46) at 2024-11-07 07:22:11: 27,763,678,204 states generated (28,804,471 s/min), 2,071,034,824 distinct states found (1,567,682 ds/min), 137,622,296 states left on queue.
+Progress(46) at 2024-11-07 07:23:11: 27,792,322,332 states generated (28,644,128 s/min), 2,072,586,226 distinct states found (1,551,402 ds/min), 137,231,762 states left on queue.
+Progress(46) at 2024-11-07 07:24:11: 27,821,040,127 states generated (28,717,795 s/min), 2,074,030,831 distinct states found (1,444,605 ds/min), 136,731,600 states left on queue.
+Progress(46) at 2024-11-07 07:25:11: 27,849,404,654 states generated (28,364,527 s/min), 2,075,273,409 distinct states found (1,242,578 ds/min), 136,082,131 states left on queue.
+Progress(46) at 2024-11-07 07:26:11: 27,878,356,417 states generated (28,951,763 s/min), 2,076,656,601 distinct states found (1,383,192 ds/min), 135,570,796 states left on queue.
+Progress(46) at 2024-11-07 07:27:11: 27,907,776,802 states generated (29,420,385 s/min), 2,078,383,391 distinct states found (1,726,790 ds/min), 135,306,248 states left on queue.
+Progress(46) at 2024-11-07 07:28:11: 27,937,070,294 states generated (29,293,492 s/min), 2,080,076,828 distinct states found (1,693,437 ds/min), 135,034,380 states left on queue.
+Progress(46) at 2024-11-07 07:29:11: 27,966,287,907 states generated (29,217,613 s/min), 2,081,855,223 distinct states found (1,778,395 ds/min), 134,839,763 states left on queue.
+Progress(46) at 2024-11-07 07:30:11: 27,995,330,759 states generated (29,042,852 s/min), 2,083,372,197 distinct states found (1,516,974 ds/min), 134,461,641 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-07 07:31:12)
+Progress(46) at 2024-11-07 07:31:12: 28,024,387,579 states generated (29,056,820 s/min), 2,085,018,193 distinct states found (1,645,996 ds/min), 134,150,126 states left on queue.
+Progress(46) at 2024-11-07 07:32:12: 28,053,564,379 states generated (29,176,800 s/min), 2,086,850,158 distinct states found (1,831,965 ds/min), 133,983,084 states left on queue.
+Progress(46) at 2024-11-07 07:33:12: 28,082,556,747 states generated (28,992,368 s/min), 2,088,444,271 distinct states found (1,594,113 ds/min), 133,651,269 states left on queue.
+Progress(47) at 2024-11-07 07:34:12: 28,111,323,007 states generated (28,766,260 s/min), 2,090,072,790 distinct states found (1,628,519 ds/min), 133,350,640 states left on queue.
+Progress(47) at 2024-11-07 07:35:12: 28,140,191,163 states generated (28,868,156 s/min), 2,091,740,224 distinct states found (1,667,434 ds/min), 133,070,266 states left on queue.
+Progress(47) at 2024-11-07 07:36:12: 28,169,054,601 states generated (28,863,438 s/min), 2,093,375,975 distinct states found (1,635,751 ds/min), 132,752,319 states left on queue.
+Progress(47) at 2024-11-07 07:37:12: 28,197,994,162 states generated (28,939,561 s/min), 2,094,929,793 distinct states found (1,553,818 ds/min), 132,356,738 states left on queue.
+Progress(47) at 2024-11-07 07:38:12: 28,226,808,491 states generated (28,814,329 s/min), 2,096,311,441 distinct states found (1,381,648 ds/min), 131,832,292 states left on queue.
+Progress(47) at 2024-11-07 07:39:12: 28,255,451,016 states generated (28,642,525 s/min), 2,097,907,185 distinct states found (1,595,744 ds/min), 131,487,862 states left on queue.
+Progress(47) at 2024-11-07 07:40:12: 28,284,015,286 states generated (28,564,270 s/min), 2,099,332,452 distinct states found (1,425,267 ds/min), 130,982,897 states left on queue.
+Progress(47) at 2024-11-07 07:41:12: 28,313,051,806 states generated (29,036,520 s/min), 2,101,053,792 distinct states found (1,721,340 ds/min), 130,744,522 states left on queue.
+Progress(47) at 2024-11-07 07:42:12: 28,342,348,160 states generated (29,296,354 s/min), 2,102,778,970 distinct states found (1,725,178 ds/min), 130,505,777 states left on queue.
+Progress(47) at 2024-11-07 07:43:12: 28,371,533,935 states generated (29,185,775 s/min), 2,104,337,778 distinct states found (1,558,808 ds/min), 130,144,304 states left on queue.
+Progress(47) at 2024-11-07 07:44:12: 28,400,351,066 states generated (28,817,131 s/min), 2,105,835,284 distinct states found (1,497,506 ds/min), 129,719,871 states left on queue.
+Progress(47) at 2024-11-07 07:45:12: 28,429,411,463 states generated (29,060,397 s/min), 2,107,704,752 distinct states found (1,869,468 ds/min), 129,618,749 states left on queue.
+Progress(47) at 2024-11-07 07:46:12: 28,458,488,093 states generated (29,076,630 s/min), 2,109,483,825 distinct states found (1,779,073 ds/min), 129,439,723 states left on queue.
+Progress(47) at 2024-11-07 07:47:12: 28,487,338,391 states generated (28,850,298 s/min), 2,111,230,358 distinct states found (1,746,533 ds/min), 129,232,124 states left on queue.
+Progress(47) at 2024-11-07 07:48:12: 28,516,411,931 states generated (29,073,540 s/min), 2,113,150,785 distinct states found (1,920,427 ds/min), 129,168,385 states left on queue.
+Progress(47) at 2024-11-07 07:49:12: 28,545,299,037 states generated (28,887,106 s/min), 2,114,878,071 distinct states found (1,727,286 ds/min), 128,948,735 states left on queue.
+Progress(47) at 2024-11-07 07:50:12: 28,574,186,091 states generated (28,887,054 s/min), 2,116,622,746 distinct states found (1,744,675 ds/min), 128,711,386 states left on queue.
+Progress(47) at 2024-11-07 07:51:12: 28,603,057,442 states generated (28,871,351 s/min), 2,118,435,710 distinct states found (1,812,964 ds/min), 128,543,573 states left on queue.
+Progress(47) at 2024-11-07 07:52:12: 28,632,042,720 states generated (28,985,278 s/min), 2,120,240,818 distinct states found (1,805,108 ds/min), 128,349,742 states left on queue.
+Progress(47) at 2024-11-07 07:53:12: 28,660,885,097 states generated (28,842,377 s/min), 2,121,904,885 distinct states found (1,664,067 ds/min), 128,002,987 states left on queue.
+Progress(47) at 2024-11-07 07:54:12: 28,689,690,902 states generated (28,805,805 s/min), 2,123,498,767 distinct states found (1,593,882 ds/min), 127,622,035 states left on queue.
+Progress(47) at 2024-11-07 07:55:12: 28,718,827,206 states generated (29,136,304 s/min), 2,125,375,087 distinct states found (1,876,320 ds/min), 127,518,682 states left on queue.
+Progress(47) at 2024-11-07 07:56:12: 28,747,988,287 states generated (29,161,081 s/min), 2,127,234,055 distinct states found (1,858,968 ds/min), 127,390,123 states left on queue.
+Progress(47) at 2024-11-07 07:57:12: 28,776,918,449 states generated (28,930,162 s/min), 2,128,896,639 distinct states found (1,662,584 ds/min), 127,099,202 states left on queue.
+Progress(47) at 2024-11-07 07:58:12: 28,805,826,521 states generated (28,908,072 s/min), 2,130,485,896 distinct states found (1,589,257 ds/min), 126,731,846 states left on queue.
+Progress(47) at 2024-11-07 07:59:12: 28,834,550,061 states generated (28,723,540 s/min), 2,132,267,049 distinct states found (1,781,153 ds/min), 126,524,859 states left on queue.
+Progress(47) at 2024-11-07 08:00:12: 28,863,218,037 states generated (28,667,976 s/min), 2,133,901,471 distinct states found (1,634,422 ds/min), 126,149,810 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-07 08:01:13)
+Progress(47) at 2024-11-07 08:01:13: 28,892,405,277 states generated (29,187,240 s/min), 2,135,683,266 distinct states found (1,781,795 ds/min), 125,938,046 states left on queue.
+Progress(47) at 2024-11-07 08:02:13: 28,921,188,007 states generated (28,782,730 s/min), 2,137,299,589 distinct states found (1,616,323 ds/min), 125,575,223 states left on queue.
+Progress(47) at 2024-11-07 08:03:13: 28,950,198,581 states generated (29,010,574 s/min), 2,138,945,715 distinct states found (1,646,126 ds/min), 125,225,825 states left on queue.
+Progress(47) at 2024-11-07 08:04:13: 28,979,052,322 states generated (28,853,741 s/min), 2,140,384,312 distinct states found (1,438,597 ds/min), 124,739,890 states left on queue.
+Progress(47) at 2024-11-07 08:05:13: 29,007,862,556 states generated (28,810,234 s/min), 2,142,020,690 distinct states found (1,636,378 ds/min), 124,389,570 states left on queue.
+Progress(47) at 2024-11-07 08:06:13: 29,036,639,997 states generated (28,777,441 s/min), 2,143,436,769 distinct states found (1,416,079 ds/min), 123,853,456 states left on queue.
+Progress(47) at 2024-11-07 08:07:13: 29,065,681,489 states generated (29,041,492 s/min), 2,144,841,718 distinct states found (1,404,949 ds/min), 123,385,042 states left on queue.
+Progress(47) at 2024-11-07 08:08:13: 29,094,462,032 states generated (28,780,543 s/min), 2,146,214,867 distinct states found (1,373,149 ds/min), 122,908,921 states left on queue.
+Progress(47) at 2024-11-07 08:09:13: 29,123,289,758 states generated (28,827,726 s/min), 2,147,553,984 distinct states found (1,339,117 ds/min), 122,446,193 states left on queue.
+Progress(47) at 2024-11-07 08:10:13: 29,152,503,386 states generated (29,213,628 s/min), 2,148,942,911 distinct states found (1,388,927 ds/min), 122,030,640 states left on queue.
+Progress(47) at 2024-11-07 08:11:13: 29,181,728,737 states generated (29,225,351 s/min), 2,150,631,619 distinct states found (1,688,708 ds/min), 121,816,919 states left on queue.
+Progress(47) at 2024-11-07 08:12:13: 29,211,003,478 states generated (29,274,741 s/min), 2,152,175,254 distinct states found (1,543,635 ds/min), 121,489,774 states left on queue.
+Progress(47) at 2024-11-07 08:13:13: 29,240,102,268 states generated (29,098,790 s/min), 2,153,537,952 distinct states found (1,362,698 ds/min), 120,992,206 states left on queue.
+Progress(47) at 2024-11-07 08:14:13: 29,268,843,458 states generated (28,741,190 s/min), 2,154,896,522 distinct states found (1,358,570 ds/min), 120,481,830 states left on queue.
+Progress(47) at 2024-11-07 08:15:13: 29,297,458,982 states generated (28,615,524 s/min), 2,156,228,693 distinct states found (1,332,171 ds/min), 119,935,590 states left on queue.
+Progress(47) at 2024-11-07 08:16:13: 29,326,133,934 states generated (28,674,952 s/min), 2,157,558,222 distinct states found (1,329,529 ds/min), 119,402,611 states left on queue.
+Progress(47) at 2024-11-07 08:17:13: 29,355,133,179 states generated (28,999,245 s/min), 2,159,036,229 distinct states found (1,478,007 ds/min), 119,059,305 states left on queue.
+Progress(47) at 2024-11-07 08:18:13: 29,384,094,216 states generated (28,961,037 s/min), 2,160,401,726 distinct states found (1,365,497 ds/min), 118,659,528 states left on queue.
+Progress(47) at 2024-11-07 08:19:13: 29,413,210,497 states generated (29,116,281 s/min), 2,162,252,062 distinct states found (1,850,336 ds/min), 118,605,990 states left on queue.
+Progress(47) at 2024-11-07 08:20:13: 29,442,123,726 states generated (28,913,229 s/min), 2,163,968,572 distinct states found (1,716,510 ds/min), 118,430,828 states left on queue.
+Progress(47) at 2024-11-07 08:21:13: 29,470,933,813 states generated (28,810,087 s/min), 2,165,411,802 distinct states found (1,443,230 ds/min), 118,017,068 states left on queue.
+Progress(47) at 2024-11-07 08:22:13: 29,499,968,878 states generated (29,035,065 s/min), 2,166,884,069 distinct states found (1,472,267 ds/min), 117,620,342 states left on queue.
+Progress(47) at 2024-11-07 08:23:13: 29,528,752,811 states generated (28,783,933 s/min), 2,168,252,577 distinct states found (1,368,508 ds/min), 117,101,560 states left on queue.
+Progress(47) at 2024-11-07 08:24:13: 29,557,568,598 states generated (28,815,787 s/min), 2,169,705,158 distinct states found (1,452,581 ds/min), 116,651,662 states left on queue.
+Progress(47) at 2024-11-07 08:25:13: 29,586,373,945 states generated (28,805,347 s/min), 2,171,138,563 distinct states found (1,433,405 ds/min), 116,184,414 states left on queue.
+Progress(47) at 2024-11-07 08:26:13: 29,614,983,668 states generated (28,609,723 s/min), 2,172,585,802 distinct states found (1,447,239 ds/min), 115,737,683 states left on queue.
+Progress(47) at 2024-11-07 08:27:13: 29,643,800,320 states generated (28,816,652 s/min), 2,174,078,381 distinct states found (1,492,579 ds/min), 115,326,021 states left on queue.
+Progress(47) at 2024-11-07 08:28:13: 29,672,907,645 states generated (29,107,325 s/min), 2,175,677,520 distinct states found (1,599,139 ds/min), 114,997,885 states left on queue.
+Progress(47) at 2024-11-07 08:29:13: 29,701,705,556 states generated (28,797,911 s/min), 2,177,190,856 distinct states found (1,513,336 ds/min), 114,628,087 states left on queue.
+Progress(47) at 2024-11-07 08:30:13: 29,730,412,995 states generated (28,707,439 s/min), 2,178,512,609 distinct states found (1,321,753 ds/min), 114,087,841 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-07 08:31:13)
+Progress(47) at 2024-11-07 08:31:13: 29,759,387,383 states generated (28,974,388 s/min), 2,180,040,685 distinct states found (1,528,076 ds/min), 113,747,306 states left on queue.
+Progress(47) at 2024-11-07 08:32:13: 29,788,001,065 states generated (28,613,682 s/min), 2,181,419,698 distinct states found (1,379,013 ds/min), 113,276,454 states left on queue.
+Progress(47) at 2024-11-07 08:33:13: 29,816,483,253 states generated (28,482,188 s/min), 2,182,794,509 distinct states found (1,374,811 ds/min), 112,764,181 states left on queue.
+Progress(47) at 2024-11-07 08:34:13: 29,845,032,133 states generated (28,548,880 s/min), 2,184,180,577 distinct states found (1,386,068 ds/min), 112,275,854 states left on queue.
+Progress(47) at 2024-11-07 08:35:13: 29,873,704,121 states generated (28,671,988 s/min), 2,185,610,616 distinct states found (1,430,039 ds/min), 111,765,886 states left on queue.
+Progress(47) at 2024-11-07 08:36:13: 29,901,983,007 states generated (28,278,886 s/min), 2,186,742,865 distinct states found (1,132,249 ds/min), 111,037,502 states left on queue.
+Progress(47) at 2024-11-07 08:37:13: 29,931,128,222 states generated (29,145,215 s/min), 2,188,247,053 distinct states found (1,504,188 ds/min), 110,610,871 states left on queue.
+Progress(47) at 2024-11-07 08:38:13: 29,960,291,600 states generated (29,163,378 s/min), 2,189,791,380 distinct states found (1,544,327 ds/min), 110,219,347 states left on queue.
+Progress(47) at 2024-11-07 08:39:13: 29,989,426,093 states generated (29,134,493 s/min), 2,191,523,686 distinct states found (1,732,306 ds/min), 109,988,090 states left on queue.
+Progress(47) at 2024-11-07 08:40:13: 30,018,419,613 states generated (28,993,520 s/min), 2,192,983,724 distinct states found (1,460,038 ds/min), 109,567,153 states left on queue.
+Progress(47) at 2024-11-07 08:41:13: 30,047,169,261 states generated (28,749,648 s/min), 2,194,485,325 distinct states found (1,501,601 ds/min), 109,159,610 states left on queue.
+Progress(47) at 2024-11-07 08:42:13: 30,076,320,011 states generated (29,150,750 s/min), 2,196,261,852 distinct states found (1,776,527 ds/min), 108,952,775 states left on queue.
+Progress(47) at 2024-11-07 08:43:13: 30,105,246,939 states generated (28,926,928 s/min), 2,197,745,917 distinct states found (1,484,065 ds/min), 108,533,801 states left on queue.
+Progress(47) at 2024-11-07 08:44:13: 30,134,017,722 states generated (28,770,783 s/min), 2,199,274,846 distinct states found (1,528,929 ds/min), 108,138,210 states left on queue.
+Progress(48) at 2024-11-07 08:45:13: 30,162,850,009 states generated (28,832,287 s/min), 2,200,818,695 distinct states found (1,543,849 ds/min), 107,749,686 states left on queue.
+Progress(48) at 2024-11-07 08:46:13: 30,191,763,541 states generated (28,913,532 s/min), 2,202,269,881 distinct states found (1,451,186 ds/min), 107,274,074 states left on queue.
+Progress(48) at 2024-11-07 08:47:13: 30,220,450,821 states generated (28,687,280 s/min), 2,203,579,369 distinct states found (1,309,488 ds/min), 106,693,506 states left on queue.
+Progress(48) at 2024-11-07 08:48:13: 30,249,109,647 states generated (28,658,826 s/min), 2,204,980,828 distinct states found (1,401,459 ds/min), 106,171,815 states left on queue.
+Progress(48) at 2024-11-07 08:49:13: 30,278,004,502 states generated (28,894,855 s/min), 2,206,546,641 distinct states found (1,565,813 ds/min), 105,800,017 states left on queue.
+Progress(48) at 2024-11-07 08:50:13: 30,307,176,628 states generated (29,172,126 s/min), 2,208,173,395 distinct states found (1,626,754 ds/min), 105,492,735 states left on queue.
+Progress(48) at 2024-11-07 08:51:13: 30,336,267,563 states generated (29,090,935 s/min), 2,209,629,083 distinct states found (1,455,688 ds/min), 105,046,561 states left on queue.
+Progress(48) at 2024-11-07 08:52:13: 30,365,237,300 states generated (28,969,737 s/min), 2,211,221,126 distinct states found (1,592,043 ds/min), 104,707,696 states left on queue.
+Progress(48) at 2024-11-07 08:53:13: 30,394,270,909 states generated (29,033,609 s/min), 2,212,948,437 distinct states found (1,727,311 ds/min), 104,490,104 states left on queue.
+Progress(48) at 2024-11-07 08:54:13: 30,423,140,115 states generated (28,869,206 s/min), 2,214,640,116 distinct states found (1,691,679 ds/min), 104,243,061 states left on queue.
+Progress(48) at 2024-11-07 08:55:13: 30,452,062,605 states generated (28,922,490 s/min), 2,216,327,939 distinct states found (1,687,823 ds/min), 103,983,745 states left on queue.
+Progress(48) at 2024-11-07 08:56:13: 30,481,071,056 states generated (29,008,451 s/min), 2,217,983,905 distinct states found (1,655,966 ds/min), 103,702,586 states left on queue.
+Progress(48) at 2024-11-07 08:57:13: 30,509,808,031 states generated (28,736,975 s/min), 2,219,662,593 distinct states found (1,678,688 ds/min), 103,423,522 states left on queue.
+Progress(48) at 2024-11-07 08:58:13: 30,538,616,862 states generated (28,808,831 s/min), 2,221,288,821 distinct states found (1,626,228 ds/min), 103,098,334 states left on queue.
+Progress(48) at 2024-11-07 08:59:13: 30,567,539,949 states generated (28,923,087 s/min), 2,222,969,669 distinct states found (1,680,848 ds/min), 102,811,145 states left on queue.
+Progress(48) at 2024-11-07 09:00:13: 30,596,220,572 states generated (28,680,623 s/min), 2,224,451,086 distinct states found (1,481,417 ds/min), 102,320,643 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-07 09:01:14)
+Progress(48) at 2024-11-07 09:01:14: 30,625,254,005 states generated (29,033,433 s/min), 2,225,971,213 distinct states found (1,520,127 ds/min), 101,895,678 states left on queue.
+Progress(48) at 2024-11-07 09:02:14: 30,654,316,875 states generated (29,062,870 s/min), 2,227,776,007 distinct states found (1,804,794 ds/min), 101,720,925 states left on queue.
+Progress(48) at 2024-11-07 09:03:14: 30,683,368,837 states generated (29,051,962 s/min), 2,229,520,592 distinct states found (1,744,585 ds/min), 101,516,049 states left on queue.
+Progress(48) at 2024-11-07 09:04:14: 30,712,221,770 states generated (28,852,933 s/min), 2,231,006,576 distinct states found (1,485,984 ds/min), 101,059,951 states left on queue.
+Progress(48) at 2024-11-07 09:05:14: 30,740,916,958 states generated (28,695,188 s/min), 2,232,634,565 distinct states found (1,627,989 ds/min), 100,742,863 states left on queue.
+Progress(48) at 2024-11-07 09:06:14: 30,769,477,527 states generated (28,560,569 s/min), 2,234,099,495 distinct states found (1,464,930 ds/min), 100,237,731 states left on queue.
+Progress(48) at 2024-11-07 09:07:14: 30,798,306,365 states generated (28,828,838 s/min), 2,235,757,510 distinct states found (1,658,015 ds/min), 99,936,798 states left on queue.
+Progress(48) at 2024-11-07 09:08:14: 30,827,145,014 states generated (28,838,649 s/min), 2,237,323,374 distinct states found (1,565,864 ds/min), 99,542,928 states left on queue.
+Progress(48) at 2024-11-07 09:09:14: 30,855,967,384 states generated (28,822,370 s/min), 2,238,712,445 distinct states found (1,389,071 ds/min), 98,994,892 states left on queue.
+Progress(48) at 2024-11-07 09:10:14: 30,884,757,904 states generated (28,790,520 s/min), 2,240,211,537 distinct states found (1,499,092 ds/min), 98,555,003 states left on queue.
+Progress(48) at 2024-11-07 09:11:14: 30,913,436,301 states generated (28,678,397 s/min), 2,241,549,402 distinct states found (1,337,865 ds/min), 97,972,368 states left on queue.
+Progress(48) at 2024-11-07 09:12:14: 30,942,398,628 states generated (28,962,327 s/min), 2,242,894,478 distinct states found (1,345,076 ds/min), 97,450,191 states left on queue.
+Progress(48) at 2024-11-07 09:13:14: 30,971,150,912 states generated (28,752,284 s/min), 2,244,149,533 distinct states found (1,255,055 ds/min), 96,915,440 states left on queue.
+Progress(48) at 2024-11-07 09:14:14: 31,000,226,695 states generated (29,075,783 s/min), 2,245,486,253 distinct states found (1,336,720 ds/min), 96,453,711 states left on queue.
+Progress(48) at 2024-11-07 09:15:14: 31,029,410,660 states generated (29,183,965 s/min), 2,247,033,348 distinct states found (1,547,095 ds/min), 96,134,910 states left on queue.
+Progress(48) at 2024-11-07 09:16:14: 31,058,657,395 states generated (29,246,735 s/min), 2,248,447,081 distinct states found (1,413,733 ds/min), 95,701,875 states left on queue.
+Progress(48) at 2024-11-07 09:17:14: 31,087,368,874 states generated (28,711,479 s/min), 2,249,703,997 distinct states found (1,256,916 ds/min), 95,112,797 states left on queue.
+Progress(48) at 2024-11-07 09:18:14: 31,115,905,907 states generated (28,537,033 s/min), 2,250,949,093 distinct states found (1,245,096 ds/min), 94,499,889 states left on queue.
+Progress(48) at 2024-11-07 09:19:14: 31,144,578,992 states generated (28,673,085 s/min), 2,252,226,995 distinct states found (1,277,902 ds/min), 93,927,098 states left on queue.
+Progress(48) at 2024-11-07 09:20:14: 31,173,557,966 states generated (28,978,974 s/min), 2,253,602,196 distinct states found (1,375,201 ds/min), 93,561,559 states left on queue.
+Progress(48) at 2024-11-07 09:21:14: 31,202,521,307 states generated (28,963,341 s/min), 2,255,224,149 distinct states found (1,621,953 ds/min), 93,337,000 states left on queue.
+Progress(48) at 2024-11-07 09:22:14: 31,231,451,884 states generated (28,930,577 s/min), 2,256,879,564 distinct states found (1,655,415 ds/min), 93,119,996 states left on queue.
+Progress(48) at 2024-11-07 09:23:14: 31,260,174,245 states generated (28,722,361 s/min), 2,258,206,514 distinct states found (1,326,950 ds/min), 92,610,216 states left on queue.
+Progress(48) at 2024-11-07 09:24:14: 31,289,091,475 states generated (28,917,230 s/min), 2,259,564,810 distinct states found (1,358,296 ds/min), 92,123,452 states left on queue.
+Progress(48) at 2024-11-07 09:25:14: 31,317,753,943 states generated (28,662,468 s/min), 2,260,868,559 distinct states found (1,303,749 ds/min), 91,550,997 states left on queue.
+Progress(48) at 2024-11-07 09:26:14: 31,346,435,672 states generated (28,681,729 s/min), 2,262,197,433 distinct states found (1,328,874 ds/min), 91,002,731 states left on queue.
+Progress(48) at 2024-11-07 09:27:14: 31,375,074,275 states generated (28,638,603 s/min), 2,263,549,308 distinct states found (1,351,875 ds/min), 90,479,028 states left on queue.
+Progress(48) at 2024-11-07 09:28:14: 31,403,896,903 states generated (28,822,628 s/min), 2,264,999,048 distinct states found (1,449,740 ds/min), 90,030,284 states left on queue.
+Progress(48) at 2024-11-07 09:29:14: 31,432,772,052 states generated (28,875,149 s/min), 2,266,431,878 distinct states found (1,432,830 ds/min), 89,580,165 states left on queue.
+Progress(48) at 2024-11-07 09:30:14: 31,461,382,905 states generated (28,610,853 s/min), 2,267,701,315 distinct states found (1,269,437 ds/min), 89,008,135 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-07 09:31:15)
+Progress(48) at 2024-11-07 09:31:15: 31,490,350,002 states generated (28,967,097 s/min), 2,269,120,991 distinct states found (1,419,676 ds/min), 88,574,899 states left on queue.
+Progress(48) at 2024-11-07 09:32:15: 31,518,738,286 states generated (28,388,284 s/min), 2,270,333,667 distinct states found (1,212,676 ds/min), 87,950,800 states left on queue.
+Progress(48) at 2024-11-07 09:33:15: 31,547,227,429 states generated (28,489,143 s/min), 2,271,632,491 distinct states found (1,298,824 ds/min), 87,379,110 states left on queue.
+Progress(48) at 2024-11-07 09:34:15: 31,575,696,846 states generated (28,469,417 s/min), 2,272,873,166 distinct states found (1,240,675 ds/min), 86,717,955 states left on queue.
+Progress(48) at 2024-11-07 09:35:15: 31,604,509,248 states generated (28,812,402 s/min), 2,274,166,128 distinct states found (1,292,962 ds/min), 86,122,414 states left on queue.
+Progress(48) at 2024-11-07 09:36:15: 31,633,623,894 states generated (29,114,646 s/min), 2,275,690,739 distinct states found (1,524,611 ds/min), 85,718,820 states left on queue.
+Progress(48) at 2024-11-07 09:37:15: 31,662,734,164 states generated (29,110,270 s/min), 2,277,282,041 distinct states found (1,591,302 ds/min), 85,389,121 states left on queue.
+Progress(48) at 2024-11-07 09:38:15: 31,691,488,753 states generated (28,754,589 s/min), 2,278,666,982 distinct states found (1,384,941 ds/min), 84,903,119 states left on queue.
+Progress(48) at 2024-11-07 09:39:15: 31,720,428,706 states generated (28,939,953 s/min), 2,280,231,311 distinct states found (1,564,329 ds/min), 84,529,794 states left on queue.
+Progress(48) at 2024-11-07 09:40:15: 31,749,336,886 states generated (28,908,180 s/min), 2,281,688,218 distinct states found (1,456,907 ds/min), 84,091,511 states left on queue.
+Progress(48) at 2024-11-07 09:41:15: 31,778,054,342 states generated (28,717,456 s/min), 2,283,102,693 distinct states found (1,414,475 ds/min), 83,605,316 states left on queue.
+Progress(49) at 2024-11-07 09:42:15: 31,806,874,604 states generated (28,820,262 s/min), 2,284,525,902 distinct states found (1,423,209 ds/min), 83,115,134 states left on queue.
+Progress(49) at 2024-11-07 09:43:15: 31,835,557,645 states generated (28,683,041 s/min), 2,285,776,893 distinct states found (1,250,991 ds/min), 82,491,419 states left on queue.
+Progress(49) at 2024-11-07 09:44:15: 31,864,075,450 states generated (28,517,805 s/min), 2,287,028,991 distinct states found (1,252,098 ds/min), 81,847,819 states left on queue.
+Progress(49) at 2024-11-07 09:45:15: 31,892,999,186 states generated (28,923,736 s/min), 2,288,552,140 distinct states found (1,523,149 ds/min), 81,459,937 states left on queue.
+Progress(49) at 2024-11-07 09:46:15: 31,922,276,996 states generated (29,277,810 s/min), 2,290,137,668 distinct states found (1,585,528 ds/min), 81,115,285 states left on queue.
+Progress(49) at 2024-11-07 09:47:15: 31,951,109,751 states generated (28,832,755 s/min), 2,291,477,001 distinct states found (1,339,333 ds/min), 80,582,606 states left on queue.
+Progress(49) at 2024-11-07 09:48:15: 31,980,103,122 states generated (28,993,371 s/min), 2,293,149,633 distinct states found (1,672,632 ds/min), 80,321,900 states left on queue.
+Progress(49) at 2024-11-07 09:49:15: 32,008,927,227 states generated (28,824,105 s/min), 2,294,737,299 distinct states found (1,587,666 ds/min), 79,988,982 states left on queue.
+Progress(49) at 2024-11-07 09:50:15: 32,037,912,405 states generated (28,985,178 s/min), 2,296,369,269 distinct states found (1,631,970 ds/min), 79,688,340 states left on queue.
+Progress(49) at 2024-11-07 09:51:15: 32,066,650,871 states generated (28,738,466 s/min), 2,297,881,682 distinct states found (1,512,413 ds/min), 79,285,058 states left on queue.
+Progress(49) at 2024-11-07 09:52:15: 32,095,474,869 states generated (28,823,998 s/min), 2,299,386,856 distinct states found (1,505,174 ds/min), 78,860,285 states left on queue.
+Progress(49) at 2024-11-07 09:53:15: 32,124,254,306 states generated (28,779,437 s/min), 2,300,974,245 distinct states found (1,587,389 ds/min), 78,501,509 states left on queue.
+Progress(49) at 2024-11-07 09:54:15: 32,152,874,934 states generated (28,620,628 s/min), 2,302,313,494 distinct states found (1,339,249 ds/min), 77,908,264 states left on queue.
+Progress(49) at 2024-11-07 09:55:15: 32,181,625,656 states generated (28,750,722 s/min), 2,303,719,911 distinct states found (1,406,417 ds/min), 77,409,147 states left on queue.
+Progress(49) at 2024-11-07 09:56:15: 32,210,690,682 states generated (29,065,026 s/min), 2,305,458,559 distinct states found (1,738,648 ds/min), 77,178,015 states left on queue.
+Progress(49) at 2024-11-07 09:57:15: 32,239,586,160 states generated (28,895,478 s/min), 2,307,003,156 distinct states found (1,544,597 ds/min), 76,805,818 states left on queue.
+Progress(49) at 2024-11-07 09:58:15: 32,268,327,819 states generated (28,741,659 s/min), 2,308,436,891 distinct states found (1,433,735 ds/min), 76,324,212 states left on queue.
+Progress(49) at 2024-11-07 09:59:15: 32,296,829,379 states generated (28,501,560 s/min), 2,309,831,948 distinct states found (1,395,057 ds/min), 75,779,735 states left on queue.
+Progress(49) at 2024-11-07 10:00:15: 32,325,628,397 states generated (28,799,018 s/min), 2,311,380,882 distinct states found (1,548,934 ds/min), 75,395,162 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-07 10:01:15)
+Progress(49) at 2024-11-07 10:01:15: 32,354,681,149 states generated (29,052,752 s/min), 2,312,867,979 distinct states found (1,487,097 ds/min), 74,928,503 states left on queue.
+Progress(49) at 2024-11-07 10:02:15: 32,383,406,034 states generated (28,724,885 s/min), 2,314,202,265 distinct states found (1,334,286 ds/min), 74,352,680 states left on queue.
+Progress(49) at 2024-11-07 10:03:15: 32,411,997,317 states generated (28,591,283 s/min), 2,315,435,082 distinct states found (1,232,817 ds/min), 73,700,708 states left on queue.
+Progress(49) at 2024-11-07 10:04:15: 32,440,769,297 states generated (28,771,980 s/min), 2,316,687,791 distinct states found (1,252,709 ds/min), 73,114,003 states left on queue.
+Progress(49) at 2024-11-07 10:05:15: 32,469,733,062 states generated (28,963,765 s/min), 2,317,885,762 distinct states found (1,197,971 ds/min), 72,558,372 states left on queue.
+Progress(49) at 2024-11-07 10:06:15: 32,498,863,740 states generated (29,130,678 s/min), 2,319,353,511 distinct states found (1,467,749 ds/min), 72,186,248 states left on queue.
+Progress(49) at 2024-11-07 10:07:15: 32,527,902,407 states generated (29,038,667 s/min), 2,320,635,445 distinct states found (1,281,934 ds/min), 71,639,893 states left on queue.
+Progress(49) at 2024-11-07 10:08:15: 32,556,361,400 states generated (28,458,993 s/min), 2,321,793,726 distinct states found (1,158,281 ds/min), 70,954,333 states left on queue.
+Progress(49) at 2024-11-07 10:09:15: 32,585,056,251 states generated (28,694,851 s/min), 2,323,009,155 distinct states found (1,215,429 ds/min), 70,362,671 states left on queue.
+Progress(49) at 2024-11-07 10:10:15: 32,613,972,815 states generated (28,916,564 s/min), 2,324,321,084 distinct states found (1,311,929 ds/min), 69,935,186 states left on queue.
+Progress(49) at 2024-11-07 10:11:15: 32,642,963,038 states generated (28,990,223 s/min), 2,325,997,874 distinct states found (1,676,790 ds/min), 69,730,871 states left on queue.
+Progress(49) at 2024-11-07 10:12:15: 32,671,642,762 states generated (28,679,724 s/min), 2,327,294,217 distinct states found (1,296,343 ds/min), 69,221,413 states left on queue.
+Progress(49) at 2024-11-07 10:13:15: 32,700,429,296 states generated (28,786,534 s/min), 2,328,535,742 distinct states found (1,241,525 ds/min), 68,635,066 states left on queue.
+Progress(49) at 2024-11-07 10:14:15: 32,729,076,182 states generated (28,646,886 s/min), 2,329,760,071 distinct states found (1,224,329 ds/min), 67,997,735 states left on queue.
+Progress(49) at 2024-11-07 10:15:15: 32,757,631,787 states generated (28,555,605 s/min), 2,331,002,517 distinct states found (1,242,446 ds/min), 67,379,374 states left on queue.
+Progress(49) at 2024-11-07 10:16:15: 32,786,472,553 states generated (28,840,766 s/min), 2,332,364,440 distinct states found (1,361,923 ds/min), 66,856,953 states left on queue.
+Progress(49) at 2024-11-07 10:17:15: 32,815,068,782 states generated (28,596,229 s/min), 2,333,629,799 distinct states found (1,265,359 ds/min), 66,266,973 states left on queue.
+Progress(49) at 2024-11-07 10:18:15: 32,843,671,035 states generated (28,602,253 s/min), 2,334,875,787 distinct states found (1,245,988 ds/min), 65,714,901 states left on queue.
+Progress(49) at 2024-11-07 10:19:15: 32,872,127,728 states generated (28,456,693 s/min), 2,336,030,334 distinct states found (1,154,547 ds/min), 65,023,805 states left on queue.
+Progress(49) at 2024-11-07 10:20:15: 32,900,582,167 states generated (28,454,439 s/min), 2,337,180,611 distinct states found (1,150,277 ds/min), 64,304,348 states left on queue.
+Progress(49) at 2024-11-07 10:21:15: 32,929,545,972 states generated (28,963,805 s/min), 2,338,488,833 distinct states found (1,308,222 ds/min), 63,715,470 states left on queue.
+Progress(49) at 2024-11-07 10:22:15: 32,958,603,673 states generated (29,057,701 s/min), 2,339,992,330 distinct states found (1,503,497 ds/min), 63,307,968 states left on queue.
+Progress(49) at 2024-11-07 10:23:15: 32,987,442,078 states generated (28,838,405 s/min), 2,341,335,966 distinct states found (1,343,636 ds/min), 62,792,292 states left on queue.
+Progress(49) at 2024-11-07 10:24:15: 33,016,381,018 states generated (28,938,940 s/min), 2,342,828,482 distinct states found (1,492,516 ds/min), 62,365,394 states left on queue.
+Progress(49) at 2024-11-07 10:25:15: 33,045,061,128 states generated (28,680,110 s/min), 2,344,118,515 distinct states found (1,290,033 ds/min), 61,789,542 states left on queue.
+Progress(49) at 2024-11-07 10:26:15: 33,073,888,592 states generated (28,827,464 s/min), 2,345,475,829 distinct states found (1,357,314 ds/min), 61,253,128 states left on queue.
+Progress(50) at 2024-11-07 10:27:15: 33,102,491,050 states generated (28,602,458 s/min), 2,346,652,625 distinct states found (1,176,796 ds/min), 60,570,177 states left on queue.
+Progress(50) at 2024-11-07 10:28:15: 33,131,166,035 states generated (28,674,985 s/min), 2,347,941,873 distinct states found (1,289,248 ds/min), 59,969,815 states left on queue.
+Progress(50) at 2024-11-07 10:29:15: 33,160,270,838 states generated (29,104,803 s/min), 2,349,441,004 distinct states found (1,499,131 ds/min), 59,570,847 states left on queue.
+Progress(50) at 2024-11-07 10:30:15: 33,189,149,869 states generated (28,879,031 s/min), 2,350,812,706 distinct states found (1,371,702 ds/min), 59,068,202 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-07 10:31:16)
+Progress(50) at 2024-11-07 10:31:16: 33,218,286,121 states generated (29,136,252 s/min), 2,352,357,375 distinct states found (1,544,669 ds/min), 58,692,343 states left on queue.
+Progress(50) at 2024-11-07 10:32:16: 33,246,927,616 states generated (28,641,495 s/min), 2,353,796,993 distinct states found (1,439,618 ds/min), 58,245,674 states left on queue.
+Progress(50) at 2024-11-07 10:33:16: 33,275,692,609 states generated (28,764,993 s/min), 2,355,282,278 distinct states found (1,485,285 ds/min), 57,825,713 states left on queue.
+Progress(50) at 2024-11-07 10:34:16: 33,304,267,545 states generated (28,574,936 s/min), 2,356,681,270 distinct states found (1,398,992 ds/min), 57,325,849 states left on queue.
+Progress(50) at 2024-11-07 10:35:16: 33,332,888,163 states generated (28,620,618 s/min), 2,358,099,683 distinct states found (1,418,413 ds/min), 56,833,993 states left on queue.
+Progress(50) at 2024-11-07 10:36:16: 33,361,236,042 states generated (28,347,879 s/min), 2,359,281,358 distinct states found (1,181,675 ds/min), 56,126,890 states left on queue.
+Progress(50) at 2024-11-07 10:37:16: 33,390,140,655 states generated (28,904,613 s/min), 2,360,868,517 distinct states found (1,587,159 ds/min), 55,791,859 states left on queue.
+Progress(50) at 2024-11-07 10:38:16: 33,418,998,816 states generated (28,858,161 s/min), 2,362,363,780 distinct states found (1,495,263 ds/min), 55,385,255 states left on queue.
+Progress(50) at 2024-11-07 10:39:16: 33,447,612,810 states generated (28,613,994 s/min), 2,363,728,858 distinct states found (1,365,078 ds/min), 54,854,942 states left on queue.
+Progress(50) at 2024-11-07 10:40:16: 33,476,162,070 states generated (28,549,260 s/min), 2,365,099,267 distinct states found (1,370,409 ds/min), 54,312,039 states left on queue.
+Progress(50) at 2024-11-07 10:41:16: 33,504,811,505 states generated (28,649,435 s/min), 2,366,473,549 distinct states found (1,374,282 ds/min), 53,784,809 states left on queue.
+Progress(50) at 2024-11-07 10:42:16: 33,533,403,252 states generated (28,591,747 s/min), 2,367,734,253 distinct states found (1,260,704 ds/min), 53,158,819 states left on queue.
+Progress(50) at 2024-11-07 10:43:16: 33,561,952,889 states generated (28,549,637 s/min), 2,368,855,124 distinct states found (1,120,871 ds/min), 52,441,471 states left on queue.
+Progress(50) at 2024-11-07 10:44:16: 33,590,825,690 states generated (28,872,801 s/min), 2,370,054,403 distinct states found (1,199,279 ds/min), 51,878,202 states left on queue.
+Progress(50) at 2024-11-07 10:45:16: 33,619,895,477 states generated (29,069,787 s/min), 2,371,355,035 distinct states found (1,300,632 ds/min), 51,382,836 states left on queue.
+Progress(50) at 2024-11-07 10:46:16: 33,648,391,719 states generated (28,496,242 s/min), 2,372,441,699 distinct states found (1,086,664 ds/min), 50,647,071 states left on queue.
+Progress(50) at 2024-11-07 10:47:16: 33,677,074,147 states generated (28,682,428 s/min), 2,373,600,507 distinct states found (1,158,808 ds/min), 50,052,421 states left on queue.
+Progress(50) at 2024-11-07 10:48:16: 33,705,980,713 states generated (28,906,566 s/min), 2,375,050,402 distinct states found (1,449,895 ds/min), 49,692,912 states left on queue.
+Progress(50) at 2024-11-07 10:49:16: 33,734,700,309 states generated (28,719,596 s/min), 2,376,355,805 distinct states found (1,305,403 ds/min), 49,202,990 states left on queue.
+Progress(50) at 2024-11-07 10:50:16: 33,763,294,505 states generated (28,594,196 s/min), 2,377,489,014 distinct states found (1,133,209 ds/min), 48,526,991 states left on queue.
+Progress(50) at 2024-11-07 10:51:16: 33,791,781,835 states generated (28,487,330 s/min), 2,378,610,114 distinct states found (1,121,100 ds/min), 47,806,234 states left on queue.
+Progress(50) at 2024-11-07 10:52:16: 33,820,496,936 states generated (28,715,101 s/min), 2,379,861,294 distinct states found (1,251,180 ds/min), 47,194,112 states left on queue.
+Progress(50) at 2024-11-07 10:53:16: 33,848,955,580 states generated (28,458,644 s/min), 2,381,018,247 distinct states found (1,156,953 ds/min), 46,544,595 states left on queue.
+Progress(50) at 2024-11-07 10:54:16: 33,877,358,985 states generated (28,403,405 s/min), 2,382,084,162 distinct states found (1,065,915 ds/min), 45,797,353 states left on queue.
+Progress(50) at 2024-11-07 10:55:16: 33,905,938,026 states generated (28,579,041 s/min), 2,383,237,725 distinct states found (1,153,563 ds/min), 45,079,182 states left on queue.
+Progress(50) at 2024-11-07 10:56:16: 33,934,925,952 states generated (28,987,926 s/min), 2,384,648,770 distinct states found (1,411,045 ds/min), 44,602,865 states left on queue.
+Progress(50) at 2024-11-07 10:57:16: 33,963,625,658 states generated (28,699,706 s/min), 2,385,892,826 distinct states found (1,244,056 ds/min), 44,000,281 states left on queue.
+Progress(50) at 2024-11-07 10:58:16: 33,992,548,128 states generated (28,922,470 s/min), 2,387,290,030 distinct states found (1,397,204 ds/min), 43,514,140 states left on queue.
+Progress(51) at 2024-11-07 10:59:16: 34,021,202,960 states generated (28,654,832 s/min), 2,388,511,227 distinct states found (1,221,197 ds/min), 42,867,785 states left on queue.
+Progress(51) at 2024-11-07 11:00:16: 34,049,640,853 states generated (28,437,893 s/min), 2,389,565,989 distinct states found (1,054,762 ds/min), 42,084,713 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-07 11:01:17)
+Progress(51) at 2024-11-07 11:01:17: 34,079,102,421 states generated (29,461,568 s/min), 2,391,039,395 distinct states found (1,473,406 ds/min), 41,644,463 states left on queue.
+Progress(51) at 2024-11-07 11:02:17: 34,107,932,294 states generated (28,829,873 s/min), 2,392,415,920 distinct states found (1,376,525 ds/min), 41,153,999 states left on queue.
+Progress(51) at 2024-11-07 11:03:17: 34,136,619,823 states generated (28,687,529 s/min), 2,393,784,341 distinct states found (1,368,421 ds/min), 40,648,398 states left on queue.
+Progress(51) at 2024-11-07 11:04:17: 34,165,416,573 states generated (28,796,750 s/min), 2,395,186,568 distinct states found (1,402,227 ds/min), 40,162,223 states left on queue.
+Progress(51) at 2024-11-07 11:05:17: 34,193,934,145 states generated (28,517,572 s/min), 2,396,461,207 distinct states found (1,274,639 ds/min), 39,558,749 states left on queue.
+Progress(51) at 2024-11-07 11:06:17: 34,222,437,146 states generated (28,503,001 s/min), 2,397,667,005 distinct states found (1,205,798 ds/min), 38,877,170 states left on queue.
+Progress(51) at 2024-11-07 11:07:17: 34,251,162,633 states generated (28,725,487 s/min), 2,399,047,586 distinct states found (1,380,581 ds/min), 38,366,536 states left on queue.
+Progress(51) at 2024-11-07 11:08:17: 34,280,005,309 states generated (28,842,676 s/min), 2,400,476,715 distinct states found (1,429,129 ds/min), 37,912,093 states left on queue.
+Progress(51) at 2024-11-07 11:09:17: 34,308,388,681 states generated (28,383,372 s/min), 2,401,648,509 distinct states found (1,171,794 ds/min), 37,215,479 states left on queue.
+Progress(51) at 2024-11-07 11:10:17: 34,337,086,557 states generated (28,697,876 s/min), 2,403,035,913 distinct states found (1,387,404 ds/min), 36,712,331 states left on queue.
+Progress(51) at 2024-11-07 11:11:17: 34,365,565,315 states generated (28,478,758 s/min), 2,404,187,792 distinct states found (1,151,879 ds/min), 36,008,223 states left on queue.
+Progress(51) at 2024-11-07 11:12:17: 34,394,280,845 states generated (28,715,530 s/min), 2,405,264,161 distinct states found (1,076,369 ds/min), 35,318,651 states left on queue.
+Progress(51) at 2024-11-07 11:13:17: 34,423,292,173 states generated (29,011,328 s/min), 2,406,461,030 distinct states found (1,196,869 ds/min), 34,731,310 states left on queue.
+Progress(51) at 2024-11-07 11:14:17: 34,451,717,631 states generated (28,425,458 s/min), 2,407,470,263 distinct states found (1,009,233 ds/min), 33,977,845 states left on queue.
+Progress(51) at 2024-11-07 11:15:17: 34,480,582,848 states generated (28,865,217 s/min), 2,408,844,472 distinct states found (1,374,209 ds/min), 33,563,385 states left on queue.
+Progress(51) at 2024-11-07 11:16:17: 34,509,255,375 states generated (28,672,527 s/min), 2,409,992,223 distinct states found (1,147,751 ds/min), 32,948,371 states left on queue.
+Progress(51) at 2024-11-07 11:17:17: 34,537,627,156 states generated (28,371,781 s/min), 2,411,007,744 distinct states found (1,015,521 ds/min), 32,138,450 states left on queue.
+Progress(51) at 2024-11-07 11:18:17: 34,566,104,650 states generated (28,477,494 s/min), 2,412,094,834 distinct states found (1,087,090 ds/min), 31,405,790 states left on queue.
+Progress(51) at 2024-11-07 11:19:17: 34,594,468,421 states generated (28,363,771 s/min), 2,413,136,514 distinct states found (1,041,680 ds/min), 30,631,648 states left on queue.
+Progress(51) at 2024-11-07 11:20:17: 34,623,282,746 states generated (28,814,325 s/min), 2,414,376,756 distinct states found (1,240,242 ds/min), 30,011,457 states left on queue.
+Progress(51) at 2024-11-07 11:21:17: 34,652,013,328 states generated (28,730,582 s/min), 2,415,631,977 distinct states found (1,255,221 ds/min), 29,420,035 states left on queue.
+Progress(51) at 2024-11-07 11:22:17: 34,680,708,001 states generated (28,694,673 s/min), 2,416,841,149 distinct states found (1,209,172 ds/min), 28,780,239 states left on queue.
+Progress(52) at 2024-11-07 11:23:17: 34,709,197,697 states generated (28,489,696 s/min), 2,417,931,157 distinct states found (1,090,008 ds/min), 28,033,256 states left on queue.
+Progress(52) at 2024-11-07 11:24:17: 34,738,057,742 states generated (28,860,045 s/min), 2,419,214,866 distinct states found (1,283,709 ds/min), 27,476,210 states left on queue.
+Progress(52) at 2024-11-07 11:25:17: 34,766,795,719 states generated (28,737,977 s/min), 2,420,575,203 distinct states found (1,360,337 ds/min), 26,973,510 states left on queue.
+Progress(52) at 2024-11-07 11:26:17: 34,795,409,801 states generated (28,614,082 s/min), 2,421,852,170 distinct states found (1,276,967 ds/min), 26,383,152 states left on queue.
+Progress(52) at 2024-11-07 11:27:17: 34,823,871,413 states generated (28,461,612 s/min), 2,423,018,118 distinct states found (1,165,948 ds/min), 25,687,358 states left on queue.
+Progress(52) at 2024-11-07 11:28:17: 34,852,452,267 states generated (28,580,854 s/min), 2,424,258,491 distinct states found (1,240,373 ds/min), 25,061,677 states left on queue.
+Progress(52) at 2024-11-07 11:29:17: 34,881,109,110 states generated (28,656,843 s/min), 2,425,536,450 distinct states found (1,277,959 ds/min), 24,485,682 states left on queue.
+Progress(52) at 2024-11-07 11:30:17: 34,909,638,357 states generated (28,529,247 s/min), 2,426,766,241 distinct states found (1,229,791 ds/min), 23,851,800 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-07 11:31:18)
+Progress(52) at 2024-11-07 11:31:18: 34,938,217,205 states generated (28,578,848 s/min), 2,427,804,784 distinct states found (1,038,543 ds/min), 23,061,400 states left on queue.
+Progress(52) at 2024-11-07 11:32:18: 34,967,089,391 states generated (28,872,186 s/min), 2,428,907,251 distinct states found (1,102,467 ds/min), 22,421,037 states left on queue.
+Progress(52) at 2024-11-07 11:33:18: 34,995,531,710 states generated (28,442,319 s/min), 2,429,963,142 distinct states found (1,055,891 ds/min), 21,740,235 states left on queue.
+Progress(52) at 2024-11-07 11:34:18: 35,024,141,172 states generated (28,609,462 s/min), 2,431,122,150 distinct states found (1,159,008 ds/min), 21,149,288 states left on queue.
+Progress(52) at 2024-11-07 11:35:18: 35,052,351,960 states generated (28,210,788 s/min), 2,432,077,858 distinct states found (955,708 ds/min), 20,295,072 states left on queue.
+Progress(52) at 2024-11-07 11:36:18: 35,080,654,028 states generated (28,302,068 s/min), 2,433,061,991 distinct states found (984,133 ds/min), 19,478,746 states left on queue.
+Progress(52) at 2024-11-07 11:37:18: 35,109,293,099 states generated (28,639,071 s/min), 2,434,258,110 distinct states found (1,196,119 ds/min), 18,850,062 states left on queue.
+Progress(53) at 2024-11-07 11:38:18: 35,137,874,307 states generated (28,581,208 s/min), 2,435,408,538 distinct states found (1,150,428 ds/min), 18,171,042 states left on queue.
+Progress(53) at 2024-11-07 11:39:18: 35,166,493,712 states generated (28,619,405 s/min), 2,436,567,034 distinct states found (1,158,496 ds/min), 17,510,811 states left on queue.
+Progress(53) at 2024-11-07 11:40:18: 35,195,076,188 states generated (28,582,476 s/min), 2,437,810,887 distinct states found (1,243,853 ds/min), 16,916,098 states left on queue.
+Progress(53) at 2024-11-07 11:41:18: 35,223,492,769 states generated (28,416,581 s/min), 2,438,939,934 distinct states found (1,129,047 ds/min), 16,200,301 states left on queue.
+Progress(53) at 2024-11-07 11:42:18: 35,252,026,035 states generated (28,533,266 s/min), 2,440,130,151 distinct states found (1,190,217 ds/min), 15,545,447 states left on queue.
+Progress(53) at 2024-11-07 11:43:18: 35,280,482,465 states generated (28,456,430 s/min), 2,441,297,027 distinct states found (1,166,876 ds/min), 14,879,990 states left on queue.
+Progress(53) at 2024-11-07 11:44:18: 35,308,940,796 states generated (28,458,331 s/min), 2,442,317,453 distinct states found (1,020,426 ds/min), 14,116,803 states left on queue.
+Progress(53) at 2024-11-07 11:45:18: 35,337,597,306 states generated (28,656,510 s/min), 2,443,328,791 distinct states found (1,011,338 ds/min), 13,403,307 states left on queue.
+Progress(53) at 2024-11-07 11:46:18: 35,366,058,165 states generated (28,460,859 s/min), 2,444,336,498 distinct states found (1,007,707 ds/min), 12,657,418 states left on queue.
+Progress(53) at 2024-11-07 11:47:18: 35,394,499,327 states generated (28,441,162 s/min), 2,445,346,072 distinct states found (1,009,574 ds/min), 11,856,670 states left on queue.
+Progress(53) at 2024-11-07 11:48:18: 35,423,058,448 states generated (28,559,121 s/min), 2,446,449,527 distinct states found (1,103,455 ds/min), 11,150,850 states left on queue.
+Progress(54) at 2024-11-07 11:49:18: 35,451,714,950 states generated (28,656,502 s/min), 2,447,608,246 distinct states found (1,158,719 ds/min), 10,497,489 states left on queue.
+Progress(54) at 2024-11-07 11:50:18: 35,480,075,027 states generated (28,360,077 s/min), 2,448,668,413 distinct states found (1,060,167 ds/min), 9,734,924 states left on queue.
+Progress(54) at 2024-11-07 11:51:18: 35,508,544,241 states generated (28,469,214 s/min), 2,449,793,995 distinct states found (1,125,582 ds/min), 9,041,108 states left on queue.
+Progress(54) at 2024-11-07 11:52:18: 35,537,058,894 states generated (28,514,653 s/min), 2,450,835,560 distinct states found (1,041,565 ds/min), 8,304,357 states left on queue.
+Progress(54) at 2024-11-07 11:53:18: 35,565,617,770 states generated (28,558,876 s/min), 2,451,805,307 distinct states found (969,747 ds/min), 7,554,593 states left on queue.
+Progress(54) at 2024-11-07 11:54:18: 35,594,096,319 states generated (28,478,549 s/min), 2,452,829,286 distinct states found (1,023,979 ds/min), 6,777,854 states left on queue.
+Progress(55) at 2024-11-07 11:55:18: 35,622,658,049 states generated (28,561,730 s/min), 2,453,911,213 distinct states found (1,081,927 ds/min), 6,063,348 states left on queue.
+Progress(55) at 2024-11-07 11:56:18: 35,651,019,108 states generated (28,361,059 s/min), 2,454,944,844 distinct states found (1,033,631 ds/min), 5,290,297 states left on queue.
+Progress(55) at 2024-11-07 11:57:18: 35,679,577,103 states generated (28,557,995 s/min), 2,455,941,484 distinct states found (996,640 ds/min), 4,540,257 states left on queue.
+Progress(55) at 2024-11-07 11:58:18: 35,708,050,230 states generated (28,473,127 s/min), 2,456,911,566 distinct states found (970,082 ds/min), 3,737,722 states left on queue.
+Progress(55) at 2024-11-07 11:59:18: 35,736,484,911 states generated (28,434,681 s/min), 2,457,942,176 distinct states found (1,030,610 ds/min), 2,980,348 states left on queue.
+Progress(56) at 2024-11-07 12:00:18: 35,765,029,620 states generated (28,544,709 s/min), 2,458,911,346 distinct states found (969,170 ds/min), 2,201,353 states left on queue.
+Checkpointing of run states/24-11-06-15-30-45.354
+Checkpointing completed at (2024-11-07 12:01:18)
+Progress(57) at 2024-11-07 12:01:18: 35,793,733,161 states generated (28,703,541 s/min), 2,459,897,228 distinct states found (985,882 ds/min), 1,411,705 states left on queue.
+Progress(58) at 2024-11-07 12:02:18: 35,822,110,432 states generated (28,377,271 s/min), 2,460,820,961 distinct states found (923,733 ds/min), 587,430 states left on queue.
+Model checking completed. No error has been found.
+  Estimates of the probability that TLC did not check all reachable states
+  because two distinct states had the same fingerprint:
+  calculated (optimistic):  val = 4.5
+  based on the actual fingerprints:  val = .25
+35840434685 states generated, 2461362509 distinct states found, 0 states left on queue.
+The depth of the complete state graph search is 67.
+The average outdegree of the complete state graph is 1 (minimum is 0, the maximum 8 and the 95th percentile is 2).
+Finished in 20h 32min at (2024-11-07 12:03:02)
diff --git a/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a5_t2_l2.cfg-2024-11-06--12-09-32.log b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a5_t2_l2.cfg-2024-11-06--12-09-32.log
new file mode 100644
index 0000000000..c43d52302b
--- /dev/null
+++ b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a5_t2_l2.cfg-2024-11-06--12-09-32.log
@@ -0,0 +1,89 @@
+git revision: 864f4667d
+Platform: Linux neon-dev-arm64-1 6.8.0-48-generic #48-Ubuntu SMP PREEMPT_DYNAMIC Fri Sep 27 14:35:45 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux
+CPU Info Linux: Neoverse-N1
+CPU Cores Linux: 80
+CPU Info Mac: 
+CPU Cores Mac: 
+Spec: MCProposerAcceptorStatic.tla
+Config: models/MCProposerAcceptorStatic_p2_a5_t2_l2.cfg
+----
+CONSTANTS
+NULL = NULL
+proposers = {p1, p2}
+acceptors = {a1, a2, a3, a4, a5}
+max_term = 2
+max_entries = 2
+SPECIFICATION Spec
+CONSTRAINT StateConstraint
+INVARIANT
+TypeOk
+ElectionSafety
+LogIsMonotonic
+LogSafety
+SYMMETRY ProposerAcceptorSymmetry
+CHECK_DEADLOCK FALSE
+ALIAS Alias
+
+----
+
+TLC2 Version 2.20 of Day Month 20?? (rev: f68cb71)
+Running breadth-first search Model-Checking with fp 90 and seed 2164066158568118414 with 80 workers on 80 cores with 54613MB heap and 61440MB offheap memory [pid: 30788] (Linux 6.8.0-48-generic aarch64, Ubuntu 21.0.4 x86_64, OffHeapDiskFPSet, DiskStateQueue).
+Parsing file /home/arseny/neon/safekeeper/spec/MCProposerAcceptorStatic.tla
+Parsing file /tmp/tlc-13824636513165485309/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla)
+Parsing file /home/arseny/neon/safekeeper/spec/ProposerAcceptorStatic.tla
+Parsing file /tmp/tlc-13824636513165485309/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla)
+Parsing file /tmp/tlc-13824636513165485309/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla)
+Parsing file /tmp/tlc-13824636513165485309/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla)
+Parsing file /tmp/tlc-13824636513165485309/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla)
+Parsing file /tmp/tlc-13824636513165485309/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla)
+Parsing file /tmp/tlc-13824636513165485309/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla)
+Semantic processing of module Naturals
+Semantic processing of module Sequences
+Semantic processing of module FiniteSets
+Semantic processing of module TLC
+Semantic processing of module Integers
+Semantic processing of module ProposerAcceptorStatic
+Semantic processing of module TLCExt
+Semantic processing of module _TLCTrace
+Semantic processing of module MCProposerAcceptorStatic
+Starting... (2024-11-06 12:09:33)
+Computing initial states...
+Finished computing initial states: 1 distinct state generated at 2024-11-06 12:09:36.
+Progress(16) at 2024-11-06 12:09:39: 405,675 states generated (405,675 s/min), 18,042 distinct states found (18,042 ds/min), 7,612 states left on queue.
+Progress(23) at 2024-11-06 12:10:39: 12,449,257 states generated (12,043,582 s/min), 467,293 distinct states found (449,251 ds/min), 161,057 states left on queue.
+Progress(25) at 2024-11-06 12:11:39: 24,461,332 states generated (12,012,075 s/min), 861,011 distinct states found (393,718 ds/min), 267,072 states left on queue.
+Progress(26) at 2024-11-06 12:12:39: 36,440,377 states generated (11,979,045 s/min), 1,234,052 distinct states found (373,041 ds/min), 355,372 states left on queue.
+Progress(26) at 2024-11-06 12:13:39: 48,327,873 states generated (11,887,496 s/min), 1,583,736 distinct states found (349,684 ds/min), 425,209 states left on queue.
+Progress(27) at 2024-11-06 12:14:39: 60,246,136 states generated (11,918,263 s/min), 1,933,499 distinct states found (349,763 ds/min), 494,269 states left on queue.
+Progress(28) at 2024-11-06 12:15:39: 71,977,716 states generated (11,731,580 s/min), 2,265,302 distinct states found (331,803 ds/min), 553,777 states left on queue.
+Progress(28) at 2024-11-06 12:16:39: 83,644,537 states generated (11,666,821 s/min), 2,575,451 distinct states found (310,149 ds/min), 594,142 states left on queue.
+Progress(29) at 2024-11-06 12:17:39: 95,287,089 states generated (11,642,552 s/min), 2,888,793 distinct states found (313,342 ds/min), 639,273 states left on queue.
+Progress(29) at 2024-11-06 12:18:39: 107,000,972 states generated (11,713,883 s/min), 3,194,255 distinct states found (305,462 ds/min), 673,353 states left on queue.
+Progress(29) at 2024-11-06 12:19:39: 118,305,248 states generated (11,304,276 s/min), 3,467,775 distinct states found (273,520 ds/min), 692,915 states left on queue.
+Progress(29) at 2024-11-06 12:20:39: 129,954,327 states generated (11,649,079 s/min), 3,763,186 distinct states found (295,411 ds/min), 720,349 states left on queue.
+Progress(29) at 2024-11-06 12:21:39: 141,251,359 states generated (11,297,032 s/min), 4,020,407 distinct states found (257,221 ds/min), 724,036 states left on queue.
+Progress(30) at 2024-11-06 12:22:39: 152,551,873 states generated (11,300,514 s/min), 4,284,278 distinct states found (263,871 ds/min), 733,726 states left on queue.
+Progress(30) at 2024-11-06 12:23:39: 164,324,788 states generated (11,772,915 s/min), 4,569,569 distinct states found (285,291 ds/min), 746,476 states left on queue.
+Progress(30) at 2024-11-06 12:24:39: 175,121,317 states generated (10,796,529 s/min), 4,779,505 distinct states found (209,936 ds/min), 723,070 states left on queue.
+Progress(31) at 2024-11-06 12:25:39: 186,238,236 states generated (11,116,919 s/min), 5,016,034 distinct states found (236,529 ds/min), 712,944 states left on queue.
+Progress(31) at 2024-11-06 12:26:39: 197,884,578 states generated (11,646,342 s/min), 5,276,094 distinct states found (260,060 ds/min), 705,471 states left on queue.
+Progress(31) at 2024-11-06 12:27:39: 208,535,096 states generated (10,650,518 s/min), 5,463,450 distinct states found (187,356 ds/min), 665,661 states left on queue.
+Progress(32) at 2024-11-06 12:28:39: 219,424,829 states generated (10,889,733 s/min), 5,673,673 distinct states found (210,223 ds/min), 637,975 states left on queue.
+Progress(32) at 2024-11-06 12:29:39: 230,906,372 states generated (11,481,543 s/min), 5,903,516 distinct states found (229,843 ds/min), 606,255 states left on queue.
+Progress(33) at 2024-11-06 12:30:39: 241,261,887 states generated (10,355,515 s/min), 6,065,731 distinct states found (162,215 ds/min), 552,728 states left on queue.
+Progress(33) at 2024-11-06 12:31:39: 252,028,921 states generated (10,767,034 s/min), 6,255,487 distinct states found (189,756 ds/min), 509,620 states left on queue.
+Progress(33) at 2024-11-06 12:32:39: 262,856,171 states generated (10,827,250 s/min), 6,431,063 distinct states found (175,576 ds/min), 448,834 states left on queue.
+Progress(34) at 2024-11-06 12:33:39: 273,211,882 states generated (10,355,711 s/min), 6,586,644 distinct states found (155,581 ds/min), 386,905 states left on queue.
+Progress(34) at 2024-11-06 12:34:39: 283,843,415 states generated (10,631,533 s/min), 6,743,916 distinct states found (157,272 ds/min), 315,135 states left on queue.
+Progress(35) at 2024-11-06 12:35:39: 293,931,115 states generated (10,087,700 s/min), 6,878,405 distinct states found (134,489 ds/min), 241,126 states left on queue.
+Progress(36) at 2024-11-06 12:36:39: 303,903,441 states generated (9,972,326 s/min), 6,996,394 distinct states found (117,989 ds/min), 152,775 states left on queue.
+Progress(37) at 2024-11-06 12:37:39: 313,501,886 states generated (9,598,445 s/min), 7,093,031 distinct states found (96,637 ds/min), 54,009 states left on queue.
+Model checking completed. No error has been found.
+  Estimates of the probability that TLC did not check all reachable states
+  because two distinct states had the same fingerprint:
+  calculated (optimistic):  val = 1.2E-4
+  based on the actual fingerprints:  val = 2.1E-6
+318172398 states generated, 7127950 distinct states found, 0 states left on queue.
+The depth of the complete state graph search is 44.
+The average outdegree of the complete state graph is 1 (minimum is 0, the maximum 9 and the 95th percentile is 3).
+Finished in 28min 43s at (2024-11-06 12:38:16)

From 243bca1c49ec93444050412e460caa4659969d9c Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Mon, 2 Dec 2024 18:24:48 +0100
Subject: [PATCH 029/117] Bump OTel, tracing, reqwest crates (#9970)

---
 Cargo.lock | 158 ++++++++++++++++++++++++++---------------------------
 Cargo.toml |  16 +++---
 2 files changed, 86 insertions(+), 88 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 5ce27a7d45..ba02e3b11d 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -185,7 +185,7 @@ checksum = "965c2d33e53cb6b267e148a4cb0760bc01f4904c1cd4bb4002a085bb016d1490"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.90",
  "synstructure",
 ]
 
@@ -197,7 +197,7 @@ checksum = "7b18050c2cd6fe86c3a76584ef5e0baf286d038cda203eb6223df2cc413565f7"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -256,7 +256,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -267,7 +267,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -969,7 +969,7 @@ dependencies = [
  "regex",
  "rustc-hash",
  "shlex",
- "syn 2.0.52",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -1198,7 +1198,7 @@ dependencies = [
  "heck 0.4.1",
  "proc-macro2",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -1615,7 +1615,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "strsim",
- "syn 2.0.52",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -1626,7 +1626,7 @@ checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a"
 dependencies = [
  "darling_core",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -1749,7 +1749,7 @@ dependencies = [
  "dsl_auto_type",
  "proc-macro2",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -1769,7 +1769,7 @@ version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "209c735641a413bc68c4923a9d6ad4bcb3ca306b794edaa7eb0b3228a99ffb25"
 dependencies = [
- "syn 2.0.52",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -1792,7 +1792,7 @@ checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -1815,7 +1815,7 @@ dependencies = [
  "heck 0.5.0",
  "proc-macro2",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -1947,7 +1947,7 @@ dependencies = [
  "darling",
  "proc-macro2",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -1980,7 +1980,7 @@ checksum = "3bf679796c0322556351f287a51b49e48f7c4986e727b5dd78c972d30e2e16cc"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -2234,7 +2234,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -2337,7 +2337,7 @@ checksum = "53010ccb100b96a67bc32c0175f0ed1426b31b655d562898e57325f81c023ac0"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -3142,7 +3142,7 @@ dependencies = [
  "heck 0.5.0",
  "proc-macro2",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -3515,9 +3515,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
 
 [[package]]
 name = "opentelemetry"
-version = "0.24.0"
+version = "0.26.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4c365a63eec4f55b7efeceb724f1336f26a9cf3427b70e59e2cd2a5b947fba96"
+checksum = "570074cc999d1a58184080966e5bd3bf3a9a4af650c3b05047c2621e7405cd17"
 dependencies = [
  "futures-core",
  "futures-sink",
@@ -3529,9 +3529,9 @@ dependencies = [
 
 [[package]]
 name = "opentelemetry-http"
-version = "0.13.0"
+version = "0.26.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ad31e9de44ee3538fb9d64fe3376c1362f406162434609e79aea2a41a0af78ab"
+checksum = "6351496aeaa49d7c267fb480678d85d1cd30c5edb20b497c48c56f62a8c14b99"
 dependencies = [
  "async-trait",
  "bytes",
@@ -3542,9 +3542,9 @@ dependencies = [
 
 [[package]]
 name = "opentelemetry-otlp"
-version = "0.17.0"
+version = "0.26.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6b925a602ffb916fb7421276b86756027b37ee708f9dce2dbdcc51739f07e727"
+checksum = "29e1f9c8b032d4f635c730c0efcf731d5e2530ea13fa8bef7939ddc8420696bd"
 dependencies = [
  "async-trait",
  "futures-core",
@@ -3560,9 +3560,9 @@ dependencies = [
 
 [[package]]
 name = "opentelemetry-proto"
-version = "0.7.0"
+version = "0.26.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "30ee9f20bff9c984511a02f082dc8ede839e4a9bf15cc2487c8d6fea5ad850d9"
+checksum = "c9d3968ce3aefdcca5c27e3c4ea4391b37547726a70893aab52d3de95d5f8b34"
 dependencies = [
  "opentelemetry",
  "opentelemetry_sdk",
@@ -3572,15 +3572,15 @@ dependencies = [
 
 [[package]]
 name = "opentelemetry-semantic-conventions"
-version = "0.16.0"
+version = "0.26.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1cefe0543875379e47eb5f1e68ff83f45cc41366a92dfd0d073d513bf68e9a05"
+checksum = "db945c1eaea8ac6a9677185357480d215bb6999faa9f691d0c4d4d641eab7a09"
 
 [[package]]
 name = "opentelemetry_sdk"
-version = "0.24.1"
+version = "0.26.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "692eac490ec80f24a17828d49b40b60f5aeaccdfe6a503f939713afd22bc28df"
+checksum = "d2c627d9f4c9cdc1f21a29ee4bfbd6028fcb8bcf2a857b43f3abdf72c9c862f3"
 dependencies = [
  "async-trait",
  "futures-channel",
@@ -3954,7 +3954,7 @@ dependencies = [
  "parquet",
  "proc-macro2",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -4056,7 +4056,7 @@ checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -4334,7 +4334,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8d3928fb5db768cb86f891ff014f0144589297e3c6a1aba6ed7cecfdace270c7"
 dependencies = [
  "proc-macro2",
- "syn 2.0.52",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -4348,9 +4348,9 @@ dependencies = [
 
 [[package]]
 name = "proc-macro2"
-version = "1.0.78"
+version = "1.0.92"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae"
+checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0"
 dependencies = [
  "unicode-ident",
 ]
@@ -4424,7 +4424,7 @@ dependencies = [
  "prost",
  "prost-types",
  "regex",
- "syn 2.0.52",
+ "syn 2.0.90",
  "tempfile",
 ]
 
@@ -4438,7 +4438,7 @@ dependencies = [
  "itertools 0.12.1",
  "proc-macro2",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -4992,9 +4992,9 @@ dependencies = [
 
 [[package]]
 name = "reqwest-middleware"
-version = "0.3.0"
+version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0209efb52486ad88136190094ee214759ef7507068b27992256ed6610eb71a01"
+checksum = "d1ccd3b55e711f91a9885a2fa6fbbb2e39db1776420b062efc058c6410f7e5e3"
 dependencies = [
  "anyhow",
  "async-trait",
@@ -5007,13 +5007,12 @@ dependencies = [
 
 [[package]]
 name = "reqwest-retry"
-version = "0.5.0"
+version = "0.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "40f342894422862af74c50e1e9601cf0931accc9c6981e5eb413c46603b616b5"
+checksum = "29c73e4195a6bfbcb174b790d9b3407ab90646976c55de58a6515da25d851178"
 dependencies = [
  "anyhow",
  "async-trait",
- "chrono",
  "futures",
  "getrandom 0.2.11",
  "http 1.1.0",
@@ -5022,6 +5021,7 @@ dependencies = [
  "reqwest 0.12.4",
  "reqwest-middleware",
  "retry-policies",
+ "thiserror",
  "tokio",
  "tracing",
  "wasm-timer",
@@ -5029,9 +5029,9 @@ dependencies = [
 
 [[package]]
 name = "reqwest-tracing"
-version = "0.5.3"
+version = "0.5.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bfdd9bfa64c72233d8dd99ab7883efcdefe9e16d46488ecb9228b71a2e2ceb45"
+checksum = "ff82cf5730a1311fb9413b0bc2b8e743e0157cd73f010ab4ec374a923873b6a2"
 dependencies = [
  "anyhow",
  "async-trait",
@@ -5047,12 +5047,10 @@ dependencies = [
 
 [[package]]
 name = "retry-policies"
-version = "0.3.0"
+version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "493b4243e32d6eedd29f9a398896e35c6943a123b55eec97dcaee98310d25810"
+checksum = "5875471e6cab2871bc150ecb8c727db5113c9338cc3354dc5ee3425b6aa40a1c"
 dependencies = [
- "anyhow",
- "chrono",
  "rand 0.8.5",
 ]
 
@@ -5176,7 +5174,7 @@ dependencies = [
  "regex",
  "relative-path",
  "rustc_version",
- "syn 2.0.52",
+ "syn 2.0.90",
  "unicode-ident",
 ]
 
@@ -5684,7 +5682,7 @@ checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -5766,7 +5764,7 @@ dependencies = [
  "darling",
  "proc-macro2",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -6139,7 +6137,7 @@ dependencies = [
  "proc-macro2",
  "quote",
  "rustversion",
- "syn 2.0.52",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -6190,9 +6188,9 @@ dependencies = [
 
 [[package]]
 name = "syn"
-version = "2.0.52"
+version = "2.0.90"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07"
+checksum = "919d3b74a5dd0ccd15aeb8f93e7006bd9e14c295087c9896a110f490752bcf31"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -6222,7 +6220,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -6300,27 +6298,27 @@ checksum = "78ea17a2dc368aeca6f554343ced1b1e31f76d63683fa8016e5844bd7a5144a1"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.90",
 ]
 
 [[package]]
 name = "thiserror"
-version = "1.0.57"
+version = "1.0.69"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e45bcbe8ed29775f228095caf2cd67af7a4ccf756ebff23a306bf3e8b47b24b"
+checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
 dependencies = [
  "thiserror-impl",
 ]
 
 [[package]]
 name = "thiserror-impl"
-version = "1.0.57"
+version = "1.0.69"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a953cb265bef375dae3de6663da4d3804eee9682ea80d8e2542529b73c531c81"
+checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -6494,7 +6492,7 @@ checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -6719,7 +6717,7 @@ dependencies = [
  "prost-build",
  "prost-types",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -6756,9 +6754,9 @@ checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52"
 
 [[package]]
 name = "tracing"
-version = "0.1.40"
+version = "0.1.41"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef"
+checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0"
 dependencies = [
  "log",
  "pin-project-lite",
@@ -6779,20 +6777,20 @@ dependencies = [
 
 [[package]]
 name = "tracing-attributes"
-version = "0.1.27"
+version = "0.1.28"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7"
+checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.90",
 ]
 
 [[package]]
 name = "tracing-core"
-version = "0.1.32"
+version = "0.1.33"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54"
+checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c"
 dependencies = [
  "once_cell",
  "valuable",
@@ -6821,9 +6819,9 @@ dependencies = [
 
 [[package]]
 name = "tracing-opentelemetry"
-version = "0.25.0"
+version = "0.27.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a9784ed4da7d921bc8df6963f8c80a0e4ce34ba6ba76668acadd3edbd985ff3b"
+checksum = "dc58af5d3f6c5811462cabb3289aec0093f7338e367e5a33d28c0433b3c7360b"
 dependencies = [
  "js-sys",
  "once_cell",
@@ -6839,9 +6837,9 @@ dependencies = [
 
 [[package]]
 name = "tracing-serde"
-version = "0.1.3"
+version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bc6b213177105856957181934e4920de57730fc69bf42c37ee5bb664d406d9e1"
+checksum = "704b1aeb7be0d0a84fc9828cae51dab5970fee5088f83d1dd7ee6f6246fc6ff1"
 dependencies = [
  "serde",
  "tracing-core",
@@ -6849,9 +6847,9 @@ dependencies = [
 
 [[package]]
 name = "tracing-subscriber"
-version = "0.3.18"
+version = "0.3.19"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b"
+checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008"
 dependencies = [
  "matchers",
  "once_cell",
@@ -7258,7 +7256,7 @@ dependencies = [
  "once_cell",
  "proc-macro2",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.90",
  "wasm-bindgen-shared",
 ]
 
@@ -7292,7 +7290,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.90",
  "wasm-bindgen-backend",
  "wasm-bindgen-shared",
 ]
@@ -7669,7 +7667,7 @@ dependencies = [
  "smallvec",
  "spki 0.7.3",
  "subtle",
- "syn 2.0.52",
+ "syn 2.0.90",
  "sync_wrapper 0.1.2",
  "tikv-jemalloc-sys",
  "time",
@@ -7769,7 +7767,7 @@ checksum = "b3c129550b3e6de3fd0ba67ba5c81818f9805e58b8d7fee80a3a59d2c9fc601a"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.90",
 ]
 
 [[package]]
@@ -7790,7 +7788,7 @@ checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn 2.0.52",
+ "syn 2.0.90",
 ]
 
 [[package]]
diff --git a/Cargo.toml b/Cargo.toml
index 64c384f17a..036dc01057 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -127,10 +127,10 @@ notify = "6.0.0"
 num_cpus = "1.15"
 num-traits = "0.2.15"
 once_cell = "1.13"
-opentelemetry = "0.24"
-opentelemetry_sdk = "0.24"
-opentelemetry-otlp = { version = "0.17", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
-opentelemetry-semantic-conventions = "0.16"
+opentelemetry = "0.26"
+opentelemetry_sdk = "0.26"
+opentelemetry-otlp = { version = "0.26", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
+opentelemetry-semantic-conventions = "0.26"
 parking_lot = "0.12"
 parquet = { version = "53", default-features = false, features = ["zstd"] }
 parquet_derive = "53"
@@ -144,9 +144,9 @@ rand = "0.8"
 redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
 regex = "1.10.2"
 reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] }
-reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_24"] }
-reqwest-middleware = "0.3.0"
-reqwest-retry = "0.5"
+reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_26"] }
+reqwest-middleware = "0.4"
+reqwest-retry = "0.7"
 routerify = "3"
 rpds = "0.13"
 rustc-hash = "1.1.0"
@@ -191,7 +191,7 @@ tonic = {version = "0.12.3", features = ["tls", "tls-roots"]}
 tower-service = "0.3.2"
 tracing = "0.1"
 tracing-error = "0.2"
-tracing-opentelemetry = "0.25"
+tracing-opentelemetry = "0.27"
 tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
 try-lock = "0.2.5"
 twox-hash = { version = "1.6.3", default-features = false }

From 2dc238e5b3486a6f9e8d20d62731515a864d8281 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Mon, 2 Dec 2024 17:54:32 +0000
Subject: [PATCH 030/117] feat(proxy): emit JWT auth method and JWT issuer in
 parquet logs (#9971)

Fix the HTTP AuthMethod to accomodate the JWT authorization method.
Introduces the JWT issuer as an additional field in the parquet logs
---
 proxy/src/auth/backend/jwt.rs         | 10 +++--
 proxy/src/context/mod.rs              |  9 +++++
 proxy/src/context/parquet.rs          | 53 +++++++++++++++------------
 proxy/src/serverless/backend.rs       |  4 ++
 proxy/src/serverless/sql_over_http.rs |  3 --
 5 files changed, 49 insertions(+), 30 deletions(-)

diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs
index 517d4fd34b..a258090b15 100644
--- a/proxy/src/auth/backend/jwt.rs
+++ b/proxy/src/auth/backend/jwt.rs
@@ -350,6 +350,13 @@ impl JwkCacheEntryLock {
         let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD)?;
         let header = serde_json::from_slice::<JwtHeader<'_>>(&header)?;
 
+        let payloadb = base64::decode_config(payload, base64::URL_SAFE_NO_PAD)?;
+        let payload = serde_json::from_slice::<JwtPayload<'_>>(&payloadb)?;
+
+        if let Some(iss) = &payload.issuer {
+            ctx.set_jwt_issuer(iss.as_ref().to_owned());
+        }
+
         let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)?;
 
         let kid = header.key_id.ok_or(JwtError::MissingKeyId)?;
@@ -388,9 +395,6 @@ impl JwkCacheEntryLock {
             key => return Err(JwtError::UnsupportedKeyType(key.into())),
         };
 
-        let payloadb = base64::decode_config(payload, base64::URL_SAFE_NO_PAD)?;
-        let payload = serde_json::from_slice::<JwtPayload<'_>>(&payloadb)?;
-
         tracing::debug!(?payload, "JWT signature valid with claims");
 
         if let Some(aud) = expected_audience {
diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs
index 4a063a5faa..a9fb513d3c 100644
--- a/proxy/src/context/mod.rs
+++ b/proxy/src/context/mod.rs
@@ -57,6 +57,7 @@ struct RequestContextInner {
     application: Option<SmolStr>,
     error_kind: Option<ErrorKind>,
     pub(crate) auth_method: Option<AuthMethod>,
+    jwt_issuer: Option<String>,
     success: bool,
     pub(crate) cold_start_info: ColdStartInfo,
     pg_options: Option<StartupMessageParams>,
@@ -79,6 +80,7 @@ pub(crate) enum AuthMethod {
     ScramSha256,
     ScramSha256Plus,
     Cleartext,
+    Jwt,
 }
 
 impl Clone for RequestContext {
@@ -100,6 +102,7 @@ impl Clone for RequestContext {
             application: inner.application.clone(),
             error_kind: inner.error_kind,
             auth_method: inner.auth_method.clone(),
+            jwt_issuer: inner.jwt_issuer.clone(),
             success: inner.success,
             rejected: inner.rejected,
             cold_start_info: inner.cold_start_info,
@@ -148,6 +151,7 @@ impl RequestContext {
             application: None,
             error_kind: None,
             auth_method: None,
+            jwt_issuer: None,
             success: false,
             rejected: None,
             cold_start_info: ColdStartInfo::Unknown,
@@ -246,6 +250,11 @@ impl RequestContext {
         this.auth_method = Some(auth_method);
     }
 
+    pub(crate) fn set_jwt_issuer(&self, jwt_issuer: String) {
+        let mut this = self.0.try_lock().expect("should not deadlock");
+        this.jwt_issuer = Some(jwt_issuer);
+    }
+
     pub fn has_private_peer_addr(&self) -> bool {
         self.0
             .try_lock()
diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs
index b375eb886e..3105d08526 100644
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -87,6 +87,8 @@ pub(crate) struct RequestData {
     branch: Option<String>,
     pg_options: Option<String>,
     auth_method: Option<&'static str>,
+    jwt_issuer: Option<String>,
+
     error: Option<&'static str>,
     /// Success is counted if we form a HTTP response with sql rows inside
     /// Or if we make it to proxy_pass
@@ -138,7 +140,9 @@ impl From<&RequestContextInner> for RequestData {
                 super::AuthMethod::ScramSha256 => "scram_sha_256",
                 super::AuthMethod::ScramSha256Plus => "scram_sha_256_plus",
                 super::AuthMethod::Cleartext => "cleartext",
+                super::AuthMethod::Jwt => "jwt",
             }),
+            jwt_issuer: value.jwt_issuer.clone(),
             protocol: value.protocol.as_str(),
             region: value.region,
             error: value.error_kind.as_ref().map(|e| e.to_metric_label()),
@@ -519,6 +523,7 @@ mod tests {
             branch: Some(hex::encode(rng.gen::<[u8; 16]>())),
             pg_options: None,
             auth_method: None,
+            jwt_issuer: None,
             protocol: ["tcp", "ws", "http"][rng.gen_range(0..3)],
             region: "us-east-1",
             error: None,
@@ -599,15 +604,15 @@ mod tests {
         assert_eq!(
             file_stats,
             [
-                (1312632, 3, 6000),
-                (1312621, 3, 6000),
-                (1312680, 3, 6000),
-                (1312637, 3, 6000),
-                (1312773, 3, 6000),
-                (1312610, 3, 6000),
-                (1312404, 3, 6000),
-                (1312639, 3, 6000),
-                (437848, 1, 2000)
+                (1313105, 3, 6000),
+                (1313094, 3, 6000),
+                (1313153, 3, 6000),
+                (1313110, 3, 6000),
+                (1313246, 3, 6000),
+                (1313083, 3, 6000),
+                (1312877, 3, 6000),
+                (1313112, 3, 6000),
+                (438020, 1, 2000)
             ]
         );
 
@@ -639,11 +644,11 @@ mod tests {
         assert_eq!(
             file_stats,
             [
-                (1203465, 5, 10000),
-                (1203189, 5, 10000),
-                (1203490, 5, 10000),
-                (1203475, 5, 10000),
-                (1203729, 5, 10000)
+                (1204324, 5, 10000),
+                (1204048, 5, 10000),
+                (1204349, 5, 10000),
+                (1204334, 5, 10000),
+                (1204588, 5, 10000)
             ]
         );
 
@@ -668,15 +673,15 @@ mod tests {
         assert_eq!(
             file_stats,
             [
-                (1312632, 3, 6000),
-                (1312621, 3, 6000),
-                (1312680, 3, 6000),
-                (1312637, 3, 6000),
-                (1312773, 3, 6000),
-                (1312610, 3, 6000),
-                (1312404, 3, 6000),
-                (1312639, 3, 6000),
-                (437848, 1, 2000)
+                (1313105, 3, 6000),
+                (1313094, 3, 6000),
+                (1313153, 3, 6000),
+                (1313110, 3, 6000),
+                (1313246, 3, 6000),
+                (1313083, 3, 6000),
+                (1312877, 3, 6000),
+                (1313112, 3, 6000),
+                (438020, 1, 2000)
             ]
         );
 
@@ -713,7 +718,7 @@ mod tests {
         // files are smaller than the size threshold, but they took too long to fill so were flushed early
         assert_eq!(
             file_stats,
-            [(657696, 2, 3001), (657410, 2, 3000), (657206, 2, 2999)]
+            [(658014, 2, 3001), (657728, 2, 3000), (657524, 2, 2999)]
         );
 
         tmpdir.close().unwrap();
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index 75909f3358..57846a4c2c 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -53,6 +53,8 @@ impl PoolingBackend {
         user_info: &ComputeUserInfo,
         password: &[u8],
     ) -> Result<ComputeCredentials, AuthError> {
+        ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
+
         let user_info = user_info.clone();
         let backend = self.auth_backend.as_ref().map(|()| user_info.clone());
         let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?;
@@ -115,6 +117,8 @@ impl PoolingBackend {
         user_info: &ComputeUserInfo,
         jwt: String,
     ) -> Result<ComputeCredentials, AuthError> {
+        ctx.set_auth_method(crate::context::AuthMethod::Jwt);
+
         match &self.auth_backend {
             crate::auth::Backend::ControlPlane(console, ()) => {
                 self.config
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index afd93d02f0..a0ca7cc60d 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -139,9 +139,6 @@ fn get_conn_info(
     headers: &HeaderMap,
     tls: Option<&TlsConfig>,
 ) -> Result<ConnInfoWithAuth, ConnInfoError> {
-    // HTTP only uses cleartext (for now and likely always)
-    ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
-
     let connection_string = headers
         .get(&CONN_STRING)
         .ok_or(ConnInfoError::InvalidHeader(&CONN_STRING))?

From d8ebd33fe6f3cf0fb154a380e1397ff392d9437c Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Mon, 2 Dec 2024 12:06:19 -0600
Subject: [PATCH 031/117] Stop changing the value of neon.extension_server_port
 at runtime (#9972)

On reconfigure, we no longer passed a port for the extension server
which caused us to not write out the neon.extension_server_port line.
Thus, Postgres thought we were setting the port to the default value of
0. PGC_POSTMASTER GUCs cannot be set at runtime, which causes the
following log messages:

> LOG: parameter "neon.extension_server_port" cannot be changed without
restarting the server
> LOG: configuration file
"/var/db/postgres/compute/pgdata/postgresql.conf" contains errors;
unaffected changes were applied

Fixes: https://github.com/neondatabase/neon/issues/9945

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 compute_tools/src/bin/compute_ctl.rs |  9 ++-------
 compute_tools/src/compute.rs         | 19 +++++++------------
 compute_tools/src/config.rs          |  6 ++----
 3 files changed, 11 insertions(+), 23 deletions(-)

diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs
index b178d7abd6..e73ccd908e 100644
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -335,6 +335,7 @@ fn wait_spec(
         pgdata: pgdata.to_string(),
         pgbin: pgbin.to_string(),
         pgversion: get_pg_version_string(pgbin),
+        http_port,
         live_config_allowed,
         state: Mutex::new(new_state),
         state_changed: Condvar::new(),
@@ -389,7 +390,6 @@ fn wait_spec(
 
     Ok(WaitSpecResult {
         compute,
-        http_port,
         resize_swap_on_bind,
         set_disk_quota_for_fs: set_disk_quota_for_fs.cloned(),
     })
@@ -397,8 +397,6 @@ fn wait_spec(
 
 struct WaitSpecResult {
     compute: Arc<ComputeNode>,
-    // passed through from ProcessCliResult
-    http_port: u16,
     resize_swap_on_bind: bool,
     set_disk_quota_for_fs: Option<String>,
 }
@@ -408,7 +406,6 @@ fn start_postgres(
     #[allow(unused_variables)] matches: &clap::ArgMatches,
     WaitSpecResult {
         compute,
-        http_port,
         resize_swap_on_bind,
         set_disk_quota_for_fs,
     }: WaitSpecResult,
@@ -481,12 +478,10 @@ fn start_postgres(
         }
     }
 
-    let extension_server_port: u16 = http_port;
-
     // Start Postgres
     let mut pg = None;
     if !prestartup_failed {
-        pg = match compute.start_compute(extension_server_port) {
+        pg = match compute.start_compute() {
             Ok(pg) => Some(pg),
             Err(err) => {
                 error!("could not start the compute node: {:#}", err);
diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index da1caf1a9b..0d1e6d680f 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -79,6 +79,8 @@ pub struct ComputeNode {
     /// - we push spec and it does configuration
     /// - but then it is restarted without any spec again
     pub live_config_allowed: bool,
+    /// The port that the compute's HTTP server listens on
+    pub http_port: u16,
     /// Volatile part of the `ComputeNode`, which should be used under `Mutex`.
     /// To allow HTTP API server to serving status requests, while configuration
     /// is in progress, lock should be held only for short periods of time to do
@@ -611,11 +613,7 @@ impl ComputeNode {
     /// Do all the preparations like PGDATA directory creation, configuration,
     /// safekeepers sync, basebackup, etc.
     #[instrument(skip_all)]
-    pub fn prepare_pgdata(
-        &self,
-        compute_state: &ComputeState,
-        extension_server_port: u16,
-    ) -> Result<()> {
+    pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> {
         let pspec = compute_state.pspec.as_ref().expect("spec must be set");
         let spec = &pspec.spec;
         let pgdata_path = Path::new(&self.pgdata);
@@ -625,7 +623,7 @@ impl ComputeNode {
         config::write_postgres_conf(
             &pgdata_path.join("postgresql.conf"),
             &pspec.spec,
-            Some(extension_server_port),
+            self.http_port,
         )?;
 
         // Syncing safekeepers is only safe with primary nodes: if a primary
@@ -1243,7 +1241,7 @@ impl ComputeNode {
         // Write new config
         let pgdata_path = Path::new(&self.pgdata);
         let postgresql_conf_path = pgdata_path.join("postgresql.conf");
-        config::write_postgres_conf(&postgresql_conf_path, &spec, None)?;
+        config::write_postgres_conf(&postgresql_conf_path, &spec, self.http_port)?;
 
         // TODO(ololobus): We need a concurrency during reconfiguration as well,
         // but DB is already running and used by user. We can easily get out of
@@ -1284,10 +1282,7 @@ impl ComputeNode {
     }
 
     #[instrument(skip_all)]
-    pub fn start_compute(
-        &self,
-        extension_server_port: u16,
-    ) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> {
+    pub fn start_compute(&self) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> {
         let compute_state = self.state.lock().unwrap().clone();
         let pspec = compute_state.pspec.as_ref().expect("spec must be set");
         info!(
@@ -1362,7 +1357,7 @@ impl ComputeNode {
             info!("{:?}", remote_ext_metrics);
         }
 
-        self.prepare_pgdata(&compute_state, extension_server_port)?;
+        self.prepare_pgdata(&compute_state)?;
 
         let start_time = Utc::now();
         let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?;
diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs
index d65fe73194..b257c8a68f 100644
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -37,7 +37,7 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
 pub fn write_postgres_conf(
     path: &Path,
     spec: &ComputeSpec,
-    extension_server_port: Option<u16>,
+    extension_server_port: u16,
 ) -> Result<()> {
     // File::create() destroys the file content if it exists.
     let mut file = File::create(path)?;
@@ -127,9 +127,7 @@ pub fn write_postgres_conf(
         writeln!(file, "# Managed by compute_ctl: end")?;
     }
 
-    if let Some(port) = extension_server_port {
-        writeln!(file, "neon.extension_server_port={}", port)?;
-    }
+    writeln!(file, "neon.extension_server_port={}", extension_server_port)?;
 
     // This is essential to keep this line at the end of the file,
     // because it is intended to override any settings above.

From 2e9207fdf3161799509527b6f8d4423fea718559 Mon Sep 17 00:00:00 2001
From: Alexey Kondratov <kondratov.aleksey@gmail.com>
Date: Mon, 2 Dec 2024 19:46:06 +0100
Subject: [PATCH 032/117] fix(testing): Use 1 MB shared_buffers even with LFC
 (#9969)

## Problem

After enabling LFC in tests and lowering `shared_buffers` we started
having more problems with `test_pg_regress`.

## Summary of changes

Set `shared_buffers` to 1MB to both exercise getPage requests/LFC, and
still have enough room for Postgres to operate. Everything smaller might
be not enough for Postgres under load, and can cause errors like 'no
unpinned buffers available'.

See Konstantin's comment [1] as well.

Fixes #9956

[1]:
https://github.com/neondatabase/neon/issues/9956#issuecomment-2511608097
---
 control_plane/src/endpoint.rs         | 4 ++++
 test_runner/fixtures/neon_fixtures.py | 6 ++----
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index 71514daa7c..1ca6dc43c4 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -310,6 +310,10 @@ impl Endpoint {
         conf.append("wal_log_hints", "off");
         conf.append("max_replication_slots", "10");
         conf.append("hot_standby", "on");
+        // Set to 1MB to both exercise getPage requests/LFC, and still have enough room for
+        // Postgres to operate. Everything smaller might be not enough for Postgres under load,
+        // and can cause errors like 'no unpinned buffers available', see
+        // <https://github.com/neondatabase/neon/issues/9956>
         conf.append("shared_buffers", "1MB");
         conf.append("fsync", "off");
         conf.append("max_connections", "100");
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 5709a3b82b..f55f06bebc 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -3801,13 +3801,11 @@ class Endpoint(PgProtocol, LogUtils):
                     assert size_to_bytes(size) >= size_to_bytes(
                         "1MB"
                     ), "LFC size cannot be set less than 1MB"
-            # shared_buffers = 512kB to make postgres use LFC intensively
-            # neon.max_file_cache_size and neon.file_cache size limit are
-            # set to 1MB because small LFC is better for testing (helps to find more problems)
             lfc_path_escaped = str(lfc_path).replace("'", "''")
             config_lines = [
-                "shared_buffers = 512kB",
                 f"neon.file_cache_path = '{lfc_path_escaped}'",
+                # neon.max_file_cache_size and neon.file_cache size limits are
+                # set to 1MB because small LFC is better for testing (helps to find more problems)
                 "neon.max_file_cache_size = 1MB",
                 "neon.file_cache_size_limit = 1MB",
             ] + config_lines

From aaee713e538c6541f9a54c4aef299762d5081b16 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 3 Dec 2024 08:59:38 +0000
Subject: [PATCH 033/117] storcon: use proper schedule context during node
 delete (#9958)

## Problem

I was touching `test_storage_controller_node_deletion` because for AZ
scheduling work I was adding a change to the storage controller (kick
secondaries during optimisation) that made a FIXME in this test defunct.
While looking at it I also realized that we can easily fix the way node
deletion currently doesn't use a proper ScheduleContext, using the
iterator type recently added for that purpose.

## Summary of changes

- A testing-only behavior in storage controller where if a secondary
location isn't yet ready during optimisation, it will be actively
polled.
- Remove workaround in `test_storage_controller_node_deletion` that
previously was needed because optimisation would get stuck on cold
secondaries.
- Update node deletion code to use a `TenantShardContextIterator` and
thereby a proper ScheduleContext
---
 storage_controller/src/service.rs             | 112 ++++++++++++++----
 test_runner/regress/test_sharding.py          |   7 ++
 .../regress/test_storage_controller.py        |   8 +-
 3 files changed, 96 insertions(+), 31 deletions(-)

diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 631fdb4923..52c9c4710d 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -5158,34 +5158,38 @@ impl Service {
                 *nodes = Arc::new(nodes_mut);
             }
 
-            for (tenant_shard_id, shard) in tenants {
-                if shard.deref_node(node_id) {
-                    // FIXME: we need to build a ScheduleContext that reflects this shard's peers, otherwise
-                    // it won't properly do anti-affinity.
-                    let mut schedule_context = ScheduleContext::default();
+            for (_tenant_id, mut schedule_context, shards) in
+                TenantShardContextIterator::new(tenants, ScheduleMode::Normal)
+            {
+                for shard in shards {
+                    if shard.deref_node(node_id) {
+                        if let Err(e) = shard.schedule(scheduler, &mut schedule_context) {
+                            // TODO: implement force flag to remove a node even if we can't reschedule
+                            // a tenant
+                            tracing::error!(
+                                "Refusing to delete node, shard {} can't be rescheduled: {e}",
+                                shard.tenant_shard_id
+                            );
+                            return Err(e.into());
+                        } else {
+                            tracing::info!(
+                                "Rescheduled shard {} away from node during deletion",
+                                shard.tenant_shard_id
+                            )
+                        }
 
-                    if let Err(e) = shard.schedule(scheduler, &mut schedule_context) {
-                        // TODO: implement force flag to remove a node even if we can't reschedule
-                        // a tenant
-                        tracing::error!("Refusing to delete node, shard {tenant_shard_id} can't be rescheduled: {e}");
-                        return Err(e.into());
-                    } else {
-                        tracing::info!(
-                            "Rescheduled shard {tenant_shard_id} away from node during deletion"
-                        )
+                        self.maybe_reconcile_shard(shard, nodes);
                     }
 
-                    self.maybe_reconcile_shard(shard, nodes);
+                    // Here we remove an existing observed location for the node we're removing, and it will
+                    // not be re-added by a reconciler's completion because we filter out removed nodes in
+                    // process_result.
+                    //
+                    // Note that we update the shard's observed state _after_ calling maybe_reconcile_shard: that
+                    // means any reconciles we spawned will know about the node we're deleting, enabling them
+                    // to do live migrations if it's still online.
+                    shard.observed.locations.remove(&node_id);
                 }
-
-                // Here we remove an existing observed location for the node we're removing, and it will
-                // not be re-added by a reconciler's completion because we filter out removed nodes in
-                // process_result.
-                //
-                // Note that we update the shard's observed state _after_ calling maybe_reconcile_shard: that
-                // means any reconciles we spawned will know about the node we're deleting, enabling them
-                // to do live migrations if it's still online.
-                shard.observed.locations.remove(&node_id);
             }
 
             scheduler.node_remove(node_id);
@@ -6279,6 +6283,14 @@ impl Service {
                             > DOWNLOAD_FRESHNESS_THRESHOLD
                     {
                         tracing::info!("Skipping migration of {tenant_shard_id} to {node} because secondary isn't ready: {progress:?}");
+
+                        #[cfg(feature = "testing")]
+                        if progress.heatmap_mtime.is_none() {
+                            // No heatmap might mean the attached location has never uploaded one, or that
+                            // the secondary download hasn't happened yet.  This is relatively unusual in the field,
+                            // but fairly common in tests.
+                            self.kick_secondary_download(tenant_shard_id).await;
+                        }
                     } else {
                         // Location looks ready: proceed
                         tracing::info!(
@@ -6293,6 +6305,58 @@ impl Service {
         validated_work
     }
 
+    /// Some aspects of scheduling optimisation wait for secondary locations to be warm.  This
+    /// happens on multi-minute timescales in the field, which is fine because optimisation is meant
+    /// to be a lazy background thing. However, when testing, it is not practical to wait around, so
+    /// we have this helper to move things along faster.
+    #[cfg(feature = "testing")]
+    async fn kick_secondary_download(&self, tenant_shard_id: TenantShardId) {
+        let (attached_node, secondary_node) = {
+            let locked = self.inner.read().unwrap();
+            let Some(shard) = locked.tenants.get(&tenant_shard_id) else {
+                return;
+            };
+            let (Some(attached), Some(secondary)) = (
+                shard.intent.get_attached(),
+                shard.intent.get_secondary().first(),
+            ) else {
+                return;
+            };
+            (
+                locked.nodes.get(attached).unwrap().clone(),
+                locked.nodes.get(secondary).unwrap().clone(),
+            )
+        };
+
+        // Make remote API calls to upload + download heatmaps: we ignore errors because this is just
+        // a 'kick' to let scheduling optimisation run more promptly.
+        attached_node
+            .with_client_retries(
+                |client| async move { client.tenant_heatmap_upload(tenant_shard_id).await },
+                &self.config.jwt_token,
+                3,
+                10,
+                SHORT_RECONCILE_TIMEOUT,
+                &self.cancel,
+            )
+            .await;
+
+        secondary_node
+            .with_client_retries(
+                |client| async move {
+                    client
+                        .tenant_secondary_download(tenant_shard_id, Some(Duration::from_secs(1)))
+                        .await
+                },
+                &self.config.jwt_token,
+                3,
+                10,
+                SHORT_RECONCILE_TIMEOUT,
+                &self.cancel,
+            )
+            .await;
+    }
+
     /// Look for shards which are oversized and in need of splitting
     async fn autosplit_tenants(self: &Arc<Self>) {
         let Some(split_threshold) = self.config.split_threshold else {
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index c86ba0d4ea..30abf91d3a 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -519,6 +519,13 @@ def test_sharding_split_smoke(
     # We will have 2 shards per pageserver once done (including secondaries)
     neon_env_builder.num_pageservers = split_shard_count
 
+    # Two AZs
+    def assign_az(ps_cfg):
+        az = f"az-{(ps_cfg['id'] - 1) % 2}"
+        ps_cfg["availability_zone"] = az
+
+    neon_env_builder.pageserver_config_override = assign_az
+
     # 1MiB stripes: enable getting some meaningful data distribution without
     # writing large quantities of data in this test.  The stripe size is given
     # in number of 8KiB pages.
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index e93e251b4f..685af5caaf 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -2253,12 +2253,7 @@ def test_storage_controller_node_deletion(
             assert victim.id not in shard["node_secondary"]
 
     # Reconciles running during deletion should all complete
-    # FIXME: this currently doesn't work because the deletion schedules shards without a proper ScheduleContext, resulting
-    # in states that background_reconcile wants to optimize, but can't proceed with migrations yet because this is a short3
-    # test that hasn't uploaded any heatmaps for secondaries.
-    # In the interim, just do a reconcile_all to enable the consistency check.
-    # env.storage_controller.reconcile_until_idle()
-    env.storage_controller.reconcile_all()
+    env.storage_controller.reconcile_until_idle()
 
     # Controller should pass its own consistency checks
     env.storage_controller.consistency_check()
@@ -2267,7 +2262,6 @@ def test_storage_controller_node_deletion(
     env.storage_controller.stop()
     env.storage_controller.start()
     assert victim.id not in [n["id"] for n in env.storage_controller.node_list()]
-    env.storage_controller.reconcile_all()  # FIXME: workaround for optimizations happening on startup, see FIXME above.
     env.storage_controller.consistency_check()
 
 

From 15d01b257ac3bf4d21347d4341fb61a147ee8ecb Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 3 Dec 2024 11:55:13 +0100
Subject: [PATCH 034/117] storcon_cli tenant-describe: include tenant-wide
 information in output (#9899)

Before this PR, the storcon_cli didn't have a way to show the
tenant-wide information of the TenantDescribeResponse.

Sadly, the `Serialize` impl for the tenant config doesn't skip on
`None`, so, the output becomes a bit bloated.
Maybe we can use `skip_serializing_if(Option::is_none)` in the future.
=> https://github.com/neondatabase/neon/issues/9983
---
 control_plane/storcon_cli/src/main.rs          | 16 ++++++++++++++--
 test_runner/regress/test_storage_controller.py |  4 ++--
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs
index b7f38c6286..e879424532 100644
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -560,14 +560,26 @@ async fn main() -> anyhow::Result<()> {
                 .await?;
         }
         Command::TenantDescribe { tenant_id } => {
-            let describe_response = storcon_client
+            let TenantDescribeResponse {
+                tenant_id,
+                shards,
+                stripe_size,
+                policy,
+                config,
+            } = storcon_client
                 .dispatch::<(), TenantDescribeResponse>(
                     Method::GET,
                     format!("control/v1/tenant/{tenant_id}"),
                     None,
                 )
                 .await?;
-            let shards = describe_response.shards;
+            println!("Tenant {tenant_id}");
+            let mut table = comfy_table::Table::new();
+            table.add_row(["Policy", &format!("{:?}", policy)]);
+            table.add_row(["Stripe size", &format!("{:?}", stripe_size)]);
+            table.add_row(["Config", &serde_json::to_string_pretty(&config).unwrap()]);
+            println!("{table}");
+            println!("Shards:");
             let mut table = comfy_table::Table::new();
             table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]);
             for shard in shards {
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 685af5caaf..244893a616 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -1747,8 +1747,8 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder):
 
     # Describe a tenant
     tenant_lines = storcon_cli(["tenant-describe", "--tenant-id", str(env.initial_tenant)])
-    assert len(tenant_lines) == 3 + shard_count * 2
-    assert str(env.initial_tenant) in tenant_lines[3]
+    assert len(tenant_lines) >= 3 + shard_count * 2
+    assert str(env.initial_tenant) in tenant_lines[0]
 
     # Pause changes on a tenant
     storcon_cli(["tenant-policy", "--tenant-id", str(env.initial_tenant), "--scheduling", "stop"])

From cb10be710dd4c4dd513bb3a16a77ae2800cbc888 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 3 Dec 2024 12:03:23 +0100
Subject: [PATCH 035/117] page_service: batching observability & include
 throttled time in smgr metrics (#9870)

This PR

- fixes smgr metrics https://github.com/neondatabase/neon/issues/9925
- adds an additional startup log line logging the current batching
config
- adds a histogram of batch sizes global and per-tenant
- adds a metric exposing the current batching config

The issue described #9925 is that before this PR, request latency was
only observed *after* batching.
This means that smgr latency metrics (most importantly getpage latency)
don't account for
- `wait_lsn` time
- time spent waiting for batch to fill up / the executor stage to pick
up the batch.

The fix is to use a per-request batching timer, like we did before the
initial batching PR.
We funnel those timers through the entire request lifecycle.

I noticed that even before the initial batching changes, we weren't
accounting for the time spent writing & flushing the response to the
wire.
This PR drive-by fixes that deficiency by dropping the timers at the
very end of processing the batch, i.e., after the `pgb.flush()` call.

I was **unable to maintain the behavior that we deduct
time-spent-in-throttle from various latency metrics.
The reason is that we're using a *single* counter in `RequestContext` to
track micros spent in throttle.
But there are *N* metrics timers in the batch, one per request.
As a consequence, the practice of consuming the counter in the drop
handler of each timer no longer works because all but the first timer
will encounter error `close() called on closed state`.
A failed attempt to maintain the current behavior can be found in
https://github.com/neondatabase/neon/pull/9951.

So, this PR remvoes the deduction behavior from all metrics.
I started a discussion on Slack about it the implications this has for
our internal SLO calculation:
https://neondb.slack.com/archives/C033RQ5SPDH/p1732910861704029

# Refs

- fixes https://github.com/neondatabase/neon/issues/9925
- sub-issue https://github.com/neondatabase/neon/issues/9377
- epic: https://github.com/neondatabase/neon/issues/9376
---
 pageserver/src/bin/pageserver.rs              |   3 +-
 pageserver/src/context.rs                     |   5 -
 pageserver/src/context/optional_counter.rs    | 101 ------
 pageserver/src/metrics.rs                     | 249 +++++++-------
 pageserver/src/page_service.rs                | 309 +++++++++++-------
 pageserver/src/pgdatadir_mapping.rs           |  18 +-
 pageserver/src/tenant/throttle.rs             |  17 +-
 pageserver/src/tenant/timeline.rs             |   7 +-
 test_runner/fixtures/metrics.py               |   1 +
 .../pageserver/test_page_service_batching.py  |  28 +-
 .../test_pageserver_getpage_throttle.py       |  31 +-
 11 files changed, 373 insertions(+), 396 deletions(-)
 delete mode 100644 pageserver/src/context/optional_counter.rs

diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index a8c2c2e992..31f4370855 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -127,6 +127,7 @@ fn main() -> anyhow::Result<()> {
     info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
     info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode");
     info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol");
+    info!(?conf.page_service_pipelining, "starting with page service pipelining config");
 
     // The tenants directory contains all the pageserver local disk state.
     // Create if not exists and make sure all the contents are durable before proceeding.
@@ -302,7 +303,7 @@ fn start_pageserver(
         pageserver::metrics::tokio_epoll_uring::Collector::new(),
     ))
     .unwrap();
-    pageserver::preinitialize_metrics();
+    pageserver::preinitialize_metrics(conf);
 
     // If any failpoints were set from FAILPOINTS environment variable,
     // print them to the log for debugging purposes
diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs
index 7afcf52cf2..8f2177fe5b 100644
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -91,8 +91,6 @@
 
 use crate::task_mgr::TaskKind;
 
-pub(crate) mod optional_counter;
-
 // The main structure of this module, see module-level comment.
 #[derive(Debug)]
 pub struct RequestContext {
@@ -100,7 +98,6 @@ pub struct RequestContext {
     download_behavior: DownloadBehavior,
     access_stats_behavior: AccessStatsBehavior,
     page_content_kind: PageContentKind,
-    pub micros_spent_throttled: optional_counter::MicroSecondsCounterU32,
 }
 
 /// The kind of access to the page cache.
@@ -158,7 +155,6 @@ impl RequestContextBuilder {
                 download_behavior: DownloadBehavior::Download,
                 access_stats_behavior: AccessStatsBehavior::Update,
                 page_content_kind: PageContentKind::Unknown,
-                micros_spent_throttled: Default::default(),
             },
         }
     }
@@ -172,7 +168,6 @@ impl RequestContextBuilder {
                 download_behavior: original.download_behavior,
                 access_stats_behavior: original.access_stats_behavior,
                 page_content_kind: original.page_content_kind,
-                micros_spent_throttled: Default::default(),
             },
         }
     }
diff --git a/pageserver/src/context/optional_counter.rs b/pageserver/src/context/optional_counter.rs
deleted file mode 100644
index 100c649f18..0000000000
--- a/pageserver/src/context/optional_counter.rs
+++ /dev/null
@@ -1,101 +0,0 @@
-use std::{
-    sync::atomic::{AtomicU32, Ordering},
-    time::Duration,
-};
-
-#[derive(Debug)]
-pub struct CounterU32 {
-    inner: AtomicU32,
-}
-impl Default for CounterU32 {
-    fn default() -> Self {
-        Self {
-            inner: AtomicU32::new(u32::MAX),
-        }
-    }
-}
-impl CounterU32 {
-    pub fn open(&self) -> Result<(), &'static str> {
-        match self
-            .inner
-            .compare_exchange(u32::MAX, 0, Ordering::Relaxed, Ordering::Relaxed)
-        {
-            Ok(_) => Ok(()),
-            Err(_) => Err("open() called on clsoed state"),
-        }
-    }
-    pub fn close(&self) -> Result<u32, &'static str> {
-        match self.inner.swap(u32::MAX, Ordering::Relaxed) {
-            u32::MAX => Err("close() called on closed state"),
-            x => Ok(x),
-        }
-    }
-
-    pub fn add(&self, count: u32) -> Result<(), &'static str> {
-        if count == 0 {
-            return Ok(());
-        }
-        let mut had_err = None;
-        self.inner
-            .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |cur| match cur {
-                u32::MAX => {
-                    had_err = Some("add() called on closed state");
-                    None
-                }
-                x => {
-                    let (new, overflowed) = x.overflowing_add(count);
-                    if new == u32::MAX || overflowed {
-                        had_err = Some("add() overflowed the counter");
-                        None
-                    } else {
-                        Some(new)
-                    }
-                }
-            })
-            .map_err(|_| had_err.expect("we set it whenever the function returns None"))
-            .map(|_| ())
-    }
-}
-
-#[derive(Default, Debug)]
-pub struct MicroSecondsCounterU32 {
-    inner: CounterU32,
-}
-
-impl MicroSecondsCounterU32 {
-    pub fn open(&self) -> Result<(), &'static str> {
-        self.inner.open()
-    }
-    pub fn add(&self, duration: Duration) -> Result<(), &'static str> {
-        match duration.as_micros().try_into() {
-            Ok(x) => self.inner.add(x),
-            Err(_) => Err("add(): duration conversion error"),
-        }
-    }
-    pub fn close_and_checked_sub_from(&self, from: Duration) -> Result<Duration, &'static str> {
-        let val = self.inner.close()?;
-        let val = Duration::from_micros(val as u64);
-        let subbed = match from.checked_sub(val) {
-            Some(v) => v,
-            None => return Err("Duration::checked_sub"),
-        };
-        Ok(subbed)
-    }
-}
-
-#[cfg(test)]
-mod tests {
-
-    use super::*;
-
-    #[test]
-    fn test_basic() {
-        let counter = MicroSecondsCounterU32::default();
-        counter.open().unwrap();
-        counter.add(Duration::from_micros(23)).unwrap();
-        let res = counter
-            .close_and_checked_sub_from(Duration::from_micros(42))
-            .unwrap();
-        assert_eq!(res, Duration::from_micros(42 - 23));
-    }
-}
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 86be97587f..d04fae7627 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -7,6 +7,10 @@ use metrics::{
     IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
 };
 use once_cell::sync::Lazy;
+use pageserver_api::config::{
+    PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
+    PageServiceProtocolPipelinedExecutionStrategy,
+};
 use pageserver_api::shard::TenantShardId;
 use postgres_backend::{is_expected_io_error, QueryError};
 use pq_proto::framed::ConnectionError;
@@ -1216,50 +1220,21 @@ pub(crate) mod virtual_file_io_engine {
     });
 }
 
-struct GlobalAndPerTimelineHistogramTimer<'a, 'c> {
-    global_latency_histo: &'a Histogram,
+pub(crate) struct SmgrOpTimer {
+    global_latency_histo: Histogram,
 
     // Optional because not all op types are tracked per-timeline
-    per_timeline_latency_histo: Option<&'a Histogram>,
+    per_timeline_latency_histo: Option<Histogram>,
 
-    ctx: &'c RequestContext,
-    start: std::time::Instant,
-    op: SmgrQueryType,
-    count: usize,
+    start: Instant,
 }
 
-impl Drop for GlobalAndPerTimelineHistogramTimer<'_, '_> {
+impl Drop for SmgrOpTimer {
     fn drop(&mut self) {
-        let elapsed = self.start.elapsed();
-        let ex_throttled = self
-            .ctx
-            .micros_spent_throttled
-            .close_and_checked_sub_from(elapsed);
-        let ex_throttled = match ex_throttled {
-            Ok(res) => res,
-            Err(error) => {
-                use utils::rate_limit::RateLimit;
-                static LOGGED: Lazy<Mutex<enum_map::EnumMap<SmgrQueryType, RateLimit>>> =
-                    Lazy::new(|| {
-                        Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| {
-                            RateLimit::new(Duration::from_secs(10))
-                        })))
-                    });
-                let mut guard = LOGGED.lock().unwrap();
-                let rate_limit = &mut guard[self.op];
-                rate_limit.call(|| {
-                    warn!(op=?self.op, error, "error deducting time spent throttled; this message is logged at a global rate limit");
-                });
-                elapsed
-            }
-        };
-
-        for _ in 0..self.count {
-            self.global_latency_histo
-                .observe(ex_throttled.as_secs_f64());
-            if let Some(per_timeline_getpage_histo) = self.per_timeline_latency_histo {
-                per_timeline_getpage_histo.observe(ex_throttled.as_secs_f64());
-            }
+        let elapsed = self.start.elapsed().as_secs_f64();
+        self.global_latency_histo.observe(elapsed);
+        if let Some(per_timeline_getpage_histo) = &self.per_timeline_latency_histo {
+            per_timeline_getpage_histo.observe(elapsed);
         }
     }
 }
@@ -1289,6 +1264,8 @@ pub(crate) struct SmgrQueryTimePerTimeline {
     global_latency: [Histogram; SmgrQueryType::COUNT],
     per_timeline_getpage_started: IntCounter,
     per_timeline_getpage_latency: Histogram,
+    global_batch_size: Histogram,
+    per_timeline_batch_size: Histogram,
 }
 
 static SMGR_QUERY_STARTED_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
@@ -1381,6 +1358,76 @@ static SMGR_QUERY_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
+static PAGE_SERVICE_BATCH_SIZE_BUCKETS_GLOBAL: Lazy<Vec<f64>> = Lazy::new(|| {
+    (1..=u32::try_from(Timeline::MAX_GET_VECTORED_KEYS).unwrap())
+        .map(|v| v.into())
+        .collect()
+});
+
+static PAGE_SERVICE_BATCH_SIZE_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "pageserver_page_service_batch_size_global",
+        "Batch size of pageserver page service requests",
+        PAGE_SERVICE_BATCH_SIZE_BUCKETS_GLOBAL.clone(),
+    )
+    .expect("failed to define a metric")
+});
+
+static PAGE_SERVICE_BATCH_SIZE_BUCKETS_PER_TIMELINE: Lazy<Vec<f64>> = Lazy::new(|| {
+    let mut buckets = Vec::new();
+    for i in 0.. {
+        let bucket = 1 << i;
+        if bucket > u32::try_from(Timeline::MAX_GET_VECTORED_KEYS).unwrap() {
+            break;
+        }
+        buckets.push(bucket.into());
+    }
+    buckets
+});
+
+static PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "pageserver_page_service_batch_size",
+        "Batch size of pageserver page service requests",
+        &["tenant_id", "shard_id", "timeline_id"],
+        PAGE_SERVICE_BATCH_SIZE_BUCKETS_PER_TIMELINE.clone()
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
+    register_int_gauge_vec!(
+        "pageserver_page_service_config_max_batch_size",
+        "Configured maximum batch size for the server-side batching functionality of page_service. \
+         Labels expose more of the configuration parameters.",
+        &["mode", "execution"]
+    )
+    .expect("failed to define a metric")
+});
+
+fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) {
+    PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE.reset();
+    let (label_values, value) = match conf {
+        PageServicePipeliningConfig::Serial => (["serial", "-"], 1),
+        PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined {
+            max_batch_size,
+            execution,
+        }) => {
+            let mode = "pipelined";
+            let execution = match execution {
+                PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures => {
+                    "concurrent-futures"
+                }
+                PageServiceProtocolPipelinedExecutionStrategy::Tasks => "tasks",
+            };
+            ([mode, execution], max_batch_size.get())
+        }
+    };
+    PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE
+        .with_label_values(&label_values)
+        .set(value.try_into().unwrap());
+}
+
 impl SmgrQueryTimePerTimeline {
     pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
         let tenant_id = tenant_shard_id.tenant_id.to_string();
@@ -1416,78 +1463,51 @@ impl SmgrQueryTimePerTimeline {
             ])
             .unwrap();
 
+        let global_batch_size = PAGE_SERVICE_BATCH_SIZE_GLOBAL.clone();
+        let per_timeline_batch_size = PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE
+            .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
+            .unwrap();
+
         Self {
             global_started,
             global_latency,
             per_timeline_getpage_latency,
             per_timeline_getpage_started,
+            global_batch_size,
+            per_timeline_batch_size,
         }
     }
-    pub(crate) fn start_timer<'c: 'a, 'a>(
-        &'a self,
-        op: SmgrQueryType,
-        ctx: &'c RequestContext,
-    ) -> Option<impl Drop + 'a> {
-        self.start_timer_many(op, 1, ctx)
-    }
-    pub(crate) fn start_timer_many<'c: 'a, 'a>(
-        &'a self,
-        op: SmgrQueryType,
-        count: usize,
-        ctx: &'c RequestContext,
-    ) -> Option<impl Drop + 'a> {
-        let start = Instant::now();
-
+    pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, started_at: Instant) -> SmgrOpTimer {
         self.global_started[op as usize].inc();
 
-        // We subtract time spent throttled from the observed latency.
-        match ctx.micros_spent_throttled.open() {
-            Ok(()) => (),
-            Err(error) => {
-                use utils::rate_limit::RateLimit;
-                static LOGGED: Lazy<Mutex<enum_map::EnumMap<SmgrQueryType, RateLimit>>> =
-                    Lazy::new(|| {
-                        Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| {
-                            RateLimit::new(Duration::from_secs(10))
-                        })))
-                    });
-                let mut guard = LOGGED.lock().unwrap();
-                let rate_limit = &mut guard[op];
-                rate_limit.call(|| {
-                    warn!(?op, error, "error opening micros_spent_throttled; this message is logged at a global rate limit");
-                });
-            }
-        }
-
         let per_timeline_latency_histo = if matches!(op, SmgrQueryType::GetPageAtLsn) {
             self.per_timeline_getpage_started.inc();
-            Some(&self.per_timeline_getpage_latency)
+            Some(self.per_timeline_getpage_latency.clone())
         } else {
             None
         };
 
-        Some(GlobalAndPerTimelineHistogramTimer {
-            global_latency_histo: &self.global_latency[op as usize],
+        SmgrOpTimer {
+            global_latency_histo: self.global_latency[op as usize].clone(),
             per_timeline_latency_histo,
-            ctx,
-            start,
-            op,
-            count,
-        })
+            start: started_at,
+        }
+    }
+
+    pub(crate) fn observe_getpage_batch_start(&self, batch_size: usize) {
+        self.global_batch_size.observe(batch_size as f64);
+        self.per_timeline_batch_size.observe(batch_size as f64);
     }
 }
 
 #[cfg(test)]
 mod smgr_query_time_tests {
+    use std::time::Instant;
+
     use pageserver_api::shard::TenantShardId;
     use strum::IntoEnumIterator;
     use utils::id::{TenantId, TimelineId};
 
-    use crate::{
-        context::{DownloadBehavior, RequestContext},
-        task_mgr::TaskKind,
-    };
-
     // Regression test, we used hard-coded string constants before using an enum.
     #[test]
     fn op_label_name() {
@@ -1531,8 +1551,7 @@ mod smgr_query_time_tests {
             let (pre_global, pre_per_tenant_timeline) = get_counts();
             assert_eq!(pre_per_tenant_timeline, 0);
 
-            let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download);
-            let timer = metrics.start_timer(*op, &ctx);
+            let timer = metrics.start_smgr_op(*op, Instant::now());
             drop(timer);
 
             let (post_global, post_per_tenant_timeline) = get_counts();
@@ -1579,58 +1598,24 @@ pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|
     }
 });
 
-pub(crate) struct BasebackupQueryTimeOngoingRecording<'a, 'c> {
+pub(crate) struct BasebackupQueryTimeOngoingRecording<'a> {
     parent: &'a BasebackupQueryTime,
-    ctx: &'c RequestContext,
     start: std::time::Instant,
 }
 
 impl BasebackupQueryTime {
-    pub(crate) fn start_recording<'c: 'a, 'a>(
-        &'a self,
-        ctx: &'c RequestContext,
-    ) -> BasebackupQueryTimeOngoingRecording<'a, 'a> {
+    pub(crate) fn start_recording(&self) -> BasebackupQueryTimeOngoingRecording<'_> {
         let start = Instant::now();
-        match ctx.micros_spent_throttled.open() {
-            Ok(()) => (),
-            Err(error) => {
-                use utils::rate_limit::RateLimit;
-                static LOGGED: Lazy<Mutex<RateLimit>> =
-                    Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
-                let mut rate_limit = LOGGED.lock().unwrap();
-                rate_limit.call(|| {
-                    warn!(error, "error opening micros_spent_throttled; this message is logged at a global rate limit");
-                });
-            }
-        }
         BasebackupQueryTimeOngoingRecording {
             parent: self,
-            ctx,
             start,
         }
     }
 }
 
-impl BasebackupQueryTimeOngoingRecording<'_, '_> {
+impl BasebackupQueryTimeOngoingRecording<'_> {
     pub(crate) fn observe<T>(self, res: &Result<T, QueryError>) {
-        let elapsed = self.start.elapsed();
-        let ex_throttled = self
-            .ctx
-            .micros_spent_throttled
-            .close_and_checked_sub_from(elapsed);
-        let ex_throttled = match ex_throttled {
-            Ok(ex_throttled) => ex_throttled,
-            Err(error) => {
-                use utils::rate_limit::RateLimit;
-                static LOGGED: Lazy<Mutex<RateLimit>> =
-                    Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
-                let mut rate_limit = LOGGED.lock().unwrap();
-                rate_limit.call(|| {
-                    warn!(error, "error deducting time spent throttled; this message is logged at a global rate limit");
-                });
-                elapsed
-            }
-        };
+        let elapsed = self.start.elapsed().as_secs_f64();
         // If you want to change categorize of a specific error, also change it in `log_query_error`.
         let metric = match res {
             Ok(_) => &self.parent.ok,
@@ -1641,7 +1626,7 @@ impl BasebackupQueryTimeOngoingRecording<'_, '_> {
             }
             Err(_) => &self.parent.error,
         };
-        metric.observe(ex_throttled.as_secs_f64());
+        metric.observe(elapsed);
     }
 }
 
@@ -2722,6 +2707,11 @@ impl TimelineMetrics {
             shard_id,
             timeline_id,
         ]);
+        let _ = PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE.remove_label_values(&[
+            tenant_id,
+            shard_id,
+            timeline_id,
+        ]);
     }
 }
 
@@ -2747,10 +2737,12 @@ use std::sync::{Arc, Mutex};
 use std::task::{Context, Poll};
 use std::time::{Duration, Instant};
 
+use crate::config::PageServerConf;
 use crate::context::{PageContentKind, RequestContext};
 use crate::task_mgr::TaskKind;
 use crate::tenant::mgr::TenantSlot;
 use crate::tenant::tasks::BackgroundLoopKind;
+use crate::tenant::Timeline;
 
 /// Maintain a per timeline gauge in addition to the global gauge.
 pub(crate) struct PerTimelineRemotePhysicalSizeGauge {
@@ -3562,7 +3554,9 @@ pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) {
         .set(u64::try_from(num_threads.get()).unwrap());
 }
 
-pub fn preinitialize_metrics() {
+pub fn preinitialize_metrics(conf: &'static PageServerConf) {
+    set_page_service_config_max_batch_size(&conf.page_service_pipelining);
+
     // Python tests need these and on some we do alerting.
     //
     // FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of
@@ -3630,6 +3624,7 @@ pub fn preinitialize_metrics() {
         &WAL_REDO_RECORDS_HISTOGRAM,
         &WAL_REDO_BYTES_HISTOGRAM,
         &WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM,
+        &PAGE_SERVICE_BATCH_SIZE_GLOBAL,
     ]
     .into_iter()
     .for_each(|h| {
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 1917e7f5b7..64842aa5b8 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -51,7 +51,7 @@ use crate::auth::check_permission;
 use crate::basebackup::BasebackupError;
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
-use crate::metrics::{self};
+use crate::metrics::{self, SmgrOpTimer};
 use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS};
 use crate::pgdatadir_mapping::Version;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
@@ -540,11 +540,13 @@ impl From<WaitLsnError> for QueryError {
 enum BatchedFeMessage {
     Exists {
         span: Span,
+        timer: SmgrOpTimer,
         shard: timeline::handle::Handle<TenantManagerTypes>,
         req: models::PagestreamExistsRequest,
     },
     Nblocks {
         span: Span,
+        timer: SmgrOpTimer,
         shard: timeline::handle::Handle<TenantManagerTypes>,
         req: models::PagestreamNblocksRequest,
     },
@@ -552,15 +554,17 @@ enum BatchedFeMessage {
         span: Span,
         shard: timeline::handle::Handle<TenantManagerTypes>,
         effective_request_lsn: Lsn,
-        pages: smallvec::SmallVec<[(RelTag, BlockNumber); 1]>,
+        pages: smallvec::SmallVec<[(RelTag, BlockNumber, SmgrOpTimer); 1]>,
     },
     DbSize {
         span: Span,
+        timer: SmgrOpTimer,
         shard: timeline::handle::Handle<TenantManagerTypes>,
         req: models::PagestreamDbSizeRequest,
     },
     GetSlruSegment {
         span: Span,
+        timer: SmgrOpTimer,
         shard: timeline::handle::Handle<TenantManagerTypes>,
         req: models::PagestreamGetSlruSegmentRequest,
     },
@@ -632,6 +636,8 @@ impl PageServerHandler {
             msg = pgb.read_message() => { msg }
         };
 
+        let received_at = Instant::now();
+
         let copy_data_bytes = match msg? {
             Some(FeMessage::CopyData(bytes)) => bytes,
             Some(FeMessage::Terminate) => {
@@ -660,7 +666,15 @@ impl PageServerHandler {
                     .get(tenant_id, timeline_id, ShardSelector::Zero)
                     .instrument(span.clone()) // sets `shard_id` field
                     .await?;
-                BatchedFeMessage::Exists { span, shard, req }
+                let timer = shard
+                    .query_metrics
+                    .start_smgr_op(metrics::SmgrQueryType::GetRelExists, received_at);
+                BatchedFeMessage::Exists {
+                    span,
+                    timer,
+                    shard,
+                    req,
+                }
             }
             PagestreamFeMessage::Nblocks(req) => {
                 let span = tracing::info_span!(parent: parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn);
@@ -668,7 +682,15 @@ impl PageServerHandler {
                     .get(tenant_id, timeline_id, ShardSelector::Zero)
                     .instrument(span.clone()) // sets `shard_id` field
                     .await?;
-                BatchedFeMessage::Nblocks { span, shard, req }
+                let timer = shard
+                    .query_metrics
+                    .start_smgr_op(metrics::SmgrQueryType::GetRelSize, received_at);
+                BatchedFeMessage::Nblocks {
+                    span,
+                    timer,
+                    shard,
+                    req,
+                }
             }
             PagestreamFeMessage::DbSize(req) => {
                 let span = tracing::info_span!(parent: parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn);
@@ -676,7 +698,15 @@ impl PageServerHandler {
                     .get(tenant_id, timeline_id, ShardSelector::Zero)
                     .instrument(span.clone()) // sets `shard_id` field
                     .await?;
-                BatchedFeMessage::DbSize { span, shard, req }
+                let timer = shard
+                    .query_metrics
+                    .start_smgr_op(metrics::SmgrQueryType::GetDbSize, received_at);
+                BatchedFeMessage::DbSize {
+                    span,
+                    timer,
+                    shard,
+                    req,
+                }
             }
             PagestreamFeMessage::GetSlruSegment(req) => {
                 let span = tracing::info_span!(parent: parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn);
@@ -684,7 +714,15 @@ impl PageServerHandler {
                     .get(tenant_id, timeline_id, ShardSelector::Zero)
                     .instrument(span.clone()) // sets `shard_id` field
                     .await?;
-                BatchedFeMessage::GetSlruSegment { span, shard, req }
+                let timer = shard
+                    .query_metrics
+                    .start_smgr_op(metrics::SmgrQueryType::GetSlruSegment, received_at);
+                BatchedFeMessage::GetSlruSegment {
+                    span,
+                    timer,
+                    shard,
+                    req,
+                }
             }
             PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
                 request_lsn,
@@ -728,6 +766,14 @@ impl PageServerHandler {
                         return respond_error!(e.into());
                     }
                 };
+
+                // It's important to start the timer before waiting for the LSN
+                // so that the _started counters are incremented before we do
+                // any serious waiting, e.g., for LSNs.
+                let timer = shard
+                    .query_metrics
+                    .start_smgr_op(metrics::SmgrQueryType::GetPageAtLsn, received_at);
+
                 let effective_request_lsn = match Self::wait_or_get_last_lsn(
                     &shard,
                     request_lsn,
@@ -747,7 +793,7 @@ impl PageServerHandler {
                     span,
                     shard,
                     effective_request_lsn,
-                    pages: smallvec::smallvec![(rel, blkno)],
+                    pages: smallvec::smallvec![(rel, blkno, timer)],
                 }
             }
         };
@@ -832,88 +878,112 @@ impl PageServerHandler {
         IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
     {
         // invoke handler function
-        let (handler_results, span): (Vec<Result<PagestreamBeMessage, PageStreamError>>, _) =
-            match batch {
-                BatchedFeMessage::Exists { span, shard, req } => {
-                    fail::fail_point!("ps::handle-pagerequest-message::exists");
-                    (
-                        vec![
-                            self.handle_get_rel_exists_request(&shard, &req, ctx)
-                                .instrument(span.clone())
-                                .await,
-                        ],
-                        span,
-                    )
-                }
-                BatchedFeMessage::Nblocks { span, shard, req } => {
-                    fail::fail_point!("ps::handle-pagerequest-message::nblocks");
-                    (
-                        vec![
-                            self.handle_get_nblocks_request(&shard, &req, ctx)
-                                .instrument(span.clone())
-                                .await,
-                        ],
-                        span,
-                    )
-                }
-                BatchedFeMessage::GetPage {
+        let (handler_results, span): (
+            Vec<Result<(PagestreamBeMessage, SmgrOpTimer), PageStreamError>>,
+            _,
+        ) = match batch {
+            BatchedFeMessage::Exists {
+                span,
+                timer,
+                shard,
+                req,
+            } => {
+                fail::fail_point!("ps::handle-pagerequest-message::exists");
+                (
+                    vec![self
+                        .handle_get_rel_exists_request(&shard, &req, ctx)
+                        .instrument(span.clone())
+                        .await
+                        .map(|msg| (msg, timer))],
                     span,
-                    shard,
-                    effective_request_lsn,
-                    pages,
-                } => {
-                    fail::fail_point!("ps::handle-pagerequest-message::getpage");
-                    (
-                        {
-                            let npages = pages.len();
-                            trace!(npages, "handling getpage request");
-                            let res = self
-                                .handle_get_page_at_lsn_request_batched(
-                                    &shard,
-                                    effective_request_lsn,
-                                    pages,
-                                    ctx,
-                                )
-                                .instrument(span.clone())
-                                .await;
-                            assert_eq!(res.len(), npages);
-                            res
-                        },
-                        span,
-                    )
-                }
-                BatchedFeMessage::DbSize { span, shard, req } => {
-                    fail::fail_point!("ps::handle-pagerequest-message::dbsize");
-                    (
-                        vec![
-                            self.handle_db_size_request(&shard, &req, ctx)
-                                .instrument(span.clone())
-                                .await,
-                        ],
-                        span,
-                    )
-                }
-                BatchedFeMessage::GetSlruSegment { span, shard, req } => {
-                    fail::fail_point!("ps::handle-pagerequest-message::slrusegment");
-                    (
-                        vec![
-                            self.handle_get_slru_segment_request(&shard, &req, ctx)
-                                .instrument(span.clone())
-                                .await,
-                        ],
-                        span,
-                    )
-                }
-                BatchedFeMessage::RespondError { span, error } => {
-                    // We've already decided to respond with an error, so we don't need to
-                    // call the handler.
-                    (vec![Err(error)], span)
-                }
-            };
+                )
+            }
+            BatchedFeMessage::Nblocks {
+                span,
+                timer,
+                shard,
+                req,
+            } => {
+                fail::fail_point!("ps::handle-pagerequest-message::nblocks");
+                (
+                    vec![self
+                        .handle_get_nblocks_request(&shard, &req, ctx)
+                        .instrument(span.clone())
+                        .await
+                        .map(|msg| (msg, timer))],
+                    span,
+                )
+            }
+            BatchedFeMessage::GetPage {
+                span,
+                shard,
+                effective_request_lsn,
+                pages,
+            } => {
+                fail::fail_point!("ps::handle-pagerequest-message::getpage");
+                (
+                    {
+                        let npages = pages.len();
+                        trace!(npages, "handling getpage request");
+                        let res = self
+                            .handle_get_page_at_lsn_request_batched(
+                                &shard,
+                                effective_request_lsn,
+                                pages,
+                                ctx,
+                            )
+                            .instrument(span.clone())
+                            .await;
+                        assert_eq!(res.len(), npages);
+                        res
+                    },
+                    span,
+                )
+            }
+            BatchedFeMessage::DbSize {
+                span,
+                timer,
+                shard,
+                req,
+            } => {
+                fail::fail_point!("ps::handle-pagerequest-message::dbsize");
+                (
+                    vec![self
+                        .handle_db_size_request(&shard, &req, ctx)
+                        .instrument(span.clone())
+                        .await
+                        .map(|msg| (msg, timer))],
+                    span,
+                )
+            }
+            BatchedFeMessage::GetSlruSegment {
+                span,
+                timer,
+                shard,
+                req,
+            } => {
+                fail::fail_point!("ps::handle-pagerequest-message::slrusegment");
+                (
+                    vec![self
+                        .handle_get_slru_segment_request(&shard, &req, ctx)
+                        .instrument(span.clone())
+                        .await
+                        .map(|msg| (msg, timer))],
+                    span,
+                )
+            }
+            BatchedFeMessage::RespondError { span, error } => {
+                // We've already decided to respond with an error, so we don't need to
+                // call the handler.
+                (vec![Err(error)], span)
+            }
+        };
 
         // Map handler result to protocol behavior.
         // Some handler errors cause exit from pagestream protocol.
         // Other handler errors are sent back as an error message and we stay in pagestream protocol.
+        let mut timers: smallvec::SmallVec<[_; 1]> =
+            smallvec::SmallVec::with_capacity(handler_results.len());
         for handler_result in handler_results {
             let response_msg = match handler_result {
                 Err(e) => match &e {
@@ -944,7 +1014,12 @@ impl PageServerHandler {
                         })
                     }
                 },
-                Ok(response_msg) => response_msg,
+                Ok((response_msg, timer)) => {
+                    // Extending the lifetime of the timers so observations on drop
+                    // include the flush time.
+                    timers.push(timer);
+                    response_msg
+                }
             };
 
             // marshal & transmit response message
@@ -961,6 +1036,7 @@ impl PageServerHandler {
                 res?;
             }
         }
+        drop(timers);
         Ok(())
     }
 
@@ -1423,10 +1499,6 @@ impl PageServerHandler {
         req: &PagestreamExistsRequest,
         ctx: &RequestContext,
     ) -> Result<PagestreamBeMessage, PageStreamError> {
-        let _timer = timeline
-            .query_metrics
-            .start_timer(metrics::SmgrQueryType::GetRelExists, ctx);
-
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
         let lsn = Self::wait_or_get_last_lsn(
             timeline,
@@ -1453,10 +1525,6 @@ impl PageServerHandler {
         req: &PagestreamNblocksRequest,
         ctx: &RequestContext,
     ) -> Result<PagestreamBeMessage, PageStreamError> {
-        let _timer = timeline
-            .query_metrics
-            .start_timer(metrics::SmgrQueryType::GetRelSize, ctx);
-
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
         let lsn = Self::wait_or_get_last_lsn(
             timeline,
@@ -1483,10 +1551,6 @@ impl PageServerHandler {
         req: &PagestreamDbSizeRequest,
         ctx: &RequestContext,
     ) -> Result<PagestreamBeMessage, PageStreamError> {
-        let _timer = timeline
-            .query_metrics
-            .start_timer(metrics::SmgrQueryType::GetDbSize, ctx);
-
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
         let lsn = Self::wait_or_get_last_lsn(
             timeline,
@@ -1512,26 +1576,41 @@ impl PageServerHandler {
         &mut self,
         timeline: &Timeline,
         effective_lsn: Lsn,
-        pages: smallvec::SmallVec<[(RelTag, BlockNumber); 1]>,
+        requests: smallvec::SmallVec<[(RelTag, BlockNumber, SmgrOpTimer); 1]>,
         ctx: &RequestContext,
-    ) -> Vec<Result<PagestreamBeMessage, PageStreamError>> {
+    ) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), PageStreamError>> {
         debug_assert_current_span_has_tenant_and_timeline_id();
-        let _timer = timeline.query_metrics.start_timer_many(
-            metrics::SmgrQueryType::GetPageAtLsn,
-            pages.len(),
-            ctx,
-        );
 
-        let pages = timeline
-            .get_rel_page_at_lsn_batched(pages, effective_lsn, ctx)
+        timeline
+            .query_metrics
+            .observe_getpage_batch_start(requests.len());
+
+        let results = timeline
+            .get_rel_page_at_lsn_batched(
+                requests.iter().map(|(reltag, blkno, _)| (reltag, blkno)),
+                effective_lsn,
+                ctx,
+            )
             .await;
+        assert_eq!(results.len(), requests.len());
 
-        Vec::from_iter(pages.into_iter().map(|page| {
-            page.map(|page| {
-                PagestreamBeMessage::GetPage(models::PagestreamGetPageResponse { page })
-            })
-            .map_err(PageStreamError::from)
-        }))
+        // TODO: avoid creating the new Vec here
+        Vec::from_iter(
+            requests
+                .into_iter()
+                .zip(results.into_iter())
+                .map(|((_, _, timer), res)| {
+                    res.map(|page| {
+                        (
+                            PagestreamBeMessage::GetPage(models::PagestreamGetPageResponse {
+                                page,
+                            }),
+                            timer,
+                        )
+                    })
+                    .map_err(PageStreamError::from)
+                }),
+        )
     }
 
     #[instrument(skip_all, fields(shard_id))]
@@ -1541,10 +1620,6 @@ impl PageServerHandler {
         req: &PagestreamGetSlruSegmentRequest,
         ctx: &RequestContext,
     ) -> Result<PagestreamBeMessage, PageStreamError> {
-        let _timer = timeline
-            .query_metrics
-            .start_timer(metrics::SmgrQueryType::GetSlruSegment, ctx);
-
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
         let lsn = Self::wait_or_get_last_lsn(
             timeline,
@@ -2045,7 +2120,7 @@ where
                 COMPUTE_COMMANDS_COUNTERS
                     .for_command(ComputeCommandKind::Basebackup)
                     .inc();
-                let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx);
+                let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording();
                 let res = async {
                     self.handle_basebackup_request(
                         pgb,
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index d48a1ba117..a00ec761e2 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -203,9 +203,13 @@ impl Timeline {
     ) -> Result<Bytes, PageReconstructError> {
         match version {
             Version::Lsn(effective_lsn) => {
-                let pages = smallvec::smallvec![(tag, blknum)];
+                let pages: smallvec::SmallVec<[_; 1]> = smallvec::smallvec![(tag, blknum)];
                 let res = self
-                    .get_rel_page_at_lsn_batched(pages, effective_lsn, ctx)
+                    .get_rel_page_at_lsn_batched(
+                        pages.iter().map(|(tag, blknum)| (tag, blknum)),
+                        effective_lsn,
+                        ctx,
+                    )
                     .await;
                 assert_eq!(res.len(), 1);
                 res.into_iter().next().unwrap()
@@ -240,7 +244,7 @@ impl Timeline {
     /// The ordering of the returned vec corresponds to the ordering of `pages`.
     pub(crate) async fn get_rel_page_at_lsn_batched(
         &self,
-        pages: smallvec::SmallVec<[(RelTag, BlockNumber); 1]>,
+        pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber)>,
         effective_lsn: Lsn,
         ctx: &RequestContext,
     ) -> Vec<Result<Bytes, PageReconstructError>> {
@@ -254,7 +258,7 @@ impl Timeline {
         let result_slots = result.spare_capacity_mut();
 
         let mut keys_slots: BTreeMap<Key, smallvec::SmallVec<[usize; 1]>> = BTreeMap::default();
-        for (response_slot_idx, (tag, blknum)) in pages.into_iter().enumerate() {
+        for (response_slot_idx, (tag, blknum)) in pages.enumerate() {
             if tag.relnode == 0 {
                 result_slots[response_slot_idx].write(Err(PageReconstructError::Other(
                     RelationError::InvalidRelnode.into(),
@@ -265,7 +269,7 @@ impl Timeline {
             }
 
             let nblocks = match self
-                .get_rel_size(tag, Version::Lsn(effective_lsn), ctx)
+                .get_rel_size(*tag, Version::Lsn(effective_lsn), ctx)
                 .await
             {
                 Ok(nblocks) => nblocks,
@@ -276,7 +280,7 @@ impl Timeline {
                 }
             };
 
-            if blknum >= nblocks {
+            if *blknum >= nblocks {
                 debug!(
                     "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
                     tag, blknum, effective_lsn, nblocks
@@ -286,7 +290,7 @@ impl Timeline {
                 continue;
             }
 
-            let key = rel_block_to_key(tag, blknum);
+            let key = rel_block_to_key(*tag, *blknum);
 
             let key_slots = keys_slots.entry(key).or_default();
             key_slots.push(response_slot_idx);
diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs
index 6a80953901..7c4de55a47 100644
--- a/pageserver/src/tenant/throttle.rs
+++ b/pageserver/src/tenant/throttle.rs
@@ -2,14 +2,14 @@ use std::{
     str::FromStr,
     sync::{
         atomic::{AtomicU64, Ordering},
-        Arc, Mutex,
+        Arc,
     },
     time::{Duration, Instant},
 };
 
 use arc_swap::ArcSwap;
 use enumset::EnumSet;
-use tracing::{error, warn};
+use tracing::error;
 use utils::leaky_bucket::{LeakyBucketConfig, RateLimiter};
 
 use crate::{context::RequestContext, task_mgr::TaskKind};
@@ -162,19 +162,6 @@ where
                 .fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed);
             let observation = Observation { wait_time };
             self.metric.observe_throttling(&observation);
-            match ctx.micros_spent_throttled.add(wait_time) {
-                Ok(res) => res,
-                Err(error) => {
-                    use once_cell::sync::Lazy;
-                    use utils::rate_limit::RateLimit;
-                    static WARN_RATE_LIMIT: Lazy<Mutex<RateLimit>> =
-                        Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
-                    let mut guard = WARN_RATE_LIMIT.lock().unwrap();
-                    guard.call(move || {
-                        warn!(error, "error adding time spent throttled; this message is logged at a global rate limit");
-                    });
-                }
-            }
             Some(wait_time)
         } else {
             None
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 730477a7f4..dc3f823f20 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1059,7 +1059,8 @@ impl Timeline {
             .map(|metric| (metric, Instant::now()));
 
         // start counting after throttle so that throttle time
-        // is always less than observation time
+        // is always less than observation time and we don't
+        // underflow when computing `ex_throttled` below.
         let throttled = self
             .timeline_get_throttle
             .throttle(ctx, key_count as usize)
@@ -1138,7 +1139,9 @@ impl Timeline {
             .map(ScanLatencyOngoingRecording::start_recording);
 
         // start counting after throttle so that throttle time
-        // is always less than observation time
+        // is always less than observation time and we don't
+        // underflow when computing the `ex_throttled` value in
+        // `recording.observe(throttled)` below.
         let throttled = self
             .timeline_get_throttle
             // assume scan = 1 quota for now until we find a better way to process this
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index 3f90c233a6..ffdbd988a5 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -173,6 +173,7 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
     counter("pageserver_tenant_throttling_count_accounted_finish"),
     counter("pageserver_tenant_throttling_wait_usecs_sum"),
     counter("pageserver_tenant_throttling_count"),
+    *histogram("pageserver_page_service_batch_size"),
     *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
     # "pageserver_directory_entries_count", -- only used if above a certain threshold
     # "pageserver_broken_tenants_count" -- used only for broken
diff --git a/test_runner/performance/pageserver/test_page_service_batching.py b/test_runner/performance/pageserver/test_page_service_batching.py
index c47a849fec..562094a059 100644
--- a/test_runner/performance/pageserver/test_page_service_batching.py
+++ b/test_runner/performance/pageserver/test_page_service_batching.py
@@ -167,18 +167,18 @@ def test_throughput(
     @dataclass
     class Metrics:
         time: float
-        pageserver_getpage_count: float
-        pageserver_vectored_get_count: float
+        pageserver_batch_size_histo_sum: float
+        pageserver_batch_size_histo_count: float
         compute_getpage_count: float
         pageserver_cpu_seconds_total: float
 
         def __sub__(self, other: "Metrics") -> "Metrics":
             return Metrics(
                 time=self.time - other.time,
-                pageserver_getpage_count=self.pageserver_getpage_count
-                - other.pageserver_getpage_count,
-                pageserver_vectored_get_count=self.pageserver_vectored_get_count
-                - other.pageserver_vectored_get_count,
+                pageserver_batch_size_histo_sum=self.pageserver_batch_size_histo_sum
+                - other.pageserver_batch_size_histo_sum,
+                pageserver_batch_size_histo_count=self.pageserver_batch_size_histo_count
+                - other.pageserver_batch_size_histo_count,
                 compute_getpage_count=self.compute_getpage_count - other.compute_getpage_count,
                 pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total
                 - other.pageserver_cpu_seconds_total,
@@ -187,8 +187,8 @@ def test_throughput(
         def normalize(self, by) -> "Metrics":
             return Metrics(
                 time=self.time / by,
-                pageserver_getpage_count=self.pageserver_getpage_count / by,
-                pageserver_vectored_get_count=self.pageserver_vectored_get_count / by,
+                pageserver_batch_size_histo_sum=self.pageserver_batch_size_histo_sum / by,
+                pageserver_batch_size_histo_count=self.pageserver_batch_size_histo_count / by,
                 compute_getpage_count=self.compute_getpage_count / by,
                 pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total / by,
             )
@@ -202,11 +202,11 @@ def test_throughput(
             pageserver_metrics = ps_http.get_metrics()
             return Metrics(
                 time=time.time(),
-                pageserver_getpage_count=pageserver_metrics.query_one(
-                    "pageserver_smgr_query_seconds_count", {"smgr_query_type": "get_page_at_lsn"}
+                pageserver_batch_size_histo_sum=pageserver_metrics.query_one(
+                    "pageserver_page_service_batch_size_sum"
                 ).value,
-                pageserver_vectored_get_count=pageserver_metrics.query_one(
-                    "pageserver_get_vectored_seconds_count", {"task_kind": "PageRequestHandler"}
+                pageserver_batch_size_histo_count=pageserver_metrics.query_one(
+                    "pageserver_page_service_batch_size_count"
                 ).value,
                 compute_getpage_count=compute_getpage_count,
                 pageserver_cpu_seconds_total=pageserver_metrics.query_one(
@@ -243,7 +243,7 @@ def test_throughput(
     # Sanity-checks on the collected data
     #
     # assert that getpage counts roughly match between compute and ps
-    assert metrics.pageserver_getpage_count == pytest.approx(
+    assert metrics.pageserver_batch_size_histo_sum == pytest.approx(
         metrics.compute_getpage_count, rel=0.01
     )
 
@@ -256,7 +256,7 @@ def test_throughput(
 
     zenbenchmark.record(
         "perfmetric.batching_factor",
-        metrics.pageserver_getpage_count / metrics.pageserver_vectored_get_count,
+        metrics.pageserver_batch_size_histo_sum / metrics.pageserver_batch_size_histo_count,
         unit="",
         report=MetricReport.HIGHER_IS_BETTER,
     )
diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py
index ba6a1d9045..62aec50a9e 100644
--- a/test_runner/regress/test_pageserver_getpage_throttle.py
+++ b/test_runner/regress/test_pageserver_getpage_throttle.py
@@ -4,6 +4,7 @@ import copy
 import json
 import uuid
 
+import pytest
 from anyio import Path
 from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
@@ -70,14 +71,21 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
 
     log.info("warmup / make sure metrics are present")
     run_pagebench_at_max_speed_and_get_total_requests_completed(2)
-    metrics_query = {
+    smgr_metrics_query = {
         "tenant_id": str(tenant_id),
         "timeline_id": str(timeline_id),
         "smgr_query_type": "get_page_at_lsn",
     }
-    metric_name = "pageserver_smgr_query_seconds_sum"
-    smgr_query_seconds_pre = ps_http.get_metric_value(metric_name, metrics_query)
+    smgr_metric_name = "pageserver_smgr_query_seconds_sum"
+    throttle_metrics_query = {
+        "tenant_id": str(tenant_id),
+    }
+    throttle_metric_name = "pageserver_tenant_throttling_wait_usecs_sum_total"
+
+    smgr_query_seconds_pre = ps_http.get_metric_value(smgr_metric_name, smgr_metrics_query)
     assert smgr_query_seconds_pre is not None
+    throttled_usecs_pre = ps_http.get_metric_value(throttle_metric_name, throttle_metrics_query)
+    assert throttled_usecs_pre is not None
 
     marker = uuid.uuid4().hex
     ps_http.post_tracing_event("info", marker)
@@ -108,14 +116,23 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
         timeout=compaction_period,
     )
 
-    log.info("validate that the metric doesn't include throttle wait time")
-    smgr_query_seconds_post = ps_http.get_metric_value(metric_name, metrics_query)
+    log.info("the smgr metric includes throttle time")
+    smgr_query_seconds_post = ps_http.get_metric_value(smgr_metric_name, smgr_metrics_query)
     assert smgr_query_seconds_post is not None
+    throttled_usecs_post = ps_http.get_metric_value(throttle_metric_name, throttle_metrics_query)
+    assert throttled_usecs_post is not None
     actual_smgr_query_seconds = smgr_query_seconds_post - smgr_query_seconds_pre
+    actual_throttled_usecs = throttled_usecs_post - throttled_usecs_pre
+    actual_throttled_secs = actual_throttled_usecs / 1_000_000
 
     assert (
-        duration_secs >= 10 * actual_smgr_query_seconds
-    ), "smgr metrics should not include throttle wait time"
+        pytest.approx(duration_secs, 0.1) == actual_smgr_query_seconds
+    ), "smgr metrics include throttle wait time"
+    smgr_ex_throttle = actual_smgr_query_seconds - actual_throttled_secs
+    assert smgr_ex_throttle > 0
+    assert (
+        duration_secs > 10 * smgr_ex_throttle
+    ), "most of the time in this test is spent throttled because the rate-limit's contribution to latency dominates"
 
 
 throttle_config_with_field_fair_set = {

From a2a942f93cedf6cbba6ea3184d39dfffe250dd47 Mon Sep 17 00:00:00 2001
From: a-masterov <72613290+a-masterov@users.noreply.github.com>
Date: Tue, 3 Dec 2024 12:25:29 +0100
Subject: [PATCH 036/117] Add support for the extensions test for Postgres v17
 (#9748)

## Problem
The extensions for Postgres v17 are ready but we do not test the
extensions shipped with v17
## Summary of changes
Build the test image based on Postgres v17. Run the tests for v17.

---------

Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
---
 .github/workflows/build_and_test.yml          |  15 +-
 compute/compute-node.Dockerfile               |  42 ++---
 ...hint_plan.patch => pg_hint_plan_v16.patch} |   0
 compute/patches/pg_hint_plan_v17.patch        | 174 ++++++++++++++++++
 docker-compose/compute_wrapper/Dockerfile     |   6 +-
 docker-compose/docker_compose_test.sh         |  27 ++-
 6 files changed, 219 insertions(+), 45 deletions(-)
 rename compute/patches/{pg_hint_plan.patch => pg_hint_plan_v16.patch} (100%)
 create mode 100644 compute/patches/pg_hint_plan_v17.patch

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 9830c2a0c9..e9e111e7bd 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -669,7 +669,7 @@ jobs:
             neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }}
 
       - name: Build neon extensions test image
-        if: matrix.version.pg == 'v16'
+        if: matrix.version.pg >= 'v16'
         uses: docker/build-push-action@v6
         with:
           context: .
@@ -684,8 +684,7 @@ jobs:
           pull: true
           file: compute/compute-node.Dockerfile
           target: neon-pg-ext-test
-          cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
-          cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }}
+          cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
           tags: |
             neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }}
 
@@ -708,7 +707,7 @@ jobs:
           push: true
           pull: true
           file: compute/compute-node.Dockerfile
-          cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
+          cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }}
           cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-tools-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }}
           tags: |
             neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }}
@@ -744,7 +743,7 @@ jobs:
                                              neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64
 
       - name: Create multi-arch neon-test-extensions image
-        if: matrix.version.pg == 'v16'
+        if: matrix.version.pg >= 'v16'
         run: |
           docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \
                                           -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \
@@ -833,6 +832,7 @@ jobs:
       fail-fast: false
       matrix:
         arch: [ x64, arm64 ]
+        pg_version: [v16, v17]
 
     runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
 
@@ -871,7 +871,10 @@ jobs:
 
       - name: Verify docker-compose example and test extensions
         timeout-minutes: 20
-        run: env TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh
+        env:
+          TAG: ${{needs.tag.outputs.build-tag}}
+          TEST_VERSION_ONLY: ${{ matrix.pg_version }}
+        run: ./docker-compose/docker_compose_test.sh
 
       - name: Print logs and clean up
         if: always()
diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile
index 222a0cb88b..bf6311bf2b 100644
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -1367,15 +1367,12 @@ RUN make PG_VERSION="${PG_VERSION}" -C compute
 
 FROM neon-pg-ext-build AS neon-pg-ext-test
 ARG PG_VERSION
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    mkdir /ext-src
+RUN mkdir /ext-src
 
 #COPY --from=postgis-build /postgis.tar.gz /ext-src/
 #COPY --from=postgis-build /sfcgal/* /usr
 COPY --from=plv8-build /plv8.tar.gz /ext-src/
-COPY --from=h3-pg-build /h3-pg.tar.gz /ext-src/
+#COPY --from=h3-pg-build /h3-pg.tar.gz /ext-src/
 COPY --from=unit-pg-build /postgresql-unit.tar.gz /ext-src/
 COPY --from=vector-pg-build /pgvector.tar.gz /ext-src/
 COPY --from=vector-pg-build /pgvector.patch /ext-src/
@@ -1395,7 +1392,7 @@ COPY --from=hll-pg-build /hll.tar.gz /ext-src
 COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src
 #COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src
 COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src
-COPY compute/patches/pg_hint_plan.patch /ext-src
+COPY compute/patches/pg_hint_plan_${PG_VERSION}.patch /ext-src
 COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
 COPY compute/patches/pg_cron.patch /ext-src
 #COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
@@ -1405,38 +1402,23 @@ COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src
 COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src
 #COPY --from=pg-embedding-pg-build /home/nonroot/pg_embedding-src/ /ext-src
 #COPY --from=wal2json-pg-build /wal2json_2_5.tar.gz /ext-src
-COPY --from=pg-anon-pg-build /pg_anon.tar.gz /ext-src
+#pg_anon is not supported yet for pg v17 so, don't fail if nothing found
+COPY --from=pg-anon-pg-build /pg_anon.tar.g? /ext-src
 COPY compute/patches/pg_anon.patch /ext-src
 COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src
 COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    cd /ext-src/ && for f in *.tar.gz; \
+RUN cd /ext-src/ && for f in *.tar.gz; \
     do echo $f; dname=$(echo $f | sed 's/\.tar.*//')-src; \
     rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \
     || exit 1; rm -f $f; done
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    cd /ext-src/rum-src && patch -p1 <../rum.patch
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan.patch
+RUN cd /ext-src/rum-src && patch -p1 <../rum.patch
+RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
+RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan_${PG_VERSION}.patch
 COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
 RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    patch -p1 </ext-src/pg_anon.patch
-RUN case "${PG_VERSION}" in "v17") \
-    echo "v17 extensions are not supported yet. Quit" && exit 0;; \
-    esac && \
-    patch -p1 </ext-src/pg_cron.patch
+    echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \
+    esac && patch -p1 </ext-src/pg_anon.patch
+RUN patch -p1 </ext-src/pg_cron.patch
 ENV PATH=/usr/local/pgsql/bin:$PATH
 ENV PGHOST=compute
 ENV PGPORT=55433
diff --git a/compute/patches/pg_hint_plan.patch b/compute/patches/pg_hint_plan_v16.patch
similarity index 100%
rename from compute/patches/pg_hint_plan.patch
rename to compute/patches/pg_hint_plan_v16.patch
diff --git a/compute/patches/pg_hint_plan_v17.patch b/compute/patches/pg_hint_plan_v17.patch
new file mode 100644
index 0000000000..dbf4e470ea
--- /dev/null
+++ b/compute/patches/pg_hint_plan_v17.patch
@@ -0,0 +1,174 @@
+diff --git a/expected/ut-A.out b/expected/ut-A.out
+index e7d68a1..65a056c 100644
+--- a/expected/ut-A.out
++++ b/expected/ut-A.out
+@@ -9,13 +9,16 @@ SET search_path TO public;
+ ----
+ -- No.A-1-1-3
+ CREATE EXTENSION pg_hint_plan;
++LOG:  Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
+ -- No.A-1-2-3
+ DROP EXTENSION pg_hint_plan;
+ -- No.A-1-1-4
+ CREATE SCHEMA other_schema;
+ CREATE EXTENSION pg_hint_plan SCHEMA other_schema;
++LOG:  Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
+ ERROR:  extension "pg_hint_plan" must be installed in schema "hint_plan"
+ CREATE EXTENSION pg_hint_plan;
++LOG:  Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan
+ DROP SCHEMA other_schema;
+ ----
+ ---- No. A-5-1 comment pattern
+diff --git a/expected/ut-J.out b/expected/ut-J.out
+index 2fa3c70..314e929 100644
+--- a/expected/ut-J.out
++++ b/expected/ut-J.out
+@@ -789,38 +789,6 @@ NestLoop(st1 st2)
+ MergeJoin(t1 t2)
+ not used hint:
+ duplication hint:
+-error hint:
+-
+-LOG:  pg_hint_plan:
+-used hint:
+-not used hint:
+-NestLoop(st1 st2)
+-MergeJoin(t1 t2)
+-duplication hint:
+-error hint:
+-
+-LOG:  pg_hint_plan:
+-used hint:
+-not used hint:
+-NestLoop(st1 st2)
+-MergeJoin(t1 t2)
+-duplication hint:
+-error hint:
+-
+-LOG:  pg_hint_plan:
+-used hint:
+-not used hint:
+-NestLoop(st1 st2)
+-MergeJoin(t1 t2)
+-duplication hint:
+-error hint:
+-
+-LOG:  pg_hint_plan:
+-used hint:
+-not used hint:
+-NestLoop(st1 st2)
+-MergeJoin(t1 t2)
+-duplication hint:
+ error hint:
+ 
+                                        explain_filter                                        
+diff --git a/expected/ut-S.out b/expected/ut-S.out
+index 0bfcfb8..e75f581 100644
+--- a/expected/ut-S.out
++++ b/expected/ut-S.out
+@@ -4415,34 +4415,6 @@ used hint:
+ IndexScan(ti1 ti1_pred)
+ not used hint:
+ duplication hint:
+-error hint:
+-
+-LOG:  pg_hint_plan:
+-used hint:
+-not used hint:
+-IndexScan(ti1 ti1_pred)
+-duplication hint:
+-error hint:
+-
+-LOG:  pg_hint_plan:
+-used hint:
+-not used hint:
+-IndexScan(ti1 ti1_pred)
+-duplication hint:
+-error hint:
+-
+-LOG:  pg_hint_plan:
+-used hint:
+-not used hint:
+-IndexScan(ti1 ti1_pred)
+-duplication hint:
+-error hint:
+-
+-LOG:  pg_hint_plan:
+-used hint:
+-not used hint:
+-IndexScan(ti1 ti1_pred)
+-duplication hint:
+ error hint:
+ 
+                     explain_filter                     
+diff --git a/expected/ut-W.out b/expected/ut-W.out
+index a09bd34..0ad227c 100644
+--- a/expected/ut-W.out
++++ b/expected/ut-W.out
+@@ -1341,54 +1341,6 @@ IndexScan(ft1)
+ IndexScan(t)
+ Parallel(s1 3 hard)
+ duplication hint:
+-error hint:
+-
+-LOG:  pg_hint_plan:
+-used hint:
+-not used hint:
+-IndexScan(*VALUES*)
+-SeqScan(cte1)
+-IndexScan(ft1)
+-IndexScan(t)
+-Parallel(p1 5 hard)
+-Parallel(s1 3 hard)
+-duplication hint:
+-error hint:
+-
+-LOG:  pg_hint_plan:
+-used hint:
+-not used hint:
+-IndexScan(*VALUES*)
+-SeqScan(cte1)
+-IndexScan(ft1)
+-IndexScan(t)
+-Parallel(p1 5 hard)
+-Parallel(s1 3 hard)
+-duplication hint:
+-error hint:
+-
+-LOG:  pg_hint_plan:
+-used hint:
+-not used hint:
+-IndexScan(*VALUES*)
+-SeqScan(cte1)
+-IndexScan(ft1)
+-IndexScan(t)
+-Parallel(p1 5 hard)
+-Parallel(s1 3 hard)
+-duplication hint:
+-error hint:
+-
+-LOG:  pg_hint_plan:
+-used hint:
+-not used hint:
+-IndexScan(*VALUES*)
+-SeqScan(cte1)
+-IndexScan(ft1)
+-IndexScan(t)
+-Parallel(p1 5 hard)
+-Parallel(s1 3 hard)
+-duplication hint:
+ error hint:
+ 
+                     explain_filter                    
+diff --git a/expected/ut-fdw.out b/expected/ut-fdw.out
+index 017fa4b..98d989b 100644
+--- a/expected/ut-fdw.out
++++ b/expected/ut-fdw.out
+@@ -7,6 +7,7 @@ SET pg_hint_plan.debug_print TO on;
+ SET client_min_messages TO LOG;
+ SET pg_hint_plan.enable_hint TO on;
+ CREATE EXTENSION file_fdw;
++LOG:  Sending request to compute_ctl: http://localhost:3080/extension_server/file_fdw
+ CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw;
+ CREATE USER MAPPING FOR PUBLIC SERVER file_server;
+ CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename');
diff --git a/docker-compose/compute_wrapper/Dockerfile b/docker-compose/compute_wrapper/Dockerfile
index 8378f37b48..05a2cf124c 100644
--- a/docker-compose/compute_wrapper/Dockerfile
+++ b/docker-compose/compute_wrapper/Dockerfile
@@ -4,14 +4,16 @@ ARG TAG=latest
 
 FROM $REPOSITORY/${COMPUTE_IMAGE}:$TAG
 
+ARG COMPUTE_IMAGE
+
 USER root
 RUN apt-get update &&       \
     apt-get install -y curl \
                        jq   \
                        python3-pip \
-                       netcat
+                       netcat-openbsd
 #Faker is required for the pg_anon test
-RUN pip3 install Faker
+RUN case $COMPUTE_IMAGE in compute-node-v17) OPT="--break-system-packages";; *) OPT= ;; esac && pip3 install $OPT Faker
 #This is required for the pg_hintplan test
 RUN mkdir -p /ext-src/pg_hint_plan-src && chown postgres /ext-src/pg_hint_plan-src 
 
diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh
index 10805a9952..c97dfaa901 100755
--- a/docker-compose/docker_compose_test.sh
+++ b/docker-compose/docker_compose_test.sh
@@ -30,10 +30,17 @@ cleanup() {
     docker compose --profile test-extensions -f $COMPOSE_FILE down
 }
 
-for pg_version in 14 15 16; do
+for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
+    pg_version=${pg_version/v/}
     echo "clean up containers if exists"
     cleanup
-    PG_TEST_VERSION=$(($pg_version < 16 ? 16 : $pg_version))
+    PG_TEST_VERSION=$((pg_version < 16 ? 16 : pg_version))
+    # The support of pg_anon not yet added to PG17, so we have to remove the corresponding option
+    if [ $pg_version -eq 17 ]; then
+      SPEC_PATH="compute_wrapper/var/db/postgres/specs"
+      mv $SPEC_PATH/spec.json $SPEC_PATH/spec.bak
+      jq 'del(.cluster.settings[] | select (.name == "session_preload_libraries"))' $SPEC_PATH/spec.bak > $SPEC_PATH/spec.json
+    fi
     PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose --profile test-extensions -f $COMPOSE_FILE up --build -d
 
     echo "wait until the compute is ready. timeout after 60s. "
@@ -54,8 +61,7 @@ for pg_version in 14 15 16; do
         fi
     done
 
-    if [ $pg_version -ge 16 ]
-    then
+    if [ $pg_version -ge 16 ]; then
         echo Enabling trust connection
         docker exec $COMPUTE_CONTAINER_NAME bash -c "sed -i '\$d' /var/db/postgres/compute/pg_hba.conf && echo -e 'host\t all\t all\t all\t trust' >> /var/db/postgres/compute/pg_hba.conf && psql $PSQL_OPTION -c 'select pg_reload_conf()' "
         echo Adding postgres role
@@ -68,10 +74,13 @@ for pg_version in 14 15 16; do
         # The test assumes that it is running on the same host with the postgres engine.
         # In our case it's not true, that's why we are copying files to the compute node
         TMPDIR=$(mktemp -d)
-        docker cp $TEST_CONTAINER_NAME:/ext-src/pg_anon-src/data $TMPDIR/data
-        echo -e '1\t too \t many \t tabs' > $TMPDIR/data/bad.csv
-        docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/tmp/tmp_anon_alternate_data
+        # Add support for pg_anon for pg_v16
+        if [ $pg_version -ne 17 ]; then
+          docker cp $TEST_CONTAINER_NAME:/ext-src/pg_anon-src/data $TMPDIR/data
+          echo -e '1\t too \t many \t tabs' > $TMPDIR/data/bad.csv
+          docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/tmp/tmp_anon_alternate_data
         rm -rf $TMPDIR
+        fi
         TMPDIR=$(mktemp -d)
         # The following block does the same for the pg_hintplan test
         docker cp $TEST_CONTAINER_NAME:/ext-src/pg_hint_plan-src/data $TMPDIR/data
@@ -97,4 +106,8 @@ for pg_version in 14 15 16; do
         fi
     fi
     cleanup
+    # The support of pg_anon not yet added to PG17, so we have to remove the corresponding option
+    if [ $pg_version -eq 17 ]; then
+      mv $SPEC_PATH/spec.bak $SPEC_PATH/spec.json
+    fi
 done

From dcb24ce170573a2ae6ed29467669d03c73b589e6 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Tue, 3 Dec 2024 12:35:59 +0100
Subject: [PATCH 037/117] safekeeper,pageserver: add heap profiling (#9778)

## Problem

We don't have good observability for memory usage. This would be useful
e.g. to debug OOM incidents or optimize performance or resource usage.

We would also like to use continuous profiling with e.g. [Grafana Cloud
Profiles](https://grafana.com/products/cloud/profiles-for-continuous-profiling/)
(see https://github.com/neondatabase/cloud/issues/14888).

This PR is intended as a proof of concept, to try it out in staging and
drive further discussions about profiling more broadly.

Touches https://github.com/neondatabase/neon/issues/9534.
Touches https://github.com/neondatabase/cloud/issues/14888.
Depends on #9779.
Depends on #9780.

## Summary of changes

Adds a HTTP route `/profile/heap` that takes a heap profile and returns
it. Query parameters:

* `format`: output format (`jemalloc` or `pprof`; default `pprof`).

Unlike CPU profiles (see #9764), heap profiles are not symbolized and
require the original binary to translate addresses to function names. To
make this work with Grafana, we'll probably have to symbolize the
process server-side -- this is left as future work, as is other output
formats like SVG.

Heap profiles don't work on macOS due to limitations in jemalloc.
---
 Cargo.lock                        | 89 ++++++++++++++++++++++++-------
 Cargo.toml                        |  3 +-
 libs/utils/Cargo.toml             |  1 +
 libs/utils/src/http/endpoint.rs   | 64 ++++++++++++++++++++++
 pageserver/src/bin/pageserver.rs  |  5 ++
 pageserver/src/http/routes.rs     |  8 +--
 safekeeper/benches/receive_wal.rs |  6 +++
 safekeeper/src/bin/safekeeper.rs  |  5 ++
 safekeeper/src/http/routes.rs     |  7 ++-
 workspace_hack/Cargo.toml         | 15 ++++--
 10 files changed, 175 insertions(+), 28 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index ba02e3b11d..b2769e59f0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -301,7 +301,7 @@ dependencies = [
  "aws-smithy-types",
  "aws-types",
  "bytes",
- "fastrand 2.0.0",
+ "fastrand 2.2.0",
  "hex",
  "http 0.2.9",
  "hyper 0.14.30",
@@ -341,7 +341,7 @@ dependencies = [
  "aws-smithy-types",
  "aws-types",
  "bytes",
- "fastrand 2.0.0",
+ "fastrand 2.2.0",
  "http 0.2.9",
  "http-body 0.4.5",
  "once_cell",
@@ -417,7 +417,7 @@ dependencies = [
  "aws-smithy-xml",
  "aws-types",
  "bytes",
- "fastrand 2.0.0",
+ "fastrand 2.2.0",
  "hex",
  "hmac",
  "http 0.2.9",
@@ -621,7 +621,7 @@ dependencies = [
  "aws-smithy-runtime-api",
  "aws-smithy-types",
  "bytes",
- "fastrand 2.0.0",
+ "fastrand 2.2.0",
  "h2 0.3.26",
  "http 0.2.9",
  "http-body 0.4.5",
@@ -2054,9 +2054,9 @@ dependencies = [
 
 [[package]]
 name = "fastrand"
-version = "2.0.0"
+version = "2.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6999dc1837253364c2ebb0704ba97994bd874e8f195d665c50b7548f6ea92764"
+checksum = "486f806e73c5707928240ddc295403b1b93c96a02038563881c4a2fd84b81ac4"
 
 [[package]]
 name = "ff"
@@ -2912,6 +2912,23 @@ version = "1.0.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
 
+[[package]]
+name = "jemalloc_pprof"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1a883828bd6a4b957cd9f618886ff19e5f3ebd34e06ba0e855849e049fef32fb"
+dependencies = [
+ "anyhow",
+ "libc",
+ "mappings",
+ "once_cell",
+ "pprof_util",
+ "tempfile",
+ "tikv-jemalloc-ctl",
+ "tokio",
+ "tracing",
+]
+
 [[package]]
 name = "jobserver"
 version = "0.1.32"
@@ -3022,9 +3039,9 @@ dependencies = [
 
 [[package]]
 name = "libc"
-version = "0.2.150"
+version = "0.2.167"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c"
+checksum = "09d6582e104315a817dff97f75133544b2e094ee22447d2acf4a74e189ba06fc"
 
 [[package]]
 name = "libloading"
@@ -3044,9 +3061,9 @@ checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
 
 [[package]]
 name = "linux-raw-sys"
-version = "0.4.13"
+version = "0.4.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c"
+checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89"
 
 [[package]]
 name = "linux-raw-sys"
@@ -3079,6 +3096,19 @@ dependencies = [
  "hashbrown 0.14.5",
 ]
 
+[[package]]
+name = "mappings"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ce9229c438fbf1c333926e2053c4c091feabbd40a1b590ec62710fea2384af9e"
+dependencies = [
+ "anyhow",
+ "libc",
+ "once_cell",
+ "pprof_util",
+ "tracing",
+]
+
 [[package]]
 name = "matchers"
 version = "0.1.0"
@@ -3346,6 +3376,7 @@ version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b05180d69e3da0e530ba2a1dae5110317e49e3b7f3d41be227dc5f92e49ee7af"
 dependencies = [
+ "num-bigint",
  "num-complex",
  "num-integer",
  "num-iter",
@@ -3434,6 +3465,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0"
 dependencies = [
  "autocfg",
+ "num-bigint",
  "num-integer",
  "num-traits",
 ]
@@ -3497,9 +3529,9 @@ dependencies = [
 
 [[package]]
 name = "once_cell"
-version = "1.18.0"
+version = "1.20.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d"
+checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
 
 [[package]]
 name = "oorandom"
@@ -4298,6 +4330,19 @@ dependencies = [
  "thiserror",
 ]
 
+[[package]]
+name = "pprof_util"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "65c568b3f8c1c37886ae07459b1946249e725c315306b03be5632f84c239f781"
+dependencies = [
+ "anyhow",
+ "flate2",
+ "num",
+ "paste",
+ "prost",
+]
+
 [[package]]
 name = "ppv-lite86"
 version = "0.2.17"
@@ -5220,14 +5265,14 @@ dependencies = [
 
 [[package]]
 name = "rustix"
-version = "0.38.28"
+version = "0.38.41"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316"
+checksum = "d7f649912bc1495e167a6edee79151c84b1bad49748cb4f1f1167f459f6224f6"
 dependencies = [
  "bitflags 2.4.1",
  "errno",
  "libc",
- "linux-raw-sys 0.4.13",
+ "linux-raw-sys 0.4.14",
  "windows-sys 0.52.0",
 ]
 
@@ -6251,13 +6296,13 @@ dependencies = [
 
 [[package]]
 name = "tempfile"
-version = "3.9.0"
+version = "3.14.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa"
+checksum = "28cce251fcbc87fac86a866eeb0d6c2d536fc16d06f184bb61aeae11aa4cee0c"
 dependencies = [
  "cfg-if",
- "fastrand 2.0.0",
- "redox_syscall 0.4.1",
+ "fastrand 2.2.0",
+ "once_cell",
  "rustix",
  "windows-sys 0.52.0",
 ]
@@ -7058,6 +7103,7 @@ dependencies = [
  "hex-literal",
  "humantime",
  "hyper 0.14.30",
+ "jemalloc_pprof",
  "jsonwebtoken",
  "metrics",
  "nix 0.27.1",
@@ -7644,8 +7690,12 @@ dependencies = [
  "memchr",
  "nix 0.26.4",
  "nom",
+ "num",
  "num-bigint",
+ "num-complex",
  "num-integer",
+ "num-iter",
+ "num-rational",
  "num-traits",
  "once_cell",
  "parquet",
@@ -7669,6 +7719,7 @@ dependencies = [
  "subtle",
  "syn 2.0.90",
  "sync_wrapper 0.1.2",
+ "tikv-jemalloc-ctl",
  "tikv-jemalloc-sys",
  "time",
  "time-macros",
diff --git a/Cargo.toml b/Cargo.toml
index 036dc01057..91fa6a2607 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -115,6 +115,7 @@ indoc = "2"
 ipnet = "2.10.0"
 itertools = "0.10"
 itoa = "1.0.11"
+jemalloc_pprof = "0.6"
 jsonwebtoken = "9"
 lasso = "0.7"
 libc = "0.2"
@@ -175,7 +176,7 @@ sync_wrapper = "0.1.2"
 tar = "0.4"
 test-context = "0.3"
 thiserror = "1.0"
-tikv-jemallocator = { version = "0.6", features = ["stats"] }
+tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] }
 tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] }
 tokio = { version = "1.17", features = ["macros"] }
 tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml
index 5648072a83..66500fb141 100644
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -26,6 +26,7 @@ humantime.workspace = true
 hyper0 = { workspace = true, features = ["full"] }
 fail.workspace = true
 futures = { workspace = true}
+jemalloc_pprof.workspace = true
 jsonwebtoken.workspace = true
 nix.workspace = true
 once_cell.workspace = true
diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs
index 6a85f0ddeb..d975b63677 100644
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -10,6 +10,7 @@ use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
 use once_cell::sync::Lazy;
 use routerify::ext::RequestExt;
 use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
+use tokio_util::io::ReaderStream;
 use tracing::{debug, info, info_span, warn, Instrument};
 
 use std::future::Future;
@@ -407,6 +408,69 @@ pub async fn profile_cpu_handler(req: Request<Body>) -> Result<Response<Body>, A
     }
 }
 
+/// Generates heap profiles.
+///
+/// This only works with jemalloc on Linux.
+pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    enum Format {
+        Jemalloc,
+        Pprof,
+    }
+
+    // Parameters.
+    let format = match get_query_param(&req, "format")?.as_deref() {
+        None => Format::Pprof,
+        Some("jemalloc") => Format::Jemalloc,
+        Some("pprof") => Format::Pprof,
+        Some(format) => return Err(ApiError::BadRequest(anyhow!("invalid format {format}"))),
+    };
+
+    // Obtain profiler handle.
+    let mut prof_ctl = jemalloc_pprof::PROF_CTL
+        .as_ref()
+        .ok_or(ApiError::InternalServerError(anyhow!(
+            "heap profiling not enabled"
+        )))?
+        .lock()
+        .await;
+    if !prof_ctl.activated() {
+        return Err(ApiError::InternalServerError(anyhow!(
+            "heap profiling not enabled"
+        )));
+    }
+
+    // Take and return the profile.
+    match format {
+        Format::Jemalloc => {
+            // NB: file is an open handle to a tempfile that's already deleted.
+            let file = tokio::task::spawn_blocking(move || prof_ctl.dump())
+                .await
+                .map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
+                .map_err(ApiError::InternalServerError)?;
+            let stream = ReaderStream::new(tokio::fs::File::from_std(file));
+            Response::builder()
+                .status(200)
+                .header(CONTENT_TYPE, "application/octet-stream")
+                .header(CONTENT_DISPOSITION, "attachment; filename=\"heap.dump\"")
+                .body(Body::wrap_stream(stream))
+                .map_err(|err| ApiError::InternalServerError(err.into()))
+        }
+
+        Format::Pprof => {
+            let data = tokio::task::spawn_blocking(move || prof_ctl.dump_pprof())
+                .await
+                .map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
+                .map_err(ApiError::InternalServerError)?;
+            Response::builder()
+                .status(200)
+                .header(CONTENT_TYPE, "application/octet-stream")
+                .header(CONTENT_DISPOSITION, "attachment; filename=\"heap.pb\"")
+                .body(Body::from(data))
+                .map_err(|err| ApiError::InternalServerError(err.into()))
+        }
+    }
+}
+
 pub fn add_request_id_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
 ) -> Middleware<B, ApiError> {
     Middleware::pre(move |req| async move {
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 31f4370855..8fe225c6aa 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -53,6 +53,11 @@ project_build_tag!(BUILD_TAG);
 #[global_allocator]
 static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
 
+/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20).
+#[allow(non_upper_case_globals)]
+#[export_name = "malloc_conf"]
+pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0";
+
 const PID_FILE_NAME: &str = "pageserver.pid";
 
 const FEATURES: &[&str] = &[
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index ceb1c3b012..e127871549 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -56,9 +56,9 @@ use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::auth::JwtAuth;
 use utils::failpoint_support::failpoints_handler;
-use utils::http::endpoint::profile_cpu_handler;
-use utils::http::endpoint::prometheus_metrics_handler;
-use utils::http::endpoint::request_span;
+use utils::http::endpoint::{
+    profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span,
+};
 use utils::http::request::must_parse_query_param;
 use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
 
@@ -155,6 +155,7 @@ impl State {
             "/swagger.yml",
             "/metrics",
             "/profile/cpu",
+            "/profile/heap",
         ];
         Ok(Self {
             conf,
@@ -3203,6 +3204,7 @@ pub fn make_router(
         .data(state)
         .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
         .get("/profile/cpu", |r| request_span(r, profile_cpu_handler))
+        .get("/profile/heap", |r| request_span(r, profile_heap_handler))
         .get("/v1/status", |r| api_handler(r, status_handler))
         .put("/v1/failpoints", |r| {
             testing_api_handler("manage failpoints", r, failpoints_handler)
diff --git a/safekeeper/benches/receive_wal.rs b/safekeeper/benches/receive_wal.rs
index 8c4281cf52..313d945b94 100644
--- a/safekeeper/benches/receive_wal.rs
+++ b/safekeeper/benches/receive_wal.rs
@@ -24,9 +24,15 @@ const KB: usize = 1024;
 const MB: usize = 1024 * KB;
 const GB: usize = 1024 * MB;
 
+/// Use jemalloc, and configure it to sample allocations for profiles every 1 MB.
+/// This mirrors the configuration in bin/safekeeper.rs.
 #[global_allocator]
 static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
 
+#[allow(non_upper_case_globals)]
+#[export_name = "malloc_conf"]
+pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0";
+
 // Register benchmarks with Criterion.
 criterion_group!(
     name = benches;
diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index 3659bcd7e0..4dc7edef37 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -51,6 +51,11 @@ use utils::{
 #[global_allocator]
 static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
 
+/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20).
+#[allow(non_upper_case_globals)]
+#[export_name = "malloc_conf"]
+pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0";
+
 const PID_FILE_NAME: &str = "safekeeper.pid";
 const ID_FILE_NAME: &str = "safekeeper.id";
 
diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index 28294abdb9..69b775fd76 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -14,7 +14,8 @@ use tokio_util::sync::CancellationToken;
 use tracing::{info_span, Instrument};
 use utils::failpoint_support::failpoints_handler;
 use utils::http::endpoint::{
-    profile_cpu_handler, prometheus_metrics_handler, request_span, ChannelWriter,
+    profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span,
+    ChannelWriter,
 };
 use utils::http::request::parse_query_param;
 
@@ -573,7 +574,8 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
     let mut router = endpoint::make_router();
     if conf.http_auth.is_some() {
         router = router.middleware(auth_middleware(|request| {
-            const ALLOWLIST_ROUTES: &[&str] = &["/v1/status", "/metrics", "/profile/cpu"];
+            const ALLOWLIST_ROUTES: &[&str] =
+                &["/v1/status", "/metrics", "/profile/cpu", "profile/heap"];
             if ALLOWLIST_ROUTES.contains(&request.uri().path()) {
                 None
             } else {
@@ -594,6 +596,7 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
         .data(auth)
         .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
         .get("/profile/cpu", |r| request_span(r, profile_cpu_handler))
+        .get("/profile/heap", |r| request_span(r, profile_heap_handler))
         .get("/v1/status", |r| request_span(r, status_handler))
         .put("/v1/failpoints", |r| {
             request_span(r, move |r| async {
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index c0a3abc377..d19379aefd 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -55,12 +55,16 @@ log = { version = "0.4", default-features = false, features = ["std"] }
 memchr = { version = "2" }
 nix = { version = "0.26" }
 nom = { version = "7" }
+num = { version = "0.4" }
 num-bigint = { version = "0.4" }
+num-complex = { version = "0.4", default-features = false, features = ["std"] }
 num-integer = { version = "0.1", features = ["i128"] }
+num-iter = { version = "0.1", default-features = false, features = ["i128", "std"] }
+num-rational = { version = "0.4", default-features = false, features = ["num-bigint-std", "std"] }
 num-traits = { version = "0.2", features = ["i128", "libm"] }
 once_cell = { version = "1" }
 parquet = { version = "53", default-features = false, features = ["zstd"] }
-prost = { version = "0.13", features = ["prost-derive"] }
+prost = { version = "0.13", features = ["no-recursion-limit", "prost-derive"] }
 rand = { version = "0.8", features = ["small_rng"] }
 regex = { version = "1" }
 regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] }
@@ -76,7 +80,8 @@ smallvec = { version = "1", default-features = false, features = ["const_new", "
 spki = { version = "0.7", default-features = false, features = ["pem", "std"] }
 subtle = { version = "2" }
 sync_wrapper = { version = "0.1", default-features = false, features = ["futures"] }
-tikv-jemalloc-sys = { version = "0.6", features = ["stats"] }
+tikv-jemalloc-ctl = { version = "0.6", features = ["stats", "use_std"] }
+tikv-jemalloc-sys = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] }
 time = { version = "0.3", features = ["macros", "serde-well-known"] }
 tokio = { version = "1", features = ["full", "test-util"] }
 tokio-rustls = { version = "0.26", default-features = false, features = ["logging", "ring", "tls12"] }
@@ -111,14 +116,18 @@ libc = { version = "0.2", features = ["extra_traits", "use_std"] }
 log = { version = "0.4", default-features = false, features = ["std"] }
 memchr = { version = "2" }
 nom = { version = "7" }
+num = { version = "0.4" }
 num-bigint = { version = "0.4" }
+num-complex = { version = "0.4", default-features = false, features = ["std"] }
 num-integer = { version = "0.1", features = ["i128"] }
+num-iter = { version = "0.1", default-features = false, features = ["i128", "std"] }
+num-rational = { version = "0.4", default-features = false, features = ["num-bigint-std", "std"] }
 num-traits = { version = "0.2", features = ["i128", "libm"] }
 once_cell = { version = "1" }
 parquet = { version = "53", default-features = false, features = ["zstd"] }
 prettyplease = { version = "0.2", default-features = false, features = ["verbatim"] }
 proc-macro2 = { version = "1" }
-prost = { version = "0.13", features = ["prost-derive"] }
+prost = { version = "0.13", features = ["no-recursion-limit", "prost-derive"] }
 quote = { version = "1" }
 regex = { version = "1" }
 regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] }

From bbe4dfa99154b679371e0bdfa9d648d1ebdae2ee Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Tue, 3 Dec 2024 15:33:31 +0100
Subject: [PATCH 038/117] test_runner: use immediate shutdown in
 `test_sharded_ingest` (#9984)

## Problem

`test_sharded_ingest` ingests a lot of data, which can cause shutdown to
be slow e.g. due to local "S3 uploads" or compactions. This can cause
test flakes during teardown.

Resolves #9740.

## Summary of changes

Perform an immediate shutdown of the cluster.
---
 test_runner/performance/test_sharded_ingest.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/test_runner/performance/test_sharded_ingest.py b/test_runner/performance/test_sharded_ingest.py
index 4c21e799c8..94fd54bade 100644
--- a/test_runner/performance/test_sharded_ingest.py
+++ b/test_runner/performance/test_sharded_ingest.py
@@ -90,6 +90,7 @@ def test_sharded_ingest(
     # Start the endpoint.
     endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
     start_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0])
+
     # Ingest data and measure WAL volume and duration.
     with closing(endpoint.connect()) as conn:
         with conn.cursor() as cur:
@@ -104,6 +105,8 @@ def test_sharded_ingest(
                 wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
 
     end_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0])
+
+    # Record metrics.
     wal_written_mb = round((end_lsn - start_lsn) / (1024 * 1024))
     zenbenchmark.record("wal_written", wal_written_mb, "MB", MetricReport.TEST_PARAM)
 
@@ -152,3 +155,7 @@ def test_sharded_ingest(
     log.info(f"WAL ingested by each pageserver {ingested_by_ps}")
 
     assert tenant_get_shards(env, tenant_id) == shards, "shards moved"
+
+    # The pageservers can take a long time to shut down gracefully, presumably due to the upload
+    # queue or compactions or something. Just stop them immediately, we don't care.
+    env.stop(immediate=True)

From 4d422b937c40722f1f19373ade3fcba976cb96a0 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 3 Dec 2024 16:25:58 +0100
Subject: [PATCH 039/117] pageserver: only throttle pagestream requests & bring
 back throttling deduction for smgr latency metrics (#9962)

## Problem

In the batching PR
- https://github.com/neondatabase/neon/pull/9870

I stopped deducting the time-spent-in-throttle fro latency metrics,
i.e.,
- smgr latency metrics (`SmgrOpTimer`)
- basebackup latency (+scan latency, which I think is part of
basebackup).

The reason for stopping the deduction was that with the introduction of
batching, the trick with tracking time-spent-in-throttle inside
RequestContext and swap-replacing it from the `impl Drop for
SmgrOpTimer` no longer worked with >1 requests in a batch.

However, deducting time-spent-in-throttle is desirable because our
internal latency SLO definition does not account for throttling.

## Summary of changes

- Redefine throttling to be a page_service pagestream request throttle
instead of a throttle for repository `Key` reads through `Timeline::get`
/ `Timeline::get_vectored`.
- This means reads done by `basebackup` are no longer subject to any
throttle.
- The throttle applies after batching, before handling of the request.
- Drive-by fix: make throttle sensitive to cancellation.
- Rename metric label `kind` from `timeline_get` to `pagestream` to
reflect the new scope of throttling.

To avoid config format breakage, we leave the config field named
`timeline_get_throttle` and ignore the `task_kinds` field.
This will be cleaned up in a future PR.

## Trade-Offs

Ideally, we would apply the throttle before reading a request off the
connection, so that we queue the minimal amount of work inside the
process.
However, that's not possible because we need to do shard routing.

The redefinition of the throttle to limit pagestream request rate
instead of repository `Key` rate comes with several downsides:
- We're no longer able to use the throttle mechanism for other other
tasks, e.g. image layer creation.
  However, in practice, we never used that capability anyways.
- We no longer throttle basebackup.
---
 libs/pageserver_api/src/models.rs             | 58 ++++++++++-
 pageserver/src/metrics.rs                     | 95 ++++++++++++-------
 pageserver/src/page_service.rs                | 45 ++++++++-
 pageserver/src/tenant.rs                      | 20 ++--
 pageserver/src/tenant/tasks.rs                |  6 +-
 pageserver/src/tenant/throttle.rs             | 33 ++-----
 pageserver/src/tenant/timeline.rs             | 54 ++---------
 pageserver/src/tenant/timeline/delete.rs      |  2 +-
 .../test_pageserver_getpage_throttle.py       | 16 ++--
 9 files changed, 198 insertions(+), 131 deletions(-)

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 42c5d10c05..5488f7b2c2 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -501,7 +501,9 @@ pub struct EvictionPolicyLayerAccessThreshold {
 
 #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
 pub struct ThrottleConfig {
-    pub task_kinds: Vec<String>, // TaskKind
+    /// See [`ThrottleConfigTaskKinds`] for why we do the serde `rename`.
+    #[serde(rename = "task_kinds")]
+    pub enabled: ThrottleConfigTaskKinds,
     pub initial: u32,
     #[serde(with = "humantime_serde")]
     pub refill_interval: Duration,
@@ -509,10 +511,38 @@ pub struct ThrottleConfig {
     pub max: u32,
 }
 
+/// Before <https://github.com/neondatabase/neon/pull/9962>
+/// the throttle was a per `Timeline::get`/`Timeline::get_vectored` call.
+/// The `task_kinds` field controlled which Pageserver "Task Kind"s
+/// were subject to the throttle.
+///
+/// After that PR, the throttle is applied at pagestream request level
+/// and the `task_kinds` field does not apply since the only task kind
+/// that us subject to the throttle is that of the page service.
+///
+/// However, we don't want to make a breaking config change right now
+/// because it means we have to migrate all the tenant configs.
+/// This will be done in a future PR.
+///
+/// In the meantime, we use emptiness / non-emptsiness of the `task_kinds`
+/// field to determine if the throttle is enabled or not.
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
+#[serde(transparent)]
+pub struct ThrottleConfigTaskKinds(Vec<String>);
+
+impl ThrottleConfigTaskKinds {
+    pub fn disabled() -> Self {
+        Self(vec![])
+    }
+    pub fn is_enabled(&self) -> bool {
+        !self.0.is_empty()
+    }
+}
+
 impl ThrottleConfig {
     pub fn disabled() -> Self {
         Self {
-            task_kinds: vec![], // effectively disables the throttle
+            enabled: ThrottleConfigTaskKinds::disabled(),
             // other values don't matter with emtpy `task_kinds`.
             initial: 0,
             refill_interval: Duration::from_millis(1),
@@ -526,6 +556,30 @@ impl ThrottleConfig {
     }
 }
 
+#[cfg(test)]
+mod throttle_config_tests {
+    use super::*;
+
+    #[test]
+    fn test_disabled_is_disabled() {
+        let config = ThrottleConfig::disabled();
+        assert!(!config.enabled.is_enabled());
+    }
+    #[test]
+    fn test_enabled_backwards_compat() {
+        let input = serde_json::json!({
+            "task_kinds": ["PageRequestHandler"],
+            "initial": 40000,
+            "refill_interval": "50ms",
+            "refill_amount": 1000,
+            "max": 40000,
+            "fair": true
+        });
+        let config: ThrottleConfig = serde_json::from_value(input).unwrap();
+        assert!(config.enabled.is_enabled());
+    }
+}
+
 /// A flattened analog of a `pagesever::tenant::LocationMode`, which
 /// lists out all possible states (and the virtual "Detached" state)
 /// in a flat form rather than using rust-style enums.
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index d04fae7627..998c15ccaf 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -217,31 +217,16 @@ impl<'a> ScanLatencyOngoingRecording<'a> {
         ScanLatencyOngoingRecording { parent, start }
     }
 
-    pub(crate) fn observe(self, throttled: Option<Duration>) {
+    pub(crate) fn observe(self) {
         let elapsed = self.start.elapsed();
-        let ex_throttled = if let Some(throttled) = throttled {
-            elapsed.checked_sub(throttled)
-        } else {
-            Some(elapsed)
-        };
-        if let Some(ex_throttled) = ex_throttled {
-            self.parent.observe(ex_throttled.as_secs_f64());
-        } else {
-            use utils::rate_limit::RateLimit;
-            static LOGGED: Lazy<Mutex<RateLimit>> =
-                Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
-            let mut rate_limit = LOGGED.lock().unwrap();
-            rate_limit.call(|| {
-                warn!("error deducting time spent throttled; this message is logged at a global rate limit");
-            });
-        }
+        self.parent.observe(elapsed.as_secs_f64());
     }
 }
 
 pub(crate) static GET_VECTORED_LATENCY: Lazy<GetVectoredLatency> = Lazy::new(|| {
     let inner = register_histogram_vec!(
         "pageserver_get_vectored_seconds",
-        "Time spent in get_vectored, excluding time spent in timeline_get_throttle.",
+        "Time spent in get_vectored.",
         &["task_kind"],
         CRITICAL_OP_BUCKETS.into(),
     )
@@ -264,7 +249,7 @@ pub(crate) static GET_VECTORED_LATENCY: Lazy<GetVectoredLatency> = Lazy::new(||
 pub(crate) static SCAN_LATENCY: Lazy<ScanLatency> = Lazy::new(|| {
     let inner = register_histogram_vec!(
         "pageserver_scan_seconds",
-        "Time spent in scan, excluding time spent in timeline_get_throttle.",
+        "Time spent in scan.",
         &["task_kind"],
         CRITICAL_OP_BUCKETS.into(),
     )
@@ -1227,11 +1212,44 @@ pub(crate) struct SmgrOpTimer {
     per_timeline_latency_histo: Option<Histogram>,
 
     start: Instant,
+    throttled: Duration,
+    op: SmgrQueryType,
+}
+
+impl SmgrOpTimer {
+    pub(crate) fn deduct_throttle(&mut self, throttle: &Option<Duration>) {
+        let Some(throttle) = throttle else {
+            return;
+        };
+        self.throttled += *throttle;
+    }
 }
 
 impl Drop for SmgrOpTimer {
     fn drop(&mut self) {
-        let elapsed = self.start.elapsed().as_secs_f64();
+        let elapsed = self.start.elapsed();
+
+        let elapsed = match elapsed.checked_sub(self.throttled) {
+            Some(elapsed) => elapsed,
+            None => {
+                use utils::rate_limit::RateLimit;
+                static LOGGED: Lazy<Mutex<enum_map::EnumMap<SmgrQueryType, RateLimit>>> =
+                    Lazy::new(|| {
+                        Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| {
+                            RateLimit::new(Duration::from_secs(10))
+                        })))
+                    });
+                let mut guard = LOGGED.lock().unwrap();
+                let rate_limit = &mut guard[self.op];
+                rate_limit.call(|| {
+                    warn!(op=?self.op, ?elapsed, ?self.throttled, "implementation error: time spent throttled exceeds total request wall clock time");
+                });
+                elapsed // un-throttled time, more info than just saturating to 0
+            }
+        };
+
+        let elapsed = elapsed.as_secs_f64();
+
         self.global_latency_histo.observe(elapsed);
         if let Some(per_timeline_getpage_histo) = &self.per_timeline_latency_histo {
             per_timeline_getpage_histo.observe(elapsed);
@@ -1491,6 +1509,8 @@ impl SmgrQueryTimePerTimeline {
             global_latency_histo: self.global_latency[op as usize].clone(),
             per_timeline_latency_histo,
             start: started_at,
+            op,
+            throttled: Duration::ZERO,
         }
     }
 
@@ -3299,7 +3319,7 @@ pub(crate) mod tenant_throttling {
     use once_cell::sync::Lazy;
     use utils::shard::TenantShardId;
 
-    use crate::tenant::{self, throttle::Metric};
+    use crate::tenant::{self};
 
     struct GlobalAndPerTenantIntCounter {
         global: IntCounter,
@@ -3318,7 +3338,7 @@ pub(crate) mod tenant_throttling {
         }
     }
 
-    pub(crate) struct TimelineGet {
+    pub(crate) struct Metrics<const KIND: usize> {
         count_accounted_start: GlobalAndPerTenantIntCounter,
         count_accounted_finish: GlobalAndPerTenantIntCounter,
         wait_time: GlobalAndPerTenantIntCounter,
@@ -3391,40 +3411,41 @@ pub(crate) mod tenant_throttling {
         .unwrap()
     });
 
-    const KIND: &str = "timeline_get";
+    const KINDS: &[&str] = &["pagestream"];
+    pub type Pagestream = Metrics<0>;
 
-    impl TimelineGet {
+    impl<const KIND: usize> Metrics<KIND> {
         pub(crate) fn new(tenant_shard_id: &TenantShardId) -> Self {
             let per_tenant_label_values = &[
-                KIND,
+                KINDS[KIND],
                 &tenant_shard_id.tenant_id.to_string(),
                 &tenant_shard_id.shard_slug().to_string(),
             ];
-            TimelineGet {
+            Metrics {
                 count_accounted_start: {
                     GlobalAndPerTenantIntCounter {
-                        global: COUNT_ACCOUNTED_START.with_label_values(&[KIND]),
+                        global: COUNT_ACCOUNTED_START.with_label_values(&[KINDS[KIND]]),
                         per_tenant: COUNT_ACCOUNTED_START_PER_TENANT
                             .with_label_values(per_tenant_label_values),
                     }
                 },
                 count_accounted_finish: {
                     GlobalAndPerTenantIntCounter {
-                        global: COUNT_ACCOUNTED_FINISH.with_label_values(&[KIND]),
+                        global: COUNT_ACCOUNTED_FINISH.with_label_values(&[KINDS[KIND]]),
                         per_tenant: COUNT_ACCOUNTED_FINISH_PER_TENANT
                             .with_label_values(per_tenant_label_values),
                     }
                 },
                 wait_time: {
                     GlobalAndPerTenantIntCounter {
-                        global: WAIT_USECS.with_label_values(&[KIND]),
+                        global: WAIT_USECS.with_label_values(&[KINDS[KIND]]),
                         per_tenant: WAIT_USECS_PER_TENANT
                             .with_label_values(per_tenant_label_values),
                     }
                 },
                 count_throttled: {
                     GlobalAndPerTenantIntCounter {
-                        global: WAIT_COUNT.with_label_values(&[KIND]),
+                        global: WAIT_COUNT.with_label_values(&[KINDS[KIND]]),
                         per_tenant: WAIT_COUNT_PER_TENANT
                             .with_label_values(per_tenant_label_values),
                     }
@@ -3447,15 +3468,17 @@ pub(crate) mod tenant_throttling {
             &WAIT_USECS_PER_TENANT,
             &WAIT_COUNT_PER_TENANT,
         ] {
-            let _ = m.remove_label_values(&[
-                KIND,
-                &tenant_shard_id.tenant_id.to_string(),
-                &tenant_shard_id.shard_slug().to_string(),
-            ]);
+            for kind in KINDS {
+                let _ = m.remove_label_values(&[
+                    kind,
+                    &tenant_shard_id.tenant_id.to_string(),
+                    &tenant_shard_id.shard_slug().to_string(),
+                ]);
+            }
         }
     }
 
-    impl Metric for TimelineGet {
+    impl<const KIND: usize> tenant::throttle::Metric for Metrics<KIND> {
         #[inline(always)]
         fn accounting_start(&self) {
             self.count_accounted_start.inc();
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 64842aa5b8..7026df9527 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -574,6 +574,41 @@ enum BatchedFeMessage {
     },
 }
 
+impl BatchedFeMessage {
+    async fn throttle(&mut self, cancel: &CancellationToken) -> Result<(), QueryError> {
+        let (shard, tokens, timers) = match self {
+            BatchedFeMessage::Exists { shard, timer, .. }
+            | BatchedFeMessage::Nblocks { shard, timer, .. }
+            | BatchedFeMessage::DbSize { shard, timer, .. }
+            | BatchedFeMessage::GetSlruSegment { shard, timer, .. } => {
+                (
+                    shard,
+                    // 1 token is probably under-estimating because these
+                    // request handlers typically do several Timeline::get calls.
+                    1,
+                    itertools::Either::Left(std::iter::once(timer)),
+                )
+            }
+            BatchedFeMessage::GetPage { shard, pages, .. } => (
+                shard,
+                pages.len(),
+                itertools::Either::Right(pages.iter_mut().map(|(_, _, timer)| timer)),
+            ),
+            BatchedFeMessage::RespondError { .. } => return Ok(()),
+        };
+        let throttled = tokio::select! {
+            throttled = shard.pagestream_throttle.throttle(tokens) => { throttled }
+            _ = cancel.cancelled() => {
+                return Err(QueryError::Shutdown);
+            }
+        };
+        for timer in timers {
+            timer.deduct_throttle(&throttled);
+        }
+        Ok(())
+    }
+}
+
 impl PageServerHandler {
     pub fn new(
         tenant_manager: Arc<TenantManager>,
@@ -1157,13 +1192,18 @@ impl PageServerHandler {
                 Ok(msg) => msg,
                 Err(e) => break e,
             };
-            let msg = match msg {
+            let mut msg = match msg {
                 Some(msg) => msg,
                 None => {
                     debug!("pagestream subprotocol end observed");
                     return ((pgb_reader, timeline_handles), Ok(()));
                 }
             };
+
+            if let Err(cancelled) = msg.throttle(&self.cancel).await {
+                break cancelled;
+            }
+
             let err = self
                 .pagesteam_handle_batched_message(pgb_writer, msg, &cancel, ctx)
                 .await;
@@ -1321,12 +1361,13 @@ impl PageServerHandler {
                             return Ok(());
                         }
                     };
-                    let batch = match batch {
+                    let mut batch = match batch {
                         Ok(batch) => batch,
                         Err(e) => {
                             return Err(e);
                         }
                     };
+                    batch.throttle(&self.cancel).await?;
                     self.pagesteam_handle_batched_message(pgb_writer, batch, &cancel, &ctx)
                         .await?;
                 }
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index cd0690bb1a..ada5c4a977 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -357,8 +357,8 @@ pub struct Tenant {
 
     /// Throttle applied at the top of [`Timeline::get`].
     /// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance.
-    pub(crate) timeline_get_throttle:
-        Arc<throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,
+    pub(crate) pagestream_throttle:
+        Arc<throttle::Throttle<crate::metrics::tenant_throttling::Pagestream>>,
 
     /// An ongoing timeline detach concurrency limiter.
     ///
@@ -1678,7 +1678,7 @@ impl Tenant {
                     remote_metadata,
                     TimelineResources {
                         remote_client,
-                        timeline_get_throttle: self.timeline_get_throttle.clone(),
+                        pagestream_throttle: self.pagestream_throttle.clone(),
                         l0_flush_global_state: self.l0_flush_global_state.clone(),
                     },
                     LoadTimelineCause::Attach,
@@ -3835,7 +3835,7 @@ impl Tenant {
         }
     }
 
-    fn get_timeline_get_throttle_config(
+    fn get_pagestream_throttle_config(
         psconf: &'static PageServerConf,
         overrides: &TenantConfOpt,
     ) -> throttle::Config {
@@ -3846,8 +3846,8 @@ impl Tenant {
     }
 
     pub(crate) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) {
-        let conf = Self::get_timeline_get_throttle_config(self.conf, new_conf);
-        self.timeline_get_throttle.reconfigure(conf)
+        let conf = Self::get_pagestream_throttle_config(self.conf, new_conf);
+        self.pagestream_throttle.reconfigure(conf)
     }
 
     /// Helper function to create a new Timeline struct.
@@ -4009,9 +4009,9 @@ impl Tenant {
             attach_wal_lag_cooldown: Arc::new(std::sync::OnceLock::new()),
             cancel: CancellationToken::default(),
             gate: Gate::default(),
-            timeline_get_throttle: Arc::new(throttle::Throttle::new(
-                Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf),
-                crate::metrics::tenant_throttling::TimelineGet::new(&tenant_shard_id),
+            pagestream_throttle: Arc::new(throttle::Throttle::new(
+                Tenant::get_pagestream_throttle_config(conf, &attached_conf.tenant_conf),
+                crate::metrics::tenant_throttling::Metrics::new(&tenant_shard_id),
             )),
             tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
             ongoing_timeline_detach: std::sync::Mutex::default(),
@@ -4909,7 +4909,7 @@ impl Tenant {
     fn build_timeline_resources(&self, timeline_id: TimelineId) -> TimelineResources {
         TimelineResources {
             remote_client: self.build_timeline_remote_client(timeline_id),
-            timeline_get_throttle: self.timeline_get_throttle.clone(),
+            pagestream_throttle: self.pagestream_throttle.clone(),
             l0_flush_global_state: self.l0_flush_global_state.clone(),
         }
     }
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index 16dac10dca..0118a5ce5f 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -471,14 +471,14 @@ async fn ingest_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken
 
             // TODO: rename the background loop kind to something more generic, like, tenant housekeeping.
             // Or just spawn another background loop for this throttle, it's not like it's super costly.
-            info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| {
+            info_span!(parent: None, "pagestream_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| {
                 let now = Instant::now();
                 let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now);
-                let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.timeline_get_throttle.reset_stats();
+                let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.pagestream_throttle.reset_stats();
                 if count_throttled == 0 {
                     return;
                 }
-                let allowed_rps = tenant.timeline_get_throttle.steady_rps();
+                let allowed_rps = tenant.pagestream_throttle.steady_rps();
                 let delta = now - prev;
                 info!(
                     n_seconds=%format_args!("{:.3}", delta.as_secs_f64()),
diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs
index 7c4de55a47..54c0e59daa 100644
--- a/pageserver/src/tenant/throttle.rs
+++ b/pageserver/src/tenant/throttle.rs
@@ -1,5 +1,4 @@
 use std::{
-    str::FromStr,
     sync::{
         atomic::{AtomicU64, Ordering},
         Arc,
@@ -8,12 +7,8 @@ use std::{
 };
 
 use arc_swap::ArcSwap;
-use enumset::EnumSet;
-use tracing::error;
 use utils::leaky_bucket::{LeakyBucketConfig, RateLimiter};
 
-use crate::{context::RequestContext, task_mgr::TaskKind};
-
 /// Throttle for `async` functions.
 ///
 /// Runtime reconfigurable.
@@ -35,7 +30,7 @@ pub struct Throttle<M: Metric> {
 }
 
 pub struct Inner {
-    task_kinds: EnumSet<TaskKind>,
+    enabled: bool,
     rate_limiter: Arc<RateLimiter>,
 }
 
@@ -79,26 +74,12 @@ where
     }
     fn new_inner(config: Config) -> Inner {
         let Config {
-            task_kinds,
+            enabled,
             initial,
             refill_interval,
             refill_amount,
             max,
         } = config;
-        let task_kinds: EnumSet<TaskKind> = task_kinds
-            .iter()
-            .filter_map(|s| match TaskKind::from_str(s) {
-                Ok(v) => Some(v),
-                Err(e) => {
-                    // TODO: avoid this failure mode
-                    error!(
-                        "cannot parse task kind, ignoring for rate limiting {}",
-                        utils::error::report_compact_sources(&e)
-                    );
-                    None
-                }
-            })
-            .collect();
 
         // steady rate, we expect `refill_amount` requests per `refill_interval`.
         // dividing gives us the rps.
@@ -112,7 +93,7 @@ where
         let rate_limiter = RateLimiter::with_initial_tokens(config, f64::from(initial_tokens));
 
         Inner {
-            task_kinds,
+            enabled: enabled.is_enabled(),
             rate_limiter: Arc::new(rate_limiter),
         }
     }
@@ -141,11 +122,13 @@ where
         self.inner.load().rate_limiter.steady_rps()
     }
 
-    pub async fn throttle(&self, ctx: &RequestContext, key_count: usize) -> Option<Duration> {
+    pub async fn throttle(&self, key_count: usize) -> Option<Duration> {
         let inner = self.inner.load_full(); // clones the `Inner` Arc
-        if !inner.task_kinds.contains(ctx.task_kind()) {
+
+        if !inner.enabled {
             return None;
-        };
+        }
+
         let start = std::time::Instant::now();
 
         self.metric.accounting_start();
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index dc3f823f20..1414bef0a5 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -208,8 +208,8 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
 /// The outward-facing resources required to build a Timeline
 pub struct TimelineResources {
     pub remote_client: RemoteTimelineClient,
-    pub timeline_get_throttle:
-        Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,
+    pub pagestream_throttle:
+        Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::Pagestream>>,
     pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
 }
 
@@ -411,9 +411,9 @@ pub struct Timeline {
     /// Timeline deletion will acquire both compaction and gc locks in whatever order.
     gc_lock: tokio::sync::Mutex<()>,
 
-    /// Cloned from [`super::Tenant::timeline_get_throttle`] on construction.
-    timeline_get_throttle:
-        Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,
+    /// Cloned from [`super::Tenant::pagestream_throttle`] on construction.
+    pub(crate) pagestream_throttle:
+        Arc<crate::tenant::throttle::Throttle<crate::metrics::tenant_throttling::Pagestream>>,
 
     /// Size estimator for aux file v2
     pub(crate) aux_file_size_estimator: AuxFileSizeEstimator,
@@ -949,7 +949,7 @@ impl Timeline {
     /// If a remote layer file is needed, it is downloaded as part of this
     /// call.
     ///
-    /// This method enforces [`Self::timeline_get_throttle`] internally.
+    /// This method enforces [`Self::pagestream_throttle`] internally.
     ///
     /// NOTE: It is considered an error to 'get' a key that doesn't exist. The
     /// abstraction above this needs to store suitable metadata to track what
@@ -977,8 +977,6 @@ impl Timeline {
         // page_service.
         debug_assert!(!self.shard_identity.is_key_disposable(&key));
 
-        self.timeline_get_throttle.throttle(ctx, 1).await;
-
         let keyspace = KeySpace {
             ranges: vec![key..key.next()],
         };
@@ -1058,14 +1056,6 @@ impl Timeline {
             .for_task_kind(ctx.task_kind())
             .map(|metric| (metric, Instant::now()));
 
-        // start counting after throttle so that throttle time
-        // is always less than observation time and we don't
-        // underflow when computing `ex_throttled` below.
-        let throttled = self
-            .timeline_get_throttle
-            .throttle(ctx, key_count as usize)
-            .await;
-
         let res = self
             .get_vectored_impl(
                 keyspace.clone(),
@@ -1077,23 +1067,7 @@ impl Timeline {
 
         if let Some((metric, start)) = start {
             let elapsed = start.elapsed();
-            let ex_throttled = if let Some(throttled) = throttled {
-                elapsed.checked_sub(throttled)
-            } else {
-                Some(elapsed)
-            };
-
-            if let Some(ex_throttled) = ex_throttled {
-                metric.observe(ex_throttled.as_secs_f64());
-            } else {
-                use utils::rate_limit::RateLimit;
-                static LOGGED: Lazy<Mutex<RateLimit>> =
-                    Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
-                let mut rate_limit = LOGGED.lock().unwrap();
-                rate_limit.call(|| {
-                    warn!("error deducting time spent throttled; this message is logged at a global rate limit");
-                });
-            }
+            metric.observe(elapsed.as_secs_f64());
         }
 
         res
@@ -1138,16 +1112,6 @@ impl Timeline {
             .for_task_kind(ctx.task_kind())
             .map(ScanLatencyOngoingRecording::start_recording);
 
-        // start counting after throttle so that throttle time
-        // is always less than observation time and we don't
-        // underflow when computing the `ex_throttled` value in
-        // `recording.observe(throttled)` below.
-        let throttled = self
-            .timeline_get_throttle
-            // assume scan = 1 quota for now until we find a better way to process this
-            .throttle(ctx, 1)
-            .await;
-
         let vectored_res = self
             .get_vectored_impl(
                 keyspace.clone(),
@@ -1158,7 +1122,7 @@ impl Timeline {
             .await;
 
         if let Some(recording) = start {
-            recording.observe(throttled);
+            recording.observe();
         }
 
         vectored_res
@@ -2374,7 +2338,7 @@ impl Timeline {
 
                 standby_horizon: AtomicLsn::new(0),
 
-                timeline_get_throttle: resources.timeline_get_throttle,
+                pagestream_throttle: resources.pagestream_throttle,
 
                 aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics),
 
diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs
index 67fc710c44..47a93b19d2 100644
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -298,7 +298,7 @@ impl DeleteTimelineFlow {
                 None, // Ancestor is not needed for deletion.
                 TimelineResources {
                     remote_client,
-                    timeline_get_throttle: tenant.timeline_get_throttle.clone(),
+                    pagestream_throttle: tenant.pagestream_throttle.clone(),
                     l0_flush_global_state: tenant.l0_flush_global_state.clone(),
                 },
                 // Important. We dont pass ancestor above because it can be missing.
diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py
index 62aec50a9e..6d0661f068 100644
--- a/test_runner/regress/test_pageserver_getpage_throttle.py
+++ b/test_runner/regress/test_pageserver_getpage_throttle.py
@@ -33,7 +33,9 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
         conf={
             "compaction_period": f"{compaction_period}s",
             "timeline_get_throttle": {
-                "task_kinds": ["PageRequestHandler"],
+                "task_kinds": [
+                    "PageRequestHandler"
+                ],  # any non-empty array will do here https://github.com/neondatabase/neon/pull/9962
                 "initial": 0,
                 "refill_interval": "100ms",
                 "refill_amount": int(rate_limit_rps / 10),
@@ -116,7 +118,6 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
         timeout=compaction_period,
     )
 
-    log.info("the smgr metric includes throttle time")
     smgr_query_seconds_post = ps_http.get_metric_value(smgr_metric_name, smgr_metrics_query)
     assert smgr_query_seconds_post is not None
     throttled_usecs_post = ps_http.get_metric_value(throttle_metric_name, throttle_metrics_query)
@@ -125,13 +126,14 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
     actual_throttled_usecs = throttled_usecs_post - throttled_usecs_pre
     actual_throttled_secs = actual_throttled_usecs / 1_000_000
 
+    log.info("validate that the metric doesn't include throttle wait time")
     assert (
-        pytest.approx(duration_secs, 0.1) == actual_smgr_query_seconds
-    ), "smgr metrics include throttle wait time"
-    smgr_ex_throttle = actual_smgr_query_seconds - actual_throttled_secs
-    assert smgr_ex_throttle > 0
+        duration_secs >= 10 * actual_smgr_query_seconds
+    ), "smgr metrics should not include throttle wait time"
+
+    log.info("validate that the throttling wait time metrics is correct")
     assert (
-        duration_secs > 10 * smgr_ex_throttle
+        pytest.approx(actual_throttled_secs + actual_smgr_query_seconds, 0.1) == duration_secs
     ), "most of the time in this test is spent throttled because the rate-limit's contribution to latency dominates"
 
 

From 71d004289c0e9b62a3be96939a8b5defa8f98065 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 3 Dec 2024 16:55:00 +0000
Subject: [PATCH 040/117] storcon: in shard splits, inherit parent's AZ (#9946)

## Problem

Sharded tenants should be run in a single AZ for best performance, so
that computes have AZ-local latency to all the shards.

Part of https://github.com/neondatabase/neon/issues/8264

## Summary of changes

- When we split a tenant, instead of updating each shard's preferred AZ
to wherever it is scheduled, propagate the preferred AZ from the parent.
- Drop the check in `test_shard_preferred_azs` that asserts shards end
up in their preferred AZ: this will not be true again until the
optimize_attachment logic is updated to make this so. The existing check
wasn't testing anything about scheduling, it was just asserting that we
set preferred AZ in a way that matches the way things happen to be
scheduled at time of split.
---
 storage_controller/src/service.rs             | 68 ++++++-------------
 .../regress/test_storage_controller.py        |  6 +-
 2 files changed, 24 insertions(+), 50 deletions(-)

diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 52c9c4710d..741d3dc2b4 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -44,12 +44,12 @@ use futures::{stream::FuturesUnordered, StreamExt};
 use itertools::Itertools;
 use pageserver_api::{
     controller_api::{
-        MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest,
-        NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, ShardSchedulingPolicy,
-        ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, TenantCreateRequest,
-        TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse,
-        TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest,
-        TenantShardMigrateRequest, TenantShardMigrateResponse,
+        AvailabilityZone, MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability,
+        NodeRegisterRequest, NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy,
+        ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse,
+        TenantCreateRequest, TenantCreateResponse, TenantCreateResponseShard,
+        TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse,
+        TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse,
     },
     models::{
         SecondaryProgress, TenantConfigRequest, TimelineArchivalConfigRequest,
@@ -468,6 +468,7 @@ struct ShardSplitParams {
     policy: PlacementPolicy,
     config: TenantConfig,
     shard_ident: ShardIdentity,
+    preferred_az_id: Option<AvailabilityZone>,
 }
 
 // When preparing for a shard split, we may either choose to proceed with the split,
@@ -4103,7 +4104,7 @@ impl Service {
             for parent_id in parent_ids {
                 let child_ids = parent_id.split(new_shard_count);
 
-                let (pageserver, generation, policy, parent_ident, config) = {
+                let (pageserver, generation, policy, parent_ident, config, preferred_az) = {
                     let mut old_state = tenants
                         .remove(&parent_id)
                         .expect("It was present, we just split it");
@@ -4122,6 +4123,7 @@ impl Service {
                         old_state.policy.clone(),
                         old_state.shard,
                         old_state.config.clone(),
+                        old_state.preferred_az().cloned(),
                     )
                 };
 
@@ -4154,6 +4156,9 @@ impl Service {
                     };
                     child_state.generation = Some(generation);
                     child_state.config = config.clone();
+                    if let Some(preferred_az) = &preferred_az {
+                        child_state.set_preferred_az(preferred_az.clone());
+                    }
 
                     // The child's TenantShard::splitting is intentionally left at the default value of Idle,
                     // as at this point in the split process we have succeeded and this part is infallible:
@@ -4346,6 +4351,7 @@ impl Service {
         let mut policy = None;
         let mut config = None;
         let mut shard_ident = None;
+        let mut preferred_az_id = None;
         // Validate input, and calculate which shards we will create
         let (old_shard_count, targets) =
             {
@@ -4404,6 +4410,9 @@ impl Service {
                     if config.is_none() {
                         config = Some(shard.config.clone());
                     }
+                    if preferred_az_id.is_none() {
+                        preferred_az_id = shard.preferred_az().cloned();
+                    }
 
                     if tenant_shard_id.shard_count.count() == split_req.new_shard_count {
                         tracing::info!(
@@ -4474,6 +4483,7 @@ impl Service {
             policy,
             config,
             shard_ident,
+            preferred_az_id,
         })))
     }
 
@@ -4496,6 +4506,7 @@ impl Service {
             policy,
             config,
             shard_ident,
+            preferred_az_id,
         } = *params;
 
         // Drop any secondary locations: pageservers do not support splitting these, and in any case the
@@ -4569,7 +4580,7 @@ impl Service {
                     // Scheduling policies and preferred AZ do not carry through to children
                     scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
                         .unwrap(),
-                    preferred_az_id: None,
+                    preferred_az_id: preferred_az_id.as_ref().map(|az| az.0.clone()),
                 });
             }
 
@@ -4689,47 +4700,6 @@ impl Service {
         let (response, child_locations, waiters) =
             self.tenant_shard_split_commit_inmem(tenant_id, new_shard_count, new_stripe_size);
 
-        // Now that we have scheduled the child shards, attempt to set their preferred AZ
-        // to that of the pageserver they've been attached on.
-        let preferred_azs = {
-            let locked = self.inner.read().unwrap();
-            child_locations
-                .iter()
-                .filter_map(|(tid, node_id, _stripe_size)| {
-                    let az_id = locked
-                        .nodes
-                        .get(node_id)
-                        .map(|n| n.get_availability_zone_id().clone())?;
-
-                    Some((*tid, az_id))
-                })
-                .collect::<Vec<_>>()
-        };
-
-        let updated = self
-            .persistence
-            .set_tenant_shard_preferred_azs(preferred_azs)
-            .await
-            .map_err(|err| {
-                ApiError::InternalServerError(anyhow::anyhow!(
-                    "Failed to persist preferred az ids: {err}"
-                ))
-            });
-
-        match updated {
-            Ok(updated) => {
-                let mut locked = self.inner.write().unwrap();
-                for (tid, az_id) in updated {
-                    if let Some(shard) = locked.tenants.get_mut(&tid) {
-                        shard.set_preferred_az(az_id);
-                    }
-                }
-            }
-            Err(err) => {
-                tracing::warn!("Failed to persist preferred AZs after split: {err}");
-            }
-        }
-
         // Send compute notifications for all the new shards
         let mut failed_notifications = Vec::new();
         for (child_id, child_ps, stripe_size) in child_locations {
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 244893a616..f878116d53 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -3057,7 +3057,11 @@ def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder):
     for shard in shards:
         attached_to = shard["node_attached"]
         expected_az = env.get_pageserver(attached_to).az_id
-        assert shard["preferred_az_id"] == expected_az
+
+        # The scheduling optimization logic is not yet AZ-aware, so doesn't succeed
+        # in putting the tenant shards in the preferred AZ.
+        # To be fixed in https://github.com/neondatabase/neon/pull/9916
+        # assert shard["preferred_az_id"] == expected_az
 
 
 @run_only_on_default_postgres("Postgres version makes no difference here")

From dcb629532b075a68ba6a2aeeb3933e8ac73efbb9 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 3 Dec 2024 17:22:49 +0000
Subject: [PATCH 041/117] pageserver: only store SLRUs & aux files on shard
 zero (#9786)

## Problem

Since https://github.com/neondatabase/neon/pull/9423 the non-zero shards
no longer need SLRU content in order to do GC. This data is now
redundant on shards >0.

One release cycle after merging that PR, we may merge this one, which
also stops writing those pages to shards > 0, reaping the efficiency
benefit.

Closes: https://github.com/neondatabase/neon/issues/7512
Closes: https://github.com/neondatabase/neon/issues/9641

## Summary of changes

- Avoid storing SLRUs on non-zero shards
- Bonus: avoid storing aux files on non-zero shards
---
 libs/pageserver_api/src/key.rs                |  5 ++
 libs/pageserver_api/src/shard.rs              | 34 +++++++++---
 libs/wal_decoder/src/decoder.rs               | 54 ++++++++++--------
 pageserver/src/import_datadir.rs              | 18 ++++--
 pageserver/src/pgdatadir_mapping.rs           | 55 ++++++++++++-------
 .../src/tenant/timeline/import_pgdata/flow.rs | 49 +++++++----------
 pageserver/src/walingest.rs                   |  4 ++
 7 files changed, 134 insertions(+), 85 deletions(-)

diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs
index 523d143381..37dff6fe46 100644
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -770,6 +770,11 @@ impl Key {
             && self.field6 == 1
     }
 
+    #[inline(always)]
+    pub fn is_aux_file_key(&self) -> bool {
+        self.field1 == AUX_KEY_PREFIX
+    }
+
     /// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`.
     #[inline(always)]
     pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> {
diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs
index e83cf4c855..a5c94a82c1 100644
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -170,19 +170,37 @@ impl ShardIdentity {
         }
     }
 
+    /// Return true if the key should be stored on all shards, not just one.
+    fn is_key_global(&self, key: &Key) -> bool {
+        if key.is_slru_block_key() || key.is_slru_segment_size_key() || key.is_aux_file_key() {
+            // Special keys that are only stored on shard 0
+            false
+        } else if key.is_rel_block_key() {
+            // Ordinary relation blocks are distributed across shards
+            false
+        } else if key.is_rel_size_key() {
+            // All shards maintain rel size keys (although only shard 0 is responsible for
+            // keeping it strictly accurate, other shards just reflect the highest block they've ingested)
+            true
+        } else {
+            // For everything else, we assume it must be kept everywhere, because ingest code
+            // might assume this -- this covers functionality where the ingest code has
+            // not (yet) been made fully shard aware.
+            true
+        }
+    }
+
     /// Return true if the key should be discarded if found in this shard's
     /// data store, e.g. during compaction after a split.
     ///
     /// Shards _may_ drop keys which return false here, but are not obliged to.
     pub fn is_key_disposable(&self, key: &Key) -> bool {
-        if key_is_shard0(key) {
-            // Q: Why can't we dispose of shard0 content if we're not shard 0?
-            // A1: because the WAL ingestion logic currently ingests some shard 0
-            //     content on all shards, even though it's only read on shard 0.  If we
-            //     dropped it, then subsequent WAL ingest to these keys would encounter
-            //     an error.
-            // A2: because key_is_shard0 also covers relation size keys, which are written
-            //     on all shards even though they're only maintained accurately on shard 0.
+        if self.count < ShardCount(2) {
+            // Fast path: unsharded tenant doesn't dispose of anything
+            return false;
+        }
+
+        if self.is_key_global(key) {
             false
         } else {
             !self.is_key_local(key)
diff --git a/libs/wal_decoder/src/decoder.rs b/libs/wal_decoder/src/decoder.rs
index 36c4b19266..aa50c62911 100644
--- a/libs/wal_decoder/src/decoder.rs
+++ b/libs/wal_decoder/src/decoder.rs
@@ -112,30 +112,38 @@ impl MetadataRecord {
         };
 
         // Next, filter the metadata record by shard.
-
-        // Route VM page updates to the shards that own them. VM pages are stored in the VM fork
-        // of the main relation. These are sharded and managed just like regular relation pages.
-        // See: https://github.com/neondatabase/neon/issues/9855
-        if let Some(
-            MetadataRecord::Heapam(HeapamRecord::ClearVmBits(ref mut clear_vm_bits))
-            | MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(ref mut clear_vm_bits)),
-        ) = metadata_record
-        {
-            let is_local_vm_page = |heap_blk| {
-                let vm_blk = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blk);
-                shard.is_key_local(&rel_block_to_key(clear_vm_bits.vm_rel, vm_blk))
-            };
-            // Send the old and new VM page updates to their respective shards.
-            clear_vm_bits.old_heap_blkno = clear_vm_bits
-                .old_heap_blkno
-                .filter(|&blkno| is_local_vm_page(blkno));
-            clear_vm_bits.new_heap_blkno = clear_vm_bits
-                .new_heap_blkno
-                .filter(|&blkno| is_local_vm_page(blkno));
-            // If neither VM page belongs to this shard, discard the record.
-            if clear_vm_bits.old_heap_blkno.is_none() && clear_vm_bits.new_heap_blkno.is_none() {
-                metadata_record = None
+        match metadata_record {
+            Some(
+                MetadataRecord::Heapam(HeapamRecord::ClearVmBits(ref mut clear_vm_bits))
+                | MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(ref mut clear_vm_bits)),
+            ) => {
+                // Route VM page updates to the shards that own them. VM pages are stored in the VM fork
+                // of the main relation. These are sharded and managed just like regular relation pages.
+                // See: https://github.com/neondatabase/neon/issues/9855
+                let is_local_vm_page = |heap_blk| {
+                    let vm_blk = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blk);
+                    shard.is_key_local(&rel_block_to_key(clear_vm_bits.vm_rel, vm_blk))
+                };
+                // Send the old and new VM page updates to their respective shards.
+                clear_vm_bits.old_heap_blkno = clear_vm_bits
+                    .old_heap_blkno
+                    .filter(|&blkno| is_local_vm_page(blkno));
+                clear_vm_bits.new_heap_blkno = clear_vm_bits
+                    .new_heap_blkno
+                    .filter(|&blkno| is_local_vm_page(blkno));
+                // If neither VM page belongs to this shard, discard the record.
+                if clear_vm_bits.old_heap_blkno.is_none() && clear_vm_bits.new_heap_blkno.is_none()
+                {
+                    metadata_record = None
+                }
             }
+            Some(MetadataRecord::LogicalMessage(LogicalMessageRecord::Put(_))) => {
+                // Filter LogicalMessage records (AUX files) to only be stored on shard zero
+                if !shard.is_shard_zero() {
+                    metadata_record = None;
+                }
+            }
+            _ => {}
         }
 
         Ok(metadata_record)
diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs
index 06c4553e1c..c061714010 100644
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -575,18 +575,24 @@ async fn import_file(
     } else if file_path.starts_with("pg_xact") {
         let slru = SlruKind::Clog;
 
-        import_slru(modification, slru, file_path, reader, len, ctx).await?;
-        debug!("imported clog slru");
+        if modification.tline.tenant_shard_id.is_shard_zero() {
+            import_slru(modification, slru, file_path, reader, len, ctx).await?;
+            debug!("imported clog slru");
+        }
     } else if file_path.starts_with("pg_multixact/offsets") {
         let slru = SlruKind::MultiXactOffsets;
 
-        import_slru(modification, slru, file_path, reader, len, ctx).await?;
-        debug!("imported multixact offsets slru");
+        if modification.tline.tenant_shard_id.is_shard_zero() {
+            import_slru(modification, slru, file_path, reader, len, ctx).await?;
+            debug!("imported multixact offsets slru");
+        }
     } else if file_path.starts_with("pg_multixact/members") {
         let slru = SlruKind::MultiXactMembers;
 
-        import_slru(modification, slru, file_path, reader, len, ctx).await?;
-        debug!("imported multixact members slru");
+        if modification.tline.tenant_shard_id.is_shard_zero() {
+            import_slru(modification, slru, file_path, reader, len, ctx).await?;
+            debug!("imported multixact members slru");
+        }
     } else if file_path.starts_with("pg_twophase") {
         let bytes = read_all_bytes(reader).await?;
 
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index a00ec761e2..255bd01e25 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -530,6 +530,7 @@ impl Timeline {
         lsn: Lsn,
         ctx: &RequestContext,
     ) -> Result<Bytes, PageReconstructError> {
+        assert!(self.tenant_shard_id.is_shard_zero());
         let n_blocks = self
             .get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx)
             .await?;
@@ -552,6 +553,7 @@ impl Timeline {
         lsn: Lsn,
         ctx: &RequestContext,
     ) -> Result<Bytes, PageReconstructError> {
+        assert!(self.tenant_shard_id.is_shard_zero());
         let key = slru_block_to_key(kind, segno, blknum);
         self.get(key, lsn, ctx).await
     }
@@ -564,6 +566,7 @@ impl Timeline {
         version: Version<'_>,
         ctx: &RequestContext,
     ) -> Result<BlockNumber, PageReconstructError> {
+        assert!(self.tenant_shard_id.is_shard_zero());
         let key = slru_segment_size_to_key(kind, segno);
         let mut buf = version.get(self, key, ctx).await?;
         Ok(buf.get_u32_le())
@@ -577,6 +580,7 @@ impl Timeline {
         version: Version<'_>,
         ctx: &RequestContext,
     ) -> Result<bool, PageReconstructError> {
+        assert!(self.tenant_shard_id.is_shard_zero());
         // fetch directory listing
         let key = slru_dir_to_key(kind);
         let buf = version.get(self, key, ctx).await?;
@@ -1047,26 +1051,28 @@ impl Timeline {
         }
 
         // Iterate SLRUs next
-        for kind in [
-            SlruKind::Clog,
-            SlruKind::MultiXactMembers,
-            SlruKind::MultiXactOffsets,
-        ] {
-            let slrudir_key = slru_dir_to_key(kind);
-            result.add_key(slrudir_key);
-            let buf = self.get(slrudir_key, lsn, ctx).await?;
-            let dir = SlruSegmentDirectory::des(&buf)?;
-            let mut segments: Vec<u32> = dir.segments.iter().cloned().collect();
-            segments.sort_unstable();
-            for segno in segments {
-                let segsize_key = slru_segment_size_to_key(kind, segno);
-                let mut buf = self.get(segsize_key, lsn, ctx).await?;
-                let segsize = buf.get_u32_le();
+        if self.tenant_shard_id.is_shard_zero() {
+            for kind in [
+                SlruKind::Clog,
+                SlruKind::MultiXactMembers,
+                SlruKind::MultiXactOffsets,
+            ] {
+                let slrudir_key = slru_dir_to_key(kind);
+                result.add_key(slrudir_key);
+                let buf = self.get(slrudir_key, lsn, ctx).await?;
+                let dir = SlruSegmentDirectory::des(&buf)?;
+                let mut segments: Vec<u32> = dir.segments.iter().cloned().collect();
+                segments.sort_unstable();
+                for segno in segments {
+                    let segsize_key = slru_segment_size_to_key(kind, segno);
+                    let mut buf = self.get(segsize_key, lsn, ctx).await?;
+                    let segsize = buf.get_u32_le();
 
-                result.add_range(
-                    slru_block_to_key(kind, segno, 0)..slru_block_to_key(kind, segno, segsize),
-                );
-                result.add_key(segsize_key);
+                    result.add_range(
+                        slru_block_to_key(kind, segno, 0)..slru_block_to_key(kind, segno, segsize),
+                    );
+                    result.add_key(segsize_key);
+                }
             }
         }
 
@@ -1468,6 +1474,10 @@ impl<'a> DatadirModification<'a> {
         blknum: BlockNumber,
         rec: NeonWalRecord,
     ) -> anyhow::Result<()> {
+        if !self.tline.tenant_shard_id.is_shard_zero() {
+            return Ok(());
+        }
+
         self.put(
             slru_block_to_key(kind, segno, blknum),
             Value::WalRecord(rec),
@@ -1501,6 +1511,8 @@ impl<'a> DatadirModification<'a> {
         blknum: BlockNumber,
         img: Bytes,
     ) -> anyhow::Result<()> {
+        assert!(self.tline.tenant_shard_id.is_shard_zero());
+
         let key = slru_block_to_key(kind, segno, blknum);
         if !key.is_valid_key_on_write_path() {
             anyhow::bail!(
@@ -1542,6 +1554,7 @@ impl<'a> DatadirModification<'a> {
         segno: u32,
         blknum: BlockNumber,
     ) -> anyhow::Result<()> {
+        assert!(self.tline.tenant_shard_id.is_shard_zero());
         let key = slru_block_to_key(kind, segno, blknum);
         if !key.is_valid_key_on_write_path() {
             anyhow::bail!(
@@ -1853,6 +1866,8 @@ impl<'a> DatadirModification<'a> {
         nblocks: BlockNumber,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
+        assert!(self.tline.tenant_shard_id.is_shard_zero());
+
         // Add it to the directory entry
         let dir_key = slru_dir_to_key(kind);
         let buf = self.get(dir_key, ctx).await?;
@@ -1885,6 +1900,8 @@ impl<'a> DatadirModification<'a> {
         segno: u32,
         nblocks: BlockNumber,
     ) -> anyhow::Result<()> {
+        assert!(self.tline.tenant_shard_id.is_shard_zero());
+
         // Put size
         let size_key = slru_segment_size_to_key(kind, segno);
         let buf = nblocks.to_le_bytes();
diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs
index cbd4168c06..4388072606 100644
--- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs
@@ -129,22 +129,23 @@ impl Flow {
         }
 
         // Import SLRUs
-
-        // pg_xact (01:00 keyspace)
-        self.import_slru(SlruKind::Clog, &self.storage.pgdata().join("pg_xact"))
+        if self.timeline.tenant_shard_id.is_shard_zero() {
+            // pg_xact (01:00 keyspace)
+            self.import_slru(SlruKind::Clog, &self.storage.pgdata().join("pg_xact"))
+                .await?;
+            // pg_multixact/members (01:01 keyspace)
+            self.import_slru(
+                SlruKind::MultiXactMembers,
+                &self.storage.pgdata().join("pg_multixact/members"),
+            )
             .await?;
-        // pg_multixact/members (01:01 keyspace)
-        self.import_slru(
-            SlruKind::MultiXactMembers,
-            &self.storage.pgdata().join("pg_multixact/members"),
-        )
-        .await?;
-        // pg_multixact/offsets (01:02 keyspace)
-        self.import_slru(
-            SlruKind::MultiXactOffsets,
-            &self.storage.pgdata().join("pg_multixact/offsets"),
-        )
-        .await?;
+            // pg_multixact/offsets (01:02 keyspace)
+            self.import_slru(
+                SlruKind::MultiXactOffsets,
+                &self.storage.pgdata().join("pg_multixact/offsets"),
+            )
+            .await?;
+        }
 
         // Import pg_twophase.
         // TODO: as empty
@@ -302,6 +303,8 @@ impl Flow {
     }
 
     async fn import_slru(&mut self, kind: SlruKind, path: &RemotePath) -> anyhow::Result<()> {
+        assert!(self.timeline.tenant_shard_id.is_shard_zero());
+
         let segments = self.storage.listfilesindir(path).await?;
         let segments: Vec<(String, u32, usize)> = segments
             .into_iter()
@@ -337,7 +340,6 @@ impl Flow {
             debug!(%p, segno=%segno, %size, %start_key, %end_key, "scheduling SLRU segment");
             self.tasks
                 .push(AnyImportTask::SlruBlocks(ImportSlruBlocksTask::new(
-                    *self.timeline.get_shard_identity(),
                     start_key..end_key,
                     &p,
                     self.storage.clone(),
@@ -631,21 +633,14 @@ impl ImportTask for ImportRelBlocksTask {
 }
 
 struct ImportSlruBlocksTask {
-    shard_identity: ShardIdentity,
     key_range: Range<Key>,
     path: RemotePath,
     storage: RemoteStorageWrapper,
 }
 
 impl ImportSlruBlocksTask {
-    fn new(
-        shard_identity: ShardIdentity,
-        key_range: Range<Key>,
-        path: &RemotePath,
-        storage: RemoteStorageWrapper,
-    ) -> Self {
+    fn new(key_range: Range<Key>, path: &RemotePath, storage: RemoteStorageWrapper) -> Self {
         ImportSlruBlocksTask {
-            shard_identity,
             key_range,
             path: path.clone(),
             storage,
@@ -673,17 +668,13 @@ impl ImportTask for ImportSlruBlocksTask {
         let mut file_offset = 0;
         while blknum < end_blk {
             let key = slru_block_to_key(kind, segno, blknum);
-            assert!(
-                !self.shard_identity.is_key_disposable(&key),
-                "SLRU keys need to go into every shard"
-            );
             let buf = &buf[file_offset..(file_offset + 8192)];
             file_offset += 8192;
             layer_writer
                 .put_image(key, Bytes::copy_from_slice(buf), ctx)
                 .await?;
-            blknum += 1;
             nimages += 1;
+            blknum += 1;
         }
         Ok(nimages)
     }
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index d568da596a..93ae88936f 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -1392,6 +1392,10 @@ impl WalIngest {
         img: Bytes,
         ctx: &RequestContext,
     ) -> Result<()> {
+        if !self.shard.is_shard_zero() {
+            return Ok(());
+        }
+
         self.handle_slru_extend(modification, kind, segno, blknum, ctx)
             .await?;
         modification.put_slru_page_image(kind, segno, blknum, img)?;

From b04ab468ee830676fe431975a89b1ce3ec781bac Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Tue, 3 Dec 2024 18:36:37 +0000
Subject: [PATCH 042/117] pageserver: more detailed logs when calling re-attach
 (#9996)

## Problem

We saw a peculiar case where a pageserver apparently got a 0-tenant
response to `/re-attach` but we couldn't see the request landing on a
storage controller. It was hard to confirm retrospectively that the
pageserver was configured properly at the moment it sent the request.

## Summary of changes

- Log the URL to which we are sending the request
- Log the NodeId and metadata that we sent
---
 libs/pageserver_api/src/controller_api.rs  |  4 ++--
 pageserver/src/controller_upcall_client.rs | 12 +++++++++---
 pageserver/src/tenant/mgr.rs               |  2 +-
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs
index 0ea30ce54f..9a5ebc95bd 100644
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -48,7 +48,7 @@ pub struct TenantCreateResponse {
     pub shards: Vec<TenantCreateResponseShard>,
 }
 
-#[derive(Serialize, Deserialize)]
+#[derive(Serialize, Deserialize, Debug, Clone)]
 pub struct NodeRegisterRequest {
     pub node_id: NodeId,
 
@@ -75,7 +75,7 @@ pub struct TenantPolicyRequest {
     pub scheduling: Option<ShardSchedulingPolicy>,
 }
 
-#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
+#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash, Debug)]
 pub struct AvailabilityZone(pub String);
 
 impl Display for AvailabilityZone {
diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs
index 73fc6dc3ab..d41bfd9021 100644
--- a/pageserver/src/controller_upcall_client.rs
+++ b/pageserver/src/controller_upcall_client.rs
@@ -115,6 +115,10 @@ impl ControllerUpcallClient {
 
         Ok(res)
     }
+
+    pub(crate) fn base_url(&self) -> &Url {
+        &self.base_url
+    }
 }
 
 impl ControlPlaneGenerationsApi for ControllerUpcallClient {
@@ -191,13 +195,15 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient {
 
         let request = ReAttachRequest {
             node_id: self.node_id,
-            register,
+            register: register.clone(),
         };
 
         let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?;
         tracing::info!(
-            "Received re-attach response with {} tenants",
-            response.tenants.len()
+            "Received re-attach response with {} tenants (node {}, register: {:?})",
+            response.tenants.len(),
+            self.node_id,
+            register,
         );
 
         failpoint_support::sleep_millis_async!("control-plane-client-re-attach");
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index eb8191e43e..45481c4ed4 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -347,7 +347,7 @@ async fn init_load_generations(
         );
         emergency_generations(tenant_confs)
     } else if let Some(client) = ControllerUpcallClient::new(conf, cancel) {
-        info!("Calling control plane API to re-attach tenants");
+        info!("Calling {} API to re-attach tenants", client.base_url());
         // If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
         match client.re_attach(conf).await {
             Ok(tenants) => tenants

From 27a42d0f960c29b505b972841e0d79c1eab138fb Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Tue, 3 Dec 2024 18:39:23 +0000
Subject: [PATCH 043/117] chore(proxy): remove postgres config parser and md5
 support (#9990)

Keeping the `mock` postgres cplane adaptor using "stock" tokio-postgres
allows us to remove a lot of dead weight from our actual postgres
connection logic.
---
 Cargo.lock                                    |   2 +-
 libs/proxy/postgres-protocol2/Cargo.toml      |   1 -
 .../src/authentication/mod.rs                 |  35 --
 .../postgres-protocol2/src/message/backend.rs |   8 +-
 .../postgres-protocol2/src/password/mod.rs    |  18 -
 .../postgres-protocol2/src/password/test.rs   |   8 -
 libs/proxy/tokio-postgres2/src/config.rs      | 465 +-----------------
 libs/proxy/tokio-postgres2/src/connect_raw.rs |  19 +-
 libs/proxy/tokio-postgres2/src/error/mod.rs   |   6 -
 libs/proxy/tokio-postgres2/src/lib.rs         |  20 -
 proxy/Cargo.toml                              |   6 +-
 proxy/src/auth/backend/classic.rs             |   2 +-
 proxy/src/auth/backend/console_redirect.rs    |   2 +-
 proxy/src/auth/backend/mod.rs                 |   2 +-
 proxy/src/auth/flow.rs                        |   2 +-
 proxy/src/cancellation.rs                     |   6 +-
 proxy/src/compute.rs                          |  26 +-
 proxy/src/control_plane/client/mock.rs        |   3 +-
 proxy/src/control_plane/client/neon.rs        |   2 +-
 proxy/src/error.rs                            |   2 +-
 proxy/src/postgres_rustls/mod.rs              |   6 +-
 proxy/src/proxy/retry.rs                      |  16 +-
 proxy/src/proxy/tests/mitm.rs                 |  22 +-
 proxy/src/proxy/tests/mod.rs                  |  20 +-
 proxy/src/serverless/backend.rs               |  18 +-
 proxy/src/serverless/conn_pool.rs             |   6 +-
 proxy/src/serverless/conn_pool_lib.rs         |   4 +-
 proxy/src/serverless/json.rs                  |   6 +-
 proxy/src/serverless/local_conn_pool.rs       |  10 +-
 proxy/src/serverless/sql_over_http.rs         |  18 +-
 30 files changed, 96 insertions(+), 665 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index b2769e59f0..5b80ec5e93 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4209,7 +4209,6 @@ dependencies = [
  "bytes",
  "fallible-iterator",
  "hmac",
- "md-5",
  "memchr",
  "rand 0.8.5",
  "sha2",
@@ -4612,6 +4611,7 @@ dependencies = [
  "tikv-jemalloc-ctl",
  "tikv-jemallocator",
  "tokio",
+ "tokio-postgres",
  "tokio-postgres2",
  "tokio-rustls 0.26.0",
  "tokio-tungstenite",
diff --git a/libs/proxy/postgres-protocol2/Cargo.toml b/libs/proxy/postgres-protocol2/Cargo.toml
index 284a632954..f71c1599c7 100644
--- a/libs/proxy/postgres-protocol2/Cargo.toml
+++ b/libs/proxy/postgres-protocol2/Cargo.toml
@@ -10,7 +10,6 @@ byteorder.workspace = true
 bytes.workspace = true
 fallible-iterator.workspace = true
 hmac.workspace = true
-md-5 = "0.10"
 memchr = "2.0"
 rand.workspace = true
 sha2.workspace = true
diff --git a/libs/proxy/postgres-protocol2/src/authentication/mod.rs b/libs/proxy/postgres-protocol2/src/authentication/mod.rs
index 71afa4b9b6..0bdc177143 100644
--- a/libs/proxy/postgres-protocol2/src/authentication/mod.rs
+++ b/libs/proxy/postgres-protocol2/src/authentication/mod.rs
@@ -1,37 +1,2 @@
 //! Authentication protocol support.
-use md5::{Digest, Md5};
-
 pub mod sasl;
-
-/// Hashes authentication information in a way suitable for use in response
-/// to an `AuthenticationMd5Password` message.
-///
-/// The resulting string should be sent back to the database in a
-/// `PasswordMessage` message.
-#[inline]
-pub fn md5_hash(username: &[u8], password: &[u8], salt: [u8; 4]) -> String {
-    let mut md5 = Md5::new();
-    md5.update(password);
-    md5.update(username);
-    let output = md5.finalize_reset();
-    md5.update(format!("{:x}", output));
-    md5.update(salt);
-    format!("md5{:x}", md5.finalize())
-}
-
-#[cfg(test)]
-mod test {
-    use super::*;
-
-    #[test]
-    fn md5() {
-        let username = b"md5_user";
-        let password = b"password";
-        let salt = [0x2a, 0x3d, 0x8f, 0xe0];
-
-        assert_eq!(
-            md5_hash(username, password, salt),
-            "md562af4dd09bbb41884907a838a3233294"
-        );
-    }
-}
diff --git a/libs/proxy/postgres-protocol2/src/message/backend.rs b/libs/proxy/postgres-protocol2/src/message/backend.rs
index 33d77fc252..097964f9c1 100644
--- a/libs/proxy/postgres-protocol2/src/message/backend.rs
+++ b/libs/proxy/postgres-protocol2/src/message/backend.rs
@@ -79,7 +79,7 @@ pub enum Message {
     AuthenticationCleartextPassword,
     AuthenticationGss,
     AuthenticationKerberosV5,
-    AuthenticationMd5Password(AuthenticationMd5PasswordBody),
+    AuthenticationMd5Password,
     AuthenticationOk,
     AuthenticationScmCredential,
     AuthenticationSspi,
@@ -191,11 +191,7 @@ impl Message {
                 0 => Message::AuthenticationOk,
                 2 => Message::AuthenticationKerberosV5,
                 3 => Message::AuthenticationCleartextPassword,
-                5 => {
-                    let mut salt = [0; 4];
-                    buf.read_exact(&mut salt)?;
-                    Message::AuthenticationMd5Password(AuthenticationMd5PasswordBody { salt })
-                }
+                5 => Message::AuthenticationMd5Password,
                 6 => Message::AuthenticationScmCredential,
                 7 => Message::AuthenticationGss,
                 8 => Message::AuthenticationGssContinue,
diff --git a/libs/proxy/postgres-protocol2/src/password/mod.rs b/libs/proxy/postgres-protocol2/src/password/mod.rs
index e669e80f3f..38eb31dfcf 100644
--- a/libs/proxy/postgres-protocol2/src/password/mod.rs
+++ b/libs/proxy/postgres-protocol2/src/password/mod.rs
@@ -8,7 +8,6 @@
 
 use crate::authentication::sasl;
 use hmac::{Hmac, Mac};
-use md5::Md5;
 use rand::RngCore;
 use sha2::digest::FixedOutput;
 use sha2::{Digest, Sha256};
@@ -88,20 +87,3 @@ pub(crate) async fn scram_sha_256_salt(
         base64::encode(server_key)
     )
 }
-
-/// **Not recommended, as MD5 is not considered to be secure.**
-///
-/// Hash password using MD5 with the username as the salt.
-///
-/// The client may assume the returned string doesn't contain any
-/// special characters that would require escaping.
-pub fn md5(password: &[u8], username: &str) -> String {
-    // salt password with username
-    let mut salted_password = Vec::from(password);
-    salted_password.extend_from_slice(username.as_bytes());
-
-    let mut hash = Md5::new();
-    hash.update(&salted_password);
-    let digest = hash.finalize();
-    format!("md5{:x}", digest)
-}
diff --git a/libs/proxy/postgres-protocol2/src/password/test.rs b/libs/proxy/postgres-protocol2/src/password/test.rs
index c9d340f09d..0692c07adb 100644
--- a/libs/proxy/postgres-protocol2/src/password/test.rs
+++ b/libs/proxy/postgres-protocol2/src/password/test.rs
@@ -9,11 +9,3 @@ async fn test_encrypt_scram_sha_256() {
         "SCRAM-SHA-256$4096:AQIDBAUGBwgJCgsMDQ4PEA==$8rrDg00OqaiWXJ7p+sCgHEIaBSHY89ZJl3mfIsf32oY=:05L1f+yZbiN8O0AnO40Og85NNRhvzTS57naKRWCcsIA="
     );
 }
-
-#[test]
-fn test_encrypt_md5() {
-    assert_eq!(
-        password::md5(b"secret", "foo"),
-        "md54ab2c5d00339c4b2a4e921d2dc4edec7"
-    );
-}
diff --git a/libs/proxy/tokio-postgres2/src/config.rs b/libs/proxy/tokio-postgres2/src/config.rs
index 26124b38ef..5dad835c3b 100644
--- a/libs/proxy/tokio-postgres2/src/config.rs
+++ b/libs/proxy/tokio-postgres2/src/config.rs
@@ -6,11 +6,9 @@ use crate::connect_raw::RawConnection;
 use crate::tls::MakeTlsConnect;
 use crate::tls::TlsConnect;
 use crate::{Client, Connection, Error};
-use std::borrow::Cow;
+use std::fmt;
 use std::str;
-use std::str::FromStr;
 use std::time::Duration;
-use std::{error, fmt, iter, mem};
 use tokio::io::{AsyncRead, AsyncWrite};
 
 pub use postgres_protocol2::authentication::sasl::ScramKeys;
@@ -380,99 +378,6 @@ impl Config {
         self.max_backend_message_size
     }
 
-    fn param(&mut self, key: &str, value: &str) -> Result<(), Error> {
-        match key {
-            "user" => {
-                self.user(value);
-            }
-            "password" => {
-                self.password(value);
-            }
-            "dbname" => {
-                self.dbname(value);
-            }
-            "options" => {
-                self.options(value);
-            }
-            "application_name" => {
-                self.application_name(value);
-            }
-            "sslmode" => {
-                let mode = match value {
-                    "disable" => SslMode::Disable,
-                    "prefer" => SslMode::Prefer,
-                    "require" => SslMode::Require,
-                    _ => return Err(Error::config_parse(Box::new(InvalidValue("sslmode")))),
-                };
-                self.ssl_mode(mode);
-            }
-            "host" => {
-                for host in value.split(',') {
-                    self.host(host);
-                }
-            }
-            "port" => {
-                for port in value.split(',') {
-                    let port = if port.is_empty() {
-                        5432
-                    } else {
-                        port.parse()
-                            .map_err(|_| Error::config_parse(Box::new(InvalidValue("port"))))?
-                    };
-                    self.port(port);
-                }
-            }
-            "connect_timeout" => {
-                let timeout = value
-                    .parse::<i64>()
-                    .map_err(|_| Error::config_parse(Box::new(InvalidValue("connect_timeout"))))?;
-                if timeout > 0 {
-                    self.connect_timeout(Duration::from_secs(timeout as u64));
-                }
-            }
-            "target_session_attrs" => {
-                let target_session_attrs = match value {
-                    "any" => TargetSessionAttrs::Any,
-                    "read-write" => TargetSessionAttrs::ReadWrite,
-                    _ => {
-                        return Err(Error::config_parse(Box::new(InvalidValue(
-                            "target_session_attrs",
-                        ))));
-                    }
-                };
-                self.target_session_attrs(target_session_attrs);
-            }
-            "channel_binding" => {
-                let channel_binding = match value {
-                    "disable" => ChannelBinding::Disable,
-                    "prefer" => ChannelBinding::Prefer,
-                    "require" => ChannelBinding::Require,
-                    _ => {
-                        return Err(Error::config_parse(Box::new(InvalidValue(
-                            "channel_binding",
-                        ))))
-                    }
-                };
-                self.channel_binding(channel_binding);
-            }
-            "max_backend_message_size" => {
-                let limit = value.parse::<usize>().map_err(|_| {
-                    Error::config_parse(Box::new(InvalidValue("max_backend_message_size")))
-                })?;
-                if limit > 0 {
-                    self.max_backend_message_size(limit);
-                }
-            }
-            key => {
-                return Err(Error::config_parse(Box::new(UnknownOption(
-                    key.to_string(),
-                ))));
-            }
-        }
-
-        Ok(())
-    }
-
     /// Opens a connection to a PostgreSQL database.
     ///
     /// Requires the `runtime` Cargo feature (enabled by default).
@@ -499,17 +404,6 @@ impl Config {
     }
 }
 
-impl FromStr for Config {
-    type Err = Error;
-
-    fn from_str(s: &str) -> Result<Config, Error> {
-        match UrlParser::parse(s)? {
-            Some(config) => Ok(config),
-            None => Parser::parse(s),
-        }
-    }
-}
-
 // Omit password from debug output
 impl fmt::Debug for Config {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
@@ -536,360 +430,3 @@ impl fmt::Debug for Config {
             .finish()
     }
 }
-
-#[derive(Debug)]
-struct UnknownOption(String);
-
-impl fmt::Display for UnknownOption {
-    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(fmt, "unknown option `{}`", self.0)
-    }
-}
-
-impl error::Error for UnknownOption {}
-
-#[derive(Debug)]
-struct InvalidValue(&'static str);
-
-impl fmt::Display for InvalidValue {
-    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(fmt, "invalid value for option `{}`", self.0)
-    }
-}
-
-impl error::Error for InvalidValue {}
-
-struct Parser<'a> {
-    s: &'a str,
-    it: iter::Peekable<str::CharIndices<'a>>,
-}
-
-impl<'a> Parser<'a> {
-    fn parse(s: &'a str) -> Result<Config, Error> {
-        let mut parser = Parser {
-            s,
-            it: s.char_indices().peekable(),
-        };
-
-        let mut config = Config::new();
-
-        while let Some((key, value)) = parser.parameter()? {
-            config.param(key, &value)?;
-        }
-
-        Ok(config)
-    }
-
-    fn skip_ws(&mut self) {
-        self.take_while(char::is_whitespace);
-    }
-
-    fn take_while<F>(&mut self, f: F) -> &'a str
-    where
-        F: Fn(char) -> bool,
-    {
-        let start = match self.it.peek() {
-            Some(&(i, _)) => i,
-            None => return "",
-        };
-
-        loop {
-            match self.it.peek() {
-                Some(&(_, c)) if f(c) => {
-                    self.it.next();
-                }
-                Some(&(i, _)) => return &self.s[start..i],
-                None => return &self.s[start..],
-            }
-        }
-    }
-
-    fn eat(&mut self, target: char) -> Result<(), Error> {
-        match self.it.next() {
-            Some((_, c)) if c == target => Ok(()),
-            Some((i, c)) => {
-                let m = format!(
-                    "unexpected character at byte {}: expected `{}` but got `{}`",
-                    i, target, c
-                );
-                Err(Error::config_parse(m.into()))
-            }
-            None => Err(Error::config_parse("unexpected EOF".into())),
-        }
-    }
-
-    fn eat_if(&mut self, target: char) -> bool {
-        match self.it.peek() {
-            Some(&(_, c)) if c == target => {
-                self.it.next();
-                true
-            }
-            _ => false,
-        }
-    }
-
-    fn keyword(&mut self) -> Option<&'a str> {
-        let s = self.take_while(|c| match c {
-            c if c.is_whitespace() => false,
-            '=' => false,
-            _ => true,
-        });
-
-        if s.is_empty() {
-            None
-        } else {
-            Some(s)
-        }
-    }
-
-    fn value(&mut self) -> Result<String, Error> {
-        let value = if self.eat_if('\'') {
-            let value = self.quoted_value()?;
-            self.eat('\'')?;
-            value
-        } else {
-            self.simple_value()?
-        };
-
-        Ok(value)
-    }
-
-    fn simple_value(&mut self) -> Result<String, Error> {
-        let mut value = String::new();
-
-        while let Some(&(_, c)) = self.it.peek() {
-            if c.is_whitespace() {
-                break;
-            }
-
-            self.it.next();
-            if c == '\\' {
-                if let Some((_, c2)) = self.it.next() {
-                    value.push(c2);
-                }
-            } else {
-                value.push(c);
-            }
-        }
-
-        if value.is_empty() {
-            return Err(Error::config_parse("unexpected EOF".into()));
-        }
-
-        Ok(value)
-    }
-
-    fn quoted_value(&mut self) -> Result<String, Error> {
-        let mut value = String::new();
-
-        while let Some(&(_, c)) = self.it.peek() {
-            if c == '\'' {
-                return Ok(value);
-            }
-
-            self.it.next();
-            if c == '\\' {
-                if let Some((_, c2)) = self.it.next() {
-                    value.push(c2);
-                }
-            } else {
-                value.push(c);
-            }
-        }
-
-        Err(Error::config_parse(
-            "unterminated quoted connection parameter value".into(),
-        ))
-    }
-
-    fn parameter(&mut self) -> Result<Option<(&'a str, String)>, Error> {
-        self.skip_ws();
-        let keyword = match self.keyword() {
-            Some(keyword) => keyword,
-            None => return Ok(None),
-        };
-        self.skip_ws();
-        self.eat('=')?;
-        self.skip_ws();
-        let value = self.value()?;
-
-        Ok(Some((keyword, value)))
-    }
-}
-
-// This is a pretty sloppy "URL" parser, but it matches the behavior of libpq, where things really aren't very strict
-struct UrlParser<'a> {
-    s: &'a str,
-    config: Config,
-}
-
-impl<'a> UrlParser<'a> {
-    fn parse(s: &'a str) -> Result<Option<Config>, Error> {
-        let s = match Self::remove_url_prefix(s) {
-            Some(s) => s,
-            None => return Ok(None),
-        };
-
-        let mut parser = UrlParser {
-            s,
-            config: Config::new(),
-        };
-
-        parser.parse_credentials()?;
-        parser.parse_host()?;
-        parser.parse_path()?;
-        parser.parse_params()?;
-
-        Ok(Some(parser.config))
-    }
-
-    fn remove_url_prefix(s: &str) -> Option<&str> {
-        for prefix in &["postgres://", "postgresql://"] {
-            if let Some(stripped) = s.strip_prefix(prefix) {
-                return Some(stripped);
-            }
-        }
-
-        None
-    }
-
-    fn take_until(&mut self, end: &[char]) -> Option<&'a str> {
-        match self.s.find(end) {
-            Some(pos) => {
-                let (head, tail) = self.s.split_at(pos);
-                self.s = tail;
-                Some(head)
-            }
-            None => None,
-        }
-    }
-
-    fn take_all(&mut self) -> &'a str {
-        mem::take(&mut self.s)
-    }
-
-    fn eat_byte(&mut self) {
-        self.s = &self.s[1..];
-    }
-
-    fn parse_credentials(&mut self) -> Result<(), Error> {
-        let creds = match self.take_until(&['@']) {
-            Some(creds) => creds,
-            None => return Ok(()),
-        };
-        self.eat_byte();
-
-        let mut it = creds.splitn(2, ':');
-        let user = self.decode(it.next().unwrap())?;
-        self.config.user(&user);
-
-        if let Some(password) = it.next() {
-            let password = Cow::from(percent_encoding::percent_decode(password.as_bytes()));
-            self.config.password(password);
-        }
-
-        Ok(())
-    }
-
-    fn parse_host(&mut self) -> Result<(), Error> {
-        let host = match self.take_until(&['/', '?']) {
-            Some(host) => host,
-            None => self.take_all(),
-        };
-
-        if host.is_empty() {
-            return Ok(());
-        }
-
-        for chunk in host.split(',') {
-            let (host, port) = if chunk.starts_with('[') {
-                let idx = match chunk.find(']') {
-                    Some(idx) => idx,
-                    None => return Err(Error::config_parse(InvalidValue("host").into())),
-                };
-
-                let host = &chunk[1..idx];
-                let remaining = &chunk[idx + 1..];
-                let port = if let Some(port) = remaining.strip_prefix(':') {
-                    Some(port)
-                } else if remaining.is_empty() {
-                    None
-                } else {
-                    return Err(Error::config_parse(InvalidValue("host").into()));
-                };
-
-                (host, port)
-            } else {
-                let mut it = chunk.splitn(2, ':');
-                (it.next().unwrap(), it.next())
-            };
-
-            self.host_param(host)?;
-            let port = self.decode(port.unwrap_or("5432"))?;
-            self.config.param("port", &port)?;
-        }
-
-        Ok(())
-    }
-
-    fn parse_path(&mut self) -> Result<(), Error> {
-        if !self.s.starts_with('/') {
-            return Ok(());
-        }
-        self.eat_byte();
-
-        let dbname = match self.take_until(&['?']) {
-            Some(dbname) => dbname,
-            None => self.take_all(),
-        };
-
-        if !dbname.is_empty() {
-            self.config.dbname(&self.decode(dbname)?);
-        }
-
-        Ok(())
-    }
-
-    fn parse_params(&mut self) -> Result<(), Error> {
-        if !self.s.starts_with('?') {
-            return Ok(());
-        }
-        self.eat_byte();
-
-        while !self.s.is_empty() {
-            let key = match self.take_until(&['=']) {
-                Some(key) => self.decode(key)?,
-                None => return Err(Error::config_parse("unterminated parameter".into())),
-            };
-            self.eat_byte();
-
-            let value = match self.take_until(&['&']) {
-                Some(value) => {
-                    self.eat_byte();
-                    value
-                }
-                None => self.take_all(),
-            };
-
-            if key == "host" {
-                self.host_param(value)?;
-            } else {
-                let value = self.decode(value)?;
-                self.config.param(&key, &value)?;
-            }
-        }
-
-        Ok(())
-    }
-
-    fn host_param(&mut self, s: &str) -> Result<(), Error> {
-        let s = self.decode(s)?;
-        self.config.param("host", &s)
-    }
-
-    fn decode(&self, s: &'a str) -> Result<Cow<'a, str>, Error> {
-        percent_encoding::percent_decode(s.as_bytes())
-            .decode_utf8()
-            .map_err(|e| Error::config_parse(e.into()))
-    }
-}
diff --git a/libs/proxy/tokio-postgres2/src/connect_raw.rs b/libs/proxy/tokio-postgres2/src/connect_raw.rs
index 9c6f1a2552..390f133002 100644
--- a/libs/proxy/tokio-postgres2/src/connect_raw.rs
+++ b/libs/proxy/tokio-postgres2/src/connect_raw.rs
@@ -7,7 +7,6 @@ use crate::Error;
 use bytes::BytesMut;
 use fallible_iterator::FallibleIterator;
 use futures_util::{ready, Sink, SinkExt, Stream, TryStreamExt};
-use postgres_protocol2::authentication;
 use postgres_protocol2::authentication::sasl;
 use postgres_protocol2::authentication::sasl::ScramSha256;
 use postgres_protocol2::message::backend::{AuthenticationSaslBody, Message, NoticeResponseBody};
@@ -174,25 +173,11 @@ where
 
             authenticate_password(stream, pass).await?;
         }
-        Some(Message::AuthenticationMd5Password(body)) => {
-            can_skip_channel_binding(config)?;
-
-            let user = config
-                .user
-                .as_ref()
-                .ok_or_else(|| Error::config("user missing".into()))?;
-            let pass = config
-                .password
-                .as_ref()
-                .ok_or_else(|| Error::config("password missing".into()))?;
-
-            let output = authentication::md5_hash(user.as_bytes(), pass, body.salt());
-            authenticate_password(stream, output.as_bytes()).await?;
-        }
         Some(Message::AuthenticationSasl(body)) => {
             authenticate_sasl(stream, body, config).await?;
         }
-        Some(Message::AuthenticationKerberosV5)
+        Some(Message::AuthenticationMd5Password)
+        | Some(Message::AuthenticationKerberosV5)
         | Some(Message::AuthenticationScmCredential)
         | Some(Message::AuthenticationGss)
         | Some(Message::AuthenticationSspi) => {
diff --git a/libs/proxy/tokio-postgres2/src/error/mod.rs b/libs/proxy/tokio-postgres2/src/error/mod.rs
index 6514322250..922c348525 100644
--- a/libs/proxy/tokio-postgres2/src/error/mod.rs
+++ b/libs/proxy/tokio-postgres2/src/error/mod.rs
@@ -349,7 +349,6 @@ enum Kind {
     Parse,
     Encode,
     Authentication,
-    ConfigParse,
     Config,
     Connect,
     Timeout,
@@ -386,7 +385,6 @@ impl fmt::Display for Error {
             Kind::Parse => fmt.write_str("error parsing response from server")?,
             Kind::Encode => fmt.write_str("error encoding message to server")?,
             Kind::Authentication => fmt.write_str("authentication error")?,
-            Kind::ConfigParse => fmt.write_str("invalid connection string")?,
             Kind::Config => fmt.write_str("invalid configuration")?,
             Kind::Connect => fmt.write_str("error connecting to server")?,
             Kind::Timeout => fmt.write_str("timeout waiting for server")?,
@@ -482,10 +480,6 @@ impl Error {
         Error::new(Kind::Authentication, Some(e))
     }
 
-    pub(crate) fn config_parse(e: Box<dyn error::Error + Sync + Send>) -> Error {
-        Error::new(Kind::ConfigParse, Some(e))
-    }
-
     pub(crate) fn config(e: Box<dyn error::Error + Sync + Send>) -> Error {
         Error::new(Kind::Config, Some(e))
     }
diff --git a/libs/proxy/tokio-postgres2/src/lib.rs b/libs/proxy/tokio-postgres2/src/lib.rs
index 57c639a7de..901ed0c96c 100644
--- a/libs/proxy/tokio-postgres2/src/lib.rs
+++ b/libs/proxy/tokio-postgres2/src/lib.rs
@@ -13,14 +13,12 @@ pub use crate::query::RowStream;
 pub use crate::row::{Row, SimpleQueryRow};
 pub use crate::simple_query::SimpleQueryStream;
 pub use crate::statement::{Column, Statement};
-use crate::tls::MakeTlsConnect;
 pub use crate::tls::NoTls;
 pub use crate::to_statement::ToStatement;
 pub use crate::transaction::Transaction;
 pub use crate::transaction_builder::{IsolationLevel, TransactionBuilder};
 use crate::types::ToSql;
 use postgres_protocol2::message::backend::ReadyForQueryBody;
-use tokio::net::TcpStream;
 
 /// After executing a query, the connection will be in one of these states
 #[derive(Clone, Copy, Debug, PartialEq)]
@@ -72,24 +70,6 @@ mod transaction;
 mod transaction_builder;
 pub mod types;
 
-/// A convenience function which parses a connection string and connects to the database.
-///
-/// See the documentation for [`Config`] for details on the connection string format.
-///
-/// Requires the `runtime` Cargo feature (enabled by default).
-///
-/// [`Config`]: config/struct.Config.html
-pub async fn connect<T>(
-    config: &str,
-    tls: T,
-) -> Result<(Client, Connection<TcpStream, T::Stream>), Error>
-where
-    T: MakeTlsConnect<TcpStream>,
-{
-    let config = config.parse::<Config>()?;
-    config.connect(tls).await
-}
-
 /// An asynchronous notification.
 #[derive(Clone, Debug)]
 pub struct Notification {
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index f5934c8a89..2f63ee3acc 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -6,7 +6,7 @@ license.workspace = true
 
 [features]
 default = []
-testing = []
+testing = ["dep:tokio-postgres"]
 
 [dependencies]
 ahash.workspace = true
@@ -55,6 +55,7 @@ parquet.workspace = true
 parquet_derive.workspace = true
 pin-project-lite.workspace = true
 postgres_backend.workspace = true
+postgres-client = { package = "tokio-postgres2", path = "../libs/proxy/tokio-postgres2" }
 postgres-protocol = { package = "postgres-protocol2", path = "../libs/proxy/postgres-protocol2" }
 pq_proto.workspace = true
 prometheus.workspace = true
@@ -81,7 +82,7 @@ subtle.workspace = true
 thiserror.workspace = true
 tikv-jemallocator.workspace = true
 tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] }
-tokio-postgres = { package = "tokio-postgres2", path = "../libs/proxy/tokio-postgres2" }
+tokio-postgres = { workspace = true, optional = true }
 tokio-rustls.workspace = true
 tokio-util.workspace = true
 tokio = { workspace = true, features = ["signal"] }
@@ -119,3 +120,4 @@ rcgen.workspace = true
 rstest.workspace = true
 walkdir.workspace = true
 rand_distr = "0.4"
+tokio-postgres.workspace = true
diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs
index 491b272ac4..5e494dfdd6 100644
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -66,7 +66,7 @@ pub(super) async fn authenticate(
 
     Ok(ComputeCredentials {
         info: creds,
-        keys: ComputeCredentialKeys::AuthKeys(tokio_postgres::config::AuthKeys::ScramSha256(
+        keys: ComputeCredentialKeys::AuthKeys(postgres_client::config::AuthKeys::ScramSha256(
             scram_keys,
         )),
     })
diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs
index bf7a1cb070..494564de05 100644
--- a/proxy/src/auth/backend/console_redirect.rs
+++ b/proxy/src/auth/backend/console_redirect.rs
@@ -1,8 +1,8 @@
 use async_trait::async_trait;
+use postgres_client::config::SslMode;
 use pq_proto::BeMessage as Be;
 use thiserror::Error;
 use tokio::io::{AsyncRead, AsyncWrite};
-use tokio_postgres::config::SslMode;
 use tracing::{info, info_span};
 
 use super::ComputeCredentialKeys;
diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs
index 7e1b26a11a..84a572dcf9 100644
--- a/proxy/src/auth/backend/mod.rs
+++ b/proxy/src/auth/backend/mod.rs
@@ -11,8 +11,8 @@ pub use console_redirect::ConsoleRedirectBackend;
 pub(crate) use console_redirect::ConsoleRedirectError;
 use ipnet::{Ipv4Net, Ipv6Net};
 use local::LocalBackend;
+use postgres_client::config::AuthKeys;
 use tokio::io::{AsyncRead, AsyncWrite};
-use tokio_postgres::config::AuthKeys;
 use tracing::{debug, info, warn};
 
 use crate::auth::credentials::check_peer_addr_is_in_list;
diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs
index 9c6ce151cb..60d1962d7f 100644
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -227,7 +227,7 @@ pub(crate) async fn validate_password_and_exchange(
             };
 
             Ok(sasl::Outcome::Success(ComputeCredentialKeys::AuthKeys(
-                tokio_postgres::config::AuthKeys::ScramSha256(keys),
+                postgres_client::config::AuthKeys::ScramSha256(keys),
             )))
         }
     }
diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs
index 91e198bf88..bcb0ef40bd 100644
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -3,11 +3,11 @@ use std::sync::Arc;
 
 use dashmap::DashMap;
 use ipnet::{IpNet, Ipv4Net, Ipv6Net};
+use postgres_client::{CancelToken, NoTls};
 use pq_proto::CancelKeyData;
 use thiserror::Error;
 use tokio::net::TcpStream;
 use tokio::sync::Mutex;
-use tokio_postgres::{CancelToken, NoTls};
 use tracing::{debug, info};
 use uuid::Uuid;
 
@@ -44,7 +44,7 @@ pub(crate) enum CancelError {
     IO(#[from] std::io::Error),
 
     #[error("{0}")]
-    Postgres(#[from] tokio_postgres::Error),
+    Postgres(#[from] postgres_client::Error),
 
     #[error("rate limit exceeded")]
     RateLimit,
@@ -70,7 +70,7 @@ impl ReportableError for CancelError {
 impl<P: CancellationPublisher> CancellationHandler<P> {
     /// Run async action within an ephemeral session identified by [`CancelKeyData`].
     pub(crate) fn get_session(self: Arc<Self>) -> Session<P> {
-        // HACK: We'd rather get the real backend_pid but tokio_postgres doesn't
+        // HACK: We'd rather get the real backend_pid but postgres_client doesn't
         // expose it and we don't want to do another roundtrip to query
         // for it. The client will be able to notice that this is not the
         // actual backend_pid, but backend_pid is not used for anything
diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index b689b97a21..06bc71c559 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -6,6 +6,8 @@ use std::time::Duration;
 use futures::{FutureExt, TryFutureExt};
 use itertools::Itertools;
 use once_cell::sync::OnceCell;
+use postgres_client::tls::MakeTlsConnect;
+use postgres_client::{CancelToken, RawConnection};
 use postgres_protocol::message::backend::NoticeResponseBody;
 use pq_proto::StartupMessageParams;
 use rustls::client::danger::ServerCertVerifier;
@@ -13,8 +15,6 @@ use rustls::crypto::ring;
 use rustls::pki_types::InvalidDnsNameError;
 use thiserror::Error;
 use tokio::net::TcpStream;
-use tokio_postgres::tls::MakeTlsConnect;
-use tokio_postgres::{CancelToken, RawConnection};
 use tracing::{debug, error, info, warn};
 
 use crate::auth::parse_endpoint_param;
@@ -34,9 +34,9 @@ pub const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node";
 #[derive(Debug, Error)]
 pub(crate) enum ConnectionError {
     /// This error doesn't seem to reveal any secrets; for instance,
-    /// `tokio_postgres::error::Kind` doesn't contain ip addresses and such.
+    /// `postgres_client::error::Kind` doesn't contain ip addresses and such.
     #[error("{COULD_NOT_CONNECT}: {0}")]
-    Postgres(#[from] tokio_postgres::Error),
+    Postgres(#[from] postgres_client::Error),
 
     #[error("{COULD_NOT_CONNECT}: {0}")]
     CouldNotConnect(#[from] io::Error),
@@ -99,13 +99,13 @@ impl ReportableError for ConnectionError {
 }
 
 /// A pair of `ClientKey` & `ServerKey` for `SCRAM-SHA-256`.
-pub(crate) type ScramKeys = tokio_postgres::config::ScramKeys<32>;
+pub(crate) type ScramKeys = postgres_client::config::ScramKeys<32>;
 
 /// A config for establishing a connection to compute node.
-/// Eventually, `tokio_postgres` will be replaced with something better.
+/// Eventually, `postgres_client` will be replaced with something better.
 /// Newtype allows us to implement methods on top of it.
 #[derive(Clone, Default)]
-pub(crate) struct ConnCfg(Box<tokio_postgres::Config>);
+pub(crate) struct ConnCfg(Box<postgres_client::Config>);
 
 /// Creation and initialization routines.
 impl ConnCfg {
@@ -126,7 +126,7 @@ impl ConnCfg {
 
     pub(crate) fn get_host(&self) -> Result<Host, WakeComputeError> {
         match self.0.get_hosts() {
-            [tokio_postgres::config::Host::Tcp(s)] => Ok(s.into()),
+            [postgres_client::config::Host::Tcp(s)] => Ok(s.into()),
             // we should not have multiple address or unix addresses.
             _ => Err(WakeComputeError::BadComputeAddress(
                 "invalid compute address".into(),
@@ -160,7 +160,7 @@ impl ConnCfg {
 
         // TODO: This is especially ugly...
         if let Some(replication) = params.get("replication") {
-            use tokio_postgres::config::ReplicationMode;
+            use postgres_client::config::ReplicationMode;
             match replication {
                 "true" | "on" | "yes" | "1" => {
                     self.replication_mode(ReplicationMode::Physical);
@@ -182,7 +182,7 @@ impl ConnCfg {
 }
 
 impl std::ops::Deref for ConnCfg {
-    type Target = tokio_postgres::Config;
+    type Target = postgres_client::Config;
 
     fn deref(&self) -> &Self::Target {
         &self.0
@@ -199,7 +199,7 @@ impl std::ops::DerefMut for ConnCfg {
 impl ConnCfg {
     /// Establish a raw TCP connection to the compute node.
     async fn connect_raw(&self, timeout: Duration) -> io::Result<(SocketAddr, TcpStream, &str)> {
-        use tokio_postgres::config::Host;
+        use postgres_client::config::Host;
 
         // wrap TcpStream::connect with timeout
         let connect_with_timeout = |host, port| {
@@ -224,7 +224,7 @@ impl ConnCfg {
             })
         };
 
-        // We can't reuse connection establishing logic from `tokio_postgres` here,
+        // We can't reuse connection establishing logic from `postgres_client` here,
         // because it has no means for extracting the underlying socket which we
         // require for our business.
         let mut connection_error = None;
@@ -272,7 +272,7 @@ type RustlsStream = <MakeRustlsConnect as MakeTlsConnect<tokio::net::TcpStream>>
 pub(crate) struct PostgresConnection {
     /// Socket connected to a compute node.
     pub(crate) stream:
-        tokio_postgres::maybe_tls_stream::MaybeTlsStream<tokio::net::TcpStream, RustlsStream>,
+        postgres_client::maybe_tls_stream::MaybeTlsStream<tokio::net::TcpStream, RustlsStream>,
     /// PostgreSQL connection parameters.
     pub(crate) params: std::collections::HashMap<String, String>,
     /// Query cancellation token.
diff --git a/proxy/src/control_plane/client/mock.rs b/proxy/src/control_plane/client/mock.rs
index 9537d717a1..4d55f96ca1 100644
--- a/proxy/src/control_plane/client/mock.rs
+++ b/proxy/src/control_plane/client/mock.rs
@@ -5,7 +5,6 @@ use std::sync::Arc;
 
 use futures::TryFutureExt;
 use thiserror::Error;
-use tokio_postgres::config::SslMode;
 use tokio_postgres::Client;
 use tracing::{error, info, info_span, warn, Instrument};
 
@@ -165,7 +164,7 @@ impl MockControlPlane {
         config
             .host(self.endpoint.host_str().unwrap_or("localhost"))
             .port(self.endpoint.port().unwrap_or(5432))
-            .ssl_mode(SslMode::Disable);
+            .ssl_mode(postgres_client::config::SslMode::Disable);
 
         let node = NodeInfo {
             config,
diff --git a/proxy/src/control_plane/client/neon.rs b/proxy/src/control_plane/client/neon.rs
index 2cad981d01..5a78ec9d32 100644
--- a/proxy/src/control_plane/client/neon.rs
+++ b/proxy/src/control_plane/client/neon.rs
@@ -6,8 +6,8 @@ use std::time::Duration;
 use ::http::header::AUTHORIZATION;
 use ::http::HeaderName;
 use futures::TryFutureExt;
+use postgres_client::config::SslMode;
 use tokio::time::Instant;
-use tokio_postgres::config::SslMode;
 use tracing::{debug, info, info_span, warn, Instrument};
 
 use super::super::messages::{ControlPlaneErrorMessage, GetRoleSecret, WakeCompute};
diff --git a/proxy/src/error.rs b/proxy/src/error.rs
index 2221aac407..6a379499dc 100644
--- a/proxy/src/error.rs
+++ b/proxy/src/error.rs
@@ -84,7 +84,7 @@ pub(crate) trait ReportableError: fmt::Display + Send + 'static {
     fn get_error_kind(&self) -> ErrorKind;
 }
 
-impl ReportableError for tokio_postgres::error::Error {
+impl ReportableError for postgres_client::error::Error {
     fn get_error_kind(&self) -> ErrorKind {
         if self.as_db_error().is_some() {
             ErrorKind::Postgres
diff --git a/proxy/src/postgres_rustls/mod.rs b/proxy/src/postgres_rustls/mod.rs
index 31e7915e89..5ef20991c3 100644
--- a/proxy/src/postgres_rustls/mod.rs
+++ b/proxy/src/postgres_rustls/mod.rs
@@ -1,10 +1,10 @@
 use std::convert::TryFrom;
 use std::sync::Arc;
 
+use postgres_client::tls::MakeTlsConnect;
 use rustls::pki_types::ServerName;
 use rustls::ClientConfig;
 use tokio::io::{AsyncRead, AsyncWrite};
-use tokio_postgres::tls::MakeTlsConnect;
 
 mod private {
     use std::future::Future;
@@ -12,9 +12,9 @@ mod private {
     use std::pin::Pin;
     use std::task::{Context, Poll};
 
+    use postgres_client::tls::{ChannelBinding, TlsConnect};
     use rustls::pki_types::ServerName;
     use tokio::io::{AsyncRead, AsyncWrite, ReadBuf};
-    use tokio_postgres::tls::{ChannelBinding, TlsConnect};
     use tokio_rustls::client::TlsStream;
     use tokio_rustls::TlsConnector;
 
@@ -59,7 +59,7 @@ mod private {
 
     pub struct RustlsStream<S>(TlsStream<S>);
 
-    impl<S> tokio_postgres::tls::TlsStream for RustlsStream<S>
+    impl<S> postgres_client::tls::TlsStream for RustlsStream<S>
     where
         S: AsyncRead + AsyncWrite + Unpin,
     {
diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs
index d3f0c3e7d4..42d1491782 100644
--- a/proxy/src/proxy/retry.rs
+++ b/proxy/src/proxy/retry.rs
@@ -31,9 +31,9 @@ impl CouldRetry for io::Error {
     }
 }
 
-impl CouldRetry for tokio_postgres::error::DbError {
+impl CouldRetry for postgres_client::error::DbError {
     fn could_retry(&self) -> bool {
-        use tokio_postgres::error::SqlState;
+        use postgres_client::error::SqlState;
         matches!(
             self.code(),
             &SqlState::CONNECTION_FAILURE
@@ -43,9 +43,9 @@ impl CouldRetry for tokio_postgres::error::DbError {
         )
     }
 }
-impl ShouldRetryWakeCompute for tokio_postgres::error::DbError {
+impl ShouldRetryWakeCompute for postgres_client::error::DbError {
     fn should_retry_wake_compute(&self) -> bool {
-        use tokio_postgres::error::SqlState;
+        use postgres_client::error::SqlState;
         // Here are errors that happens after the user successfully authenticated to the database.
         // TODO: there are pgbouncer errors that should be retried, but they are not listed here.
         !matches!(
@@ -61,21 +61,21 @@ impl ShouldRetryWakeCompute for tokio_postgres::error::DbError {
     }
 }
 
-impl CouldRetry for tokio_postgres::Error {
+impl CouldRetry for postgres_client::Error {
     fn could_retry(&self) -> bool {
         if let Some(io_err) = self.source().and_then(|x| x.downcast_ref()) {
             io::Error::could_retry(io_err)
         } else if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) {
-            tokio_postgres::error::DbError::could_retry(db_err)
+            postgres_client::error::DbError::could_retry(db_err)
         } else {
             false
         }
     }
 }
-impl ShouldRetryWakeCompute for tokio_postgres::Error {
+impl ShouldRetryWakeCompute for postgres_client::Error {
     fn should_retry_wake_compute(&self) -> bool {
         if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) {
-            tokio_postgres::error::DbError::should_retry_wake_compute(db_err)
+            postgres_client::error::DbError::should_retry_wake_compute(db_err)
         } else {
             // likely an IO error. Possible the compute has shutdown and the
             // cache is stale.
diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs
index fe211adfeb..ef351f3b54 100644
--- a/proxy/src/proxy/tests/mitm.rs
+++ b/proxy/src/proxy/tests/mitm.rs
@@ -8,9 +8,9 @@ use std::fmt::Debug;
 
 use bytes::{Bytes, BytesMut};
 use futures::{SinkExt, StreamExt};
+use postgres_client::tls::TlsConnect;
 use postgres_protocol::message::frontend;
 use tokio::io::{AsyncReadExt, DuplexStream};
-use tokio_postgres::tls::TlsConnect;
 use tokio_util::codec::{Decoder, Encoder};
 
 use super::*;
@@ -158,8 +158,8 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> {
         Scram::new("password").await?,
     ));
 
-    let _client_err = tokio_postgres::Config::new()
-        .channel_binding(tokio_postgres::config::ChannelBinding::Disable)
+    let _client_err = postgres_client::Config::new()
+        .channel_binding(postgres_client::config::ChannelBinding::Disable)
         .user("user")
         .dbname("db")
         .password("password")
@@ -175,7 +175,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> {
 async fn scram_auth_prefer_channel_binding() -> anyhow::Result<()> {
     connect_failure(
         Intercept::None,
-        tokio_postgres::config::ChannelBinding::Prefer,
+        postgres_client::config::ChannelBinding::Prefer,
     )
     .await
 }
@@ -185,7 +185,7 @@ async fn scram_auth_prefer_channel_binding() -> anyhow::Result<()> {
 async fn scram_auth_prefer_channel_binding_intercept() -> anyhow::Result<()> {
     connect_failure(
         Intercept::Methods,
-        tokio_postgres::config::ChannelBinding::Prefer,
+        postgres_client::config::ChannelBinding::Prefer,
     )
     .await
 }
@@ -195,7 +195,7 @@ async fn scram_auth_prefer_channel_binding_intercept() -> anyhow::Result<()> {
 async fn scram_auth_prefer_channel_binding_intercept_response() -> anyhow::Result<()> {
     connect_failure(
         Intercept::SASLResponse,
-        tokio_postgres::config::ChannelBinding::Prefer,
+        postgres_client::config::ChannelBinding::Prefer,
     )
     .await
 }
@@ -205,7 +205,7 @@ async fn scram_auth_prefer_channel_binding_intercept_response() -> anyhow::Resul
 async fn scram_auth_require_channel_binding() -> anyhow::Result<()> {
     connect_failure(
         Intercept::None,
-        tokio_postgres::config::ChannelBinding::Require,
+        postgres_client::config::ChannelBinding::Require,
     )
     .await
 }
@@ -215,7 +215,7 @@ async fn scram_auth_require_channel_binding() -> anyhow::Result<()> {
 async fn scram_auth_require_channel_binding_intercept() -> anyhow::Result<()> {
     connect_failure(
         Intercept::Methods,
-        tokio_postgres::config::ChannelBinding::Require,
+        postgres_client::config::ChannelBinding::Require,
     )
     .await
 }
@@ -225,14 +225,14 @@ async fn scram_auth_require_channel_binding_intercept() -> anyhow::Result<()> {
 async fn scram_auth_require_channel_binding_intercept_response() -> anyhow::Result<()> {
     connect_failure(
         Intercept::SASLResponse,
-        tokio_postgres::config::ChannelBinding::Require,
+        postgres_client::config::ChannelBinding::Require,
     )
     .await
 }
 
 async fn connect_failure(
     intercept: Intercept,
-    channel_binding: tokio_postgres::config::ChannelBinding,
+    channel_binding: postgres_client::config::ChannelBinding,
 ) -> anyhow::Result<()> {
     let (server, client, client_config, server_config) = proxy_mitm(intercept).await;
     let proxy = tokio::spawn(dummy_proxy(
@@ -241,7 +241,7 @@ async fn connect_failure(
         Scram::new("password").await?,
     ));
 
-    let _client_err = tokio_postgres::Config::new()
+    let _client_err = postgres_client::Config::new()
         .channel_binding(channel_binding)
         .user("user")
         .dbname("db")
diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs
index 15be6c9724..c8b742b3ff 100644
--- a/proxy/src/proxy/tests/mod.rs
+++ b/proxy/src/proxy/tests/mod.rs
@@ -7,13 +7,13 @@ use std::time::Duration;
 use anyhow::{bail, Context};
 use async_trait::async_trait;
 use http::StatusCode;
+use postgres_client::config::SslMode;
+use postgres_client::tls::{MakeTlsConnect, NoTls};
 use retry::{retry_after, ShouldRetryWakeCompute};
 use rstest::rstest;
 use rustls::crypto::ring;
 use rustls::pki_types;
 use tokio::io::DuplexStream;
-use tokio_postgres::config::SslMode;
-use tokio_postgres::tls::{MakeTlsConnect, NoTls};
 
 use super::connect_compute::ConnectMechanism;
 use super::retry::CouldRetry;
@@ -204,7 +204,7 @@ async fn handshake_tls_is_enforced_by_proxy() -> anyhow::Result<()> {
     let (_, server_config) = generate_tls_config("generic-project-name.localhost", "localhost")?;
     let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth));
 
-    let client_err = tokio_postgres::Config::new()
+    let client_err = postgres_client::Config::new()
         .user("john_doe")
         .dbname("earth")
         .ssl_mode(SslMode::Disable)
@@ -233,7 +233,7 @@ async fn handshake_tls() -> anyhow::Result<()> {
         generate_tls_config("generic-project-name.localhost", "localhost")?;
     let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth));
 
-    let _conn = tokio_postgres::Config::new()
+    let _conn = postgres_client::Config::new()
         .user("john_doe")
         .dbname("earth")
         .ssl_mode(SslMode::Require)
@@ -249,7 +249,7 @@ async fn handshake_raw() -> anyhow::Result<()> {
 
     let proxy = tokio::spawn(dummy_proxy(client, None, NoAuth));
 
-    let _conn = tokio_postgres::Config::new()
+    let _conn = postgres_client::Config::new()
         .user("john_doe")
         .dbname("earth")
         .options("project=generic-project-name")
@@ -296,8 +296,8 @@ async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> {
         Scram::new(password).await?,
     ));
 
-    let _conn = tokio_postgres::Config::new()
-        .channel_binding(tokio_postgres::config::ChannelBinding::Require)
+    let _conn = postgres_client::Config::new()
+        .channel_binding(postgres_client::config::ChannelBinding::Require)
         .user("user")
         .dbname("db")
         .password(password)
@@ -320,8 +320,8 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> {
         Scram::new("password").await?,
     ));
 
-    let _conn = tokio_postgres::Config::new()
-        .channel_binding(tokio_postgres::config::ChannelBinding::Disable)
+    let _conn = postgres_client::Config::new()
+        .channel_binding(postgres_client::config::ChannelBinding::Disable)
         .user("user")
         .dbname("db")
         .password("password")
@@ -348,7 +348,7 @@ async fn scram_auth_mock() -> anyhow::Result<()> {
         .map(char::from)
         .collect();
 
-    let _client_err = tokio_postgres::Config::new()
+    let _client_err = postgres_client::Config::new()
         .user("user")
         .dbname("db")
         .password(&password) // no password will match the mocked secret
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index 57846a4c2c..8c7931907d 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -37,9 +37,9 @@ use crate::types::{EndpointId, Host, LOCAL_PROXY_SUFFIX};
 
 pub(crate) struct PoolingBackend {
     pub(crate) http_conn_pool: Arc<GlobalConnPool<Send, HttpConnPool<Send>>>,
-    pub(crate) local_pool: Arc<LocalConnPool<tokio_postgres::Client>>,
+    pub(crate) local_pool: Arc<LocalConnPool<postgres_client::Client>>,
     pub(crate) pool:
-        Arc<GlobalConnPool<tokio_postgres::Client, EndpointConnPool<tokio_postgres::Client>>>,
+        Arc<GlobalConnPool<postgres_client::Client, EndpointConnPool<postgres_client::Client>>>,
 
     pub(crate) config: &'static ProxyConfig,
     pub(crate) auth_backend: &'static crate::auth::Backend<'static, ()>,
@@ -170,7 +170,7 @@ impl PoolingBackend {
         conn_info: ConnInfo,
         keys: ComputeCredentials,
         force_new: bool,
-    ) -> Result<Client<tokio_postgres::Client>, HttpConnError> {
+    ) -> Result<Client<postgres_client::Client>, HttpConnError> {
         let maybe_client = if force_new {
             debug!("pool: pool is disabled");
             None
@@ -256,7 +256,7 @@ impl PoolingBackend {
         &self,
         ctx: &RequestContext,
         conn_info: ConnInfo,
-    ) -> Result<Client<tokio_postgres::Client>, HttpConnError> {
+    ) -> Result<Client<postgres_client::Client>, HttpConnError> {
         if let Some(client) = self.local_pool.get(ctx, &conn_info)? {
             return Ok(client);
         }
@@ -315,7 +315,7 @@ impl PoolingBackend {
             ));
 
         let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
-        let (client, connection) = config.connect(tokio_postgres::NoTls).await?;
+        let (client, connection) = config.connect(postgres_client::NoTls).await?;
         drop(pause);
 
         let pid = client.get_process_id();
@@ -360,7 +360,7 @@ pub(crate) enum HttpConnError {
     #[error("pooled connection closed at inconsistent state")]
     ConnectionClosedAbruptly(#[from] tokio::sync::watch::error::SendError<uuid::Uuid>),
     #[error("could not connection to postgres in compute")]
-    PostgresConnectionError(#[from] tokio_postgres::Error),
+    PostgresConnectionError(#[from] postgres_client::Error),
     #[error("could not connection to local-proxy in compute")]
     LocalProxyConnectionError(#[from] LocalProxyConnError),
     #[error("could not parse JWT payload")]
@@ -479,7 +479,7 @@ impl ShouldRetryWakeCompute for LocalProxyConnError {
 }
 
 struct TokioMechanism {
-    pool: Arc<GlobalConnPool<tokio_postgres::Client, EndpointConnPool<tokio_postgres::Client>>>,
+    pool: Arc<GlobalConnPool<postgres_client::Client, EndpointConnPool<postgres_client::Client>>>,
     conn_info: ConnInfo,
     conn_id: uuid::Uuid,
 
@@ -489,7 +489,7 @@ struct TokioMechanism {
 
 #[async_trait]
 impl ConnectMechanism for TokioMechanism {
-    type Connection = Client<tokio_postgres::Client>;
+    type Connection = Client<postgres_client::Client>;
     type ConnectError = HttpConnError;
     type Error = HttpConnError;
 
@@ -509,7 +509,7 @@ impl ConnectMechanism for TokioMechanism {
             .connect_timeout(timeout);
 
         let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
-        let res = config.connect(tokio_postgres::NoTls).await;
+        let res = config.connect(postgres_client::NoTls).await;
         drop(pause);
         let (client, connection) = permit.release_result(res)?;
 
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index c302eac568..cac5a173cb 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -5,11 +5,11 @@ use std::task::{ready, Poll};
 
 use futures::future::poll_fn;
 use futures::Future;
+use postgres_client::tls::NoTlsStream;
+use postgres_client::AsyncMessage;
 use smallvec::SmallVec;
 use tokio::net::TcpStream;
 use tokio::time::Instant;
-use tokio_postgres::tls::NoTlsStream;
-use tokio_postgres::AsyncMessage;
 use tokio_util::sync::CancellationToken;
 use tracing::{error, info, info_span, warn, Instrument};
 #[cfg(test)]
@@ -58,7 +58,7 @@ pub(crate) fn poll_client<C: ClientInnerExt>(
     ctx: &RequestContext,
     conn_info: ConnInfo,
     client: C,
-    mut connection: tokio_postgres::Connection<TcpStream, NoTlsStream>,
+    mut connection: postgres_client::Connection<TcpStream, NoTlsStream>,
     conn_id: uuid::Uuid,
     aux: MetricsAuxInfo,
 ) -> Client<C> {
diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs
index fe1d2563bc..2a46c8f9c5 100644
--- a/proxy/src/serverless/conn_pool_lib.rs
+++ b/proxy/src/serverless/conn_pool_lib.rs
@@ -7,8 +7,8 @@ use std::time::Duration;
 
 use dashmap::DashMap;
 use parking_lot::RwLock;
+use postgres_client::ReadyForQueryStatus;
 use rand::Rng;
-use tokio_postgres::ReadyForQueryStatus;
 use tracing::{debug, info, Span};
 
 use super::backend::HttpConnError;
@@ -683,7 +683,7 @@ pub(crate) trait ClientInnerExt: Sync + Send + 'static {
     fn get_process_id(&self) -> i32;
 }
 
-impl ClientInnerExt for tokio_postgres::Client {
+impl ClientInnerExt for postgres_client::Client {
     fn is_closed(&self) -> bool {
         self.is_closed()
     }
diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs
index 569e2da571..25b25c66d3 100644
--- a/proxy/src/serverless/json.rs
+++ b/proxy/src/serverless/json.rs
@@ -1,6 +1,6 @@
+use postgres_client::types::{Kind, Type};
+use postgres_client::Row;
 use serde_json::{Map, Value};
-use tokio_postgres::types::{Kind, Type};
-use tokio_postgres::Row;
 
 //
 // Convert json non-string types to strings, so that they can be passed to Postgres
@@ -61,7 +61,7 @@ fn json_array_to_pg_array(value: &Value) -> Option<String> {
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum JsonConversionError {
     #[error("internal error compute returned invalid data: {0}")]
-    AsTextError(tokio_postgres::Error),
+    AsTextError(postgres_client::Error),
     #[error("parse int error: {0}")]
     ParseIntError(#[from] std::num::ParseIntError),
     #[error("parse float error: {0}")]
diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs
index db9ac49dae..b84cde9e25 100644
--- a/proxy/src/serverless/local_conn_pool.rs
+++ b/proxy/src/serverless/local_conn_pool.rs
@@ -22,13 +22,13 @@ use indexmap::IndexMap;
 use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding};
 use p256::ecdsa::{Signature, SigningKey};
 use parking_lot::RwLock;
+use postgres_client::tls::NoTlsStream;
+use postgres_client::types::ToSql;
+use postgres_client::AsyncMessage;
 use serde_json::value::RawValue;
 use signature::Signer;
 use tokio::net::TcpStream;
 use tokio::time::Instant;
-use tokio_postgres::tls::NoTlsStream;
-use tokio_postgres::types::ToSql;
-use tokio_postgres::AsyncMessage;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info, info_span, warn, Instrument};
 
@@ -164,7 +164,7 @@ pub(crate) fn poll_client<C: ClientInnerExt>(
     ctx: &RequestContext,
     conn_info: ConnInfo,
     client: C,
-    mut connection: tokio_postgres::Connection<TcpStream, NoTlsStream>,
+    mut connection: postgres_client::Connection<TcpStream, NoTlsStream>,
     key: SigningKey,
     conn_id: uuid::Uuid,
     aux: MetricsAuxInfo,
@@ -280,7 +280,7 @@ pub(crate) fn poll_client<C: ClientInnerExt>(
     )
 }
 
-impl ClientInnerCommon<tokio_postgres::Client> {
+impl ClientInnerCommon<postgres_client::Client> {
     pub(crate) async fn set_jwt_session(&mut self, payload: &[u8]) -> Result<(), HttpConnError> {
         if let ClientDataEnum::Local(local_data) = &mut self.data {
             local_data.jti += 1;
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index a0ca7cc60d..5e85f5ec40 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -11,12 +11,12 @@ use http_body_util::{BodyExt, Full};
 use hyper::body::Incoming;
 use hyper::http::{HeaderName, HeaderValue};
 use hyper::{header, HeaderMap, Request, Response, StatusCode};
+use postgres_client::error::{DbError, ErrorPosition, SqlState};
+use postgres_client::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction};
 use pq_proto::StartupMessageParamsBuilder;
 use serde::Serialize;
 use serde_json::Value;
 use tokio::time::{self, Instant};
-use tokio_postgres::error::{DbError, ErrorPosition, SqlState};
-use tokio_postgres::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction};
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info};
 use typed_json::json;
@@ -361,7 +361,7 @@ pub(crate) enum SqlOverHttpError {
     #[error("invalid isolation level")]
     InvalidIsolationLevel,
     #[error("{0}")]
-    Postgres(#[from] tokio_postgres::Error),
+    Postgres(#[from] postgres_client::Error),
     #[error("{0}")]
     JsonConversion(#[from] JsonConversionError),
     #[error("{0}")]
@@ -986,7 +986,7 @@ async fn query_to_json<T: GenericClient>(
     // Manually drain the stream into a vector to leave row_stream hanging
     // around to get a command tag. Also check that the response is not too
     // big.
-    let mut rows: Vec<tokio_postgres::Row> = Vec::new();
+    let mut rows: Vec<postgres_client::Row> = Vec::new();
     while let Some(row) = row_stream.next().await {
         let row = row?;
         *current_size += row.body_len();
@@ -1063,13 +1063,13 @@ async fn query_to_json<T: GenericClient>(
 }
 
 enum Client {
-    Remote(conn_pool_lib::Client<tokio_postgres::Client>),
-    Local(conn_pool_lib::Client<tokio_postgres::Client>),
+    Remote(conn_pool_lib::Client<postgres_client::Client>),
+    Local(conn_pool_lib::Client<postgres_client::Client>),
 }
 
 enum Discard<'a> {
-    Remote(conn_pool_lib::Discard<'a, tokio_postgres::Client>),
-    Local(conn_pool_lib::Discard<'a, tokio_postgres::Client>),
+    Remote(conn_pool_lib::Discard<'a, postgres_client::Client>),
+    Local(conn_pool_lib::Discard<'a, postgres_client::Client>),
 }
 
 impl Client {
@@ -1080,7 +1080,7 @@ impl Client {
         }
     }
 
-    fn inner(&mut self) -> (&mut tokio_postgres::Client, Discard<'_>) {
+    fn inner(&mut self) -> (&mut postgres_client::Client, Discard<'_>) {
         match self {
             Client::Remote(client) => {
                 let (c, d) = client.inner();

From f312c6571f45395f4a5adfb2b0450741c16ebd58 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Tue, 3 Dec 2024 19:47:17 +0100
Subject: [PATCH 044/117] pageserver: respond to multiple shutdown signals
 (#9982)

## Problem

The Pageserver signal handler would only respond to a single signal and
initiate shutdown. Subsequent signals were ignored. This meant that a
`SIGQUIT` sent after a `SIGTERM` had no effect (e.g. in the case of a
slow or stalled shutdown). The `test_runner` uses this to force shutdown
if graceful shutdown is slow.

Touches #9740.

## Summary of changes

Keep responding to signals after the initial shutdown signal has been
received.

Arguably, the `test_runner` should also use `SIGKILL` rather than
`SIGQUIT` in this case, but it seems reasonable to respond to `SIGQUIT`
regardless.
---
 pageserver/src/bin/pageserver.rs | 76 +++++++++++++++++++-------------
 1 file changed, 45 insertions(+), 31 deletions(-)

diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 8fe225c6aa..567a69da3b 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -636,45 +636,59 @@ fn start_pageserver(
         tokio::net::TcpListener::from_std(pageserver_listener).context("create tokio listener")?
     });
 
-    let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
-
     // All started up! Now just sit and wait for shutdown signal.
+    BACKGROUND_RUNTIME.block_on(async move {
+        let signal_token = CancellationToken::new();
+        let signal_cancel = signal_token.child_token();
 
-    {
-        BACKGROUND_RUNTIME.block_on(async move {
+        // Spawn signal handlers. Runs in a loop since we want to be responsive to multiple signals
+        // even after triggering shutdown (e.g. a SIGQUIT after a slow SIGTERM shutdown). See:
+        // https://github.com/neondatabase/neon/issues/9740.
+        tokio::spawn(async move {
             let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap();
             let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap();
             let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap();
-            let signal = tokio::select! {
-                _ = sigquit.recv() => {
-                    info!("Got signal SIGQUIT. Terminating in immediate shutdown mode",);
-                    std::process::exit(111);
+
+            loop {
+                let signal = tokio::select! {
+                    _ = sigquit.recv() => {
+                        info!("Got signal SIGQUIT. Terminating in immediate shutdown mode.");
+                        std::process::exit(111);
+                    }
+                    _ = sigint.recv() => "SIGINT",
+                    _ = sigterm.recv() => "SIGTERM",
+                };
+
+                if !signal_token.is_cancelled() {
+                    info!("Got signal {signal}. Terminating gracefully in fast shutdown mode.");
+                    signal_token.cancel();
+                } else {
+                    info!("Got signal {signal}. Already shutting down.");
                 }
-                _ = sigint.recv() => { "SIGINT" },
-                _ = sigterm.recv() => { "SIGTERM" },
-            };
+            }
+        });
 
-            info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",);
+        // Wait for cancellation signal and shut down the pageserver.
+        //
+        // This cancels the `shutdown_pageserver` cancellation tree. Right now that tree doesn't
+        // reach very far, and `task_mgr` is used instead. The plan is to change that over time.
+        signal_cancel.cancelled().await;
 
-            // This cancels the `shutdown_pageserver` cancellation tree.
-            // Right now that tree doesn't reach very far, and `task_mgr` is used instead.
-            // The plan is to change that over time.
-            shutdown_pageserver.take();
-            pageserver::shutdown_pageserver(
-                http_endpoint_listener,
-                page_service,
-                consumption_metrics_tasks,
-                disk_usage_eviction_task,
-                &tenant_manager,
-                background_purges,
-                deletion_queue.clone(),
-                secondary_controller_tasks,
-                0,
-            )
-            .await;
-            unreachable!()
-        })
-    }
+        shutdown_pageserver.cancel();
+        pageserver::shutdown_pageserver(
+            http_endpoint_listener,
+            page_service,
+            consumption_metrics_tasks,
+            disk_usage_eviction_task,
+            &tenant_manager,
+            background_purges,
+            deletion_queue.clone(),
+            secondary_controller_tasks,
+            0,
+        )
+        .await;
+        unreachable!();
+    })
 }
 
 async fn create_remote_storage_client(

From 3baef0bca3e9217519f72734c773a6a1f880c90f Mon Sep 17 00:00:00 2001
From: Alexey Immoreev <lexx92@mail.ru>
Date: Tue, 3 Dec 2024 22:59:44 +0400
Subject: [PATCH 045/117] Improvement: add console redirect timeout warning
 (#9985)

## Problem

There is no information on session being cancelled in 2 minutes at the
moment

## Summary of changes

The timeout being logged for the user
---
 proxy/src/auth/backend/console_redirect.rs | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs
index 494564de05..619c7b4ef1 100644
--- a/proxy/src/auth/backend/console_redirect.rs
+++ b/proxy/src/auth/backend/console_redirect.rs
@@ -49,13 +49,19 @@ impl ReportableError for ConsoleRedirectError {
     }
 }
 
-fn hello_message(redirect_uri: &reqwest::Url, session_id: &str) -> String {
+fn hello_message(
+    redirect_uri: &reqwest::Url,
+    session_id: &str,
+    duration: std::time::Duration,
+) -> String {
+    let formatted_duration = humantime::format_duration(duration).to_string();
     format!(
         concat![
             "Welcome to Neon!\n",
-            "Authenticate by visiting:\n",
+            "Authenticate by visiting (will expire in {duration}):\n",
             "    {redirect_uri}{session_id}\n\n",
         ],
+        duration = formatted_duration,
         redirect_uri = redirect_uri,
         session_id = session_id,
     )
@@ -118,7 +124,11 @@ async fn authenticate(
     };
 
     let span = info_span!("console_redirect", psql_session_id = &psql_session_id);
-    let greeting = hello_message(link_uri, &psql_session_id);
+    let greeting = hello_message(
+        link_uri,
+        &psql_session_id,
+        auth_config.console_redirect_confirmation_timeout,
+    );
 
     // Give user a URL to spawn a new database.
     info!(parent: &span, "sending the auth URL to the user");

From 9ef0662a42585aa20a68db9243b7623cc5bd6c56 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Tue, 3 Dec 2024 20:00:14 +0000
Subject: [PATCH 046/117] chore(proxy): enforce single host+port (#9995)

proxy doesn't ever provide multiple hosts/ports, so this code adds a lot
of complexity of error handling for no good reason.

(stacked on #9990)
---
 libs/proxy/tokio-postgres2/src/config.rs   | 41 ++++-----------
 libs/proxy/tokio-postgres2/src/connect.rs  | 42 ++++-----------
 proxy/src/auth/backend/console_redirect.rs |  8 +--
 proxy/src/auth/backend/local.rs            |  7 +--
 proxy/src/compute.rs                       | 59 ++++++----------------
 proxy/src/control_plane/client/mock.rs     | 10 ++--
 proxy/src/control_plane/client/neon.rs     |  4 +-
 proxy/src/proxy/connect_compute.rs         |  2 +-
 proxy/src/proxy/tests/mitm.rs              |  4 +-
 proxy/src/proxy/tests/mod.rs               | 14 ++---
 proxy/src/serverless/backend.rs            | 10 ++--
 11 files changed, 59 insertions(+), 142 deletions(-)

diff --git a/libs/proxy/tokio-postgres2/src/config.rs b/libs/proxy/tokio-postgres2/src/config.rs
index 5dad835c3b..fd10ef6f20 100644
--- a/libs/proxy/tokio-postgres2/src/config.rs
+++ b/libs/proxy/tokio-postgres2/src/config.rs
@@ -146,6 +146,9 @@ pub enum AuthKeys {
 /// ```
 #[derive(Clone, PartialEq, Eq)]
 pub struct Config {
+    pub(crate) host: Host,
+    pub(crate) port: u16,
+
     pub(crate) user: Option<String>,
     pub(crate) password: Option<Vec<u8>>,
     pub(crate) auth_keys: Option<Box<AuthKeys>>,
@@ -153,8 +156,6 @@ pub struct Config {
     pub(crate) options: Option<String>,
     pub(crate) application_name: Option<String>,
     pub(crate) ssl_mode: SslMode,
-    pub(crate) host: Vec<Host>,
-    pub(crate) port: Vec<u16>,
     pub(crate) connect_timeout: Option<Duration>,
     pub(crate) target_session_attrs: TargetSessionAttrs,
     pub(crate) channel_binding: ChannelBinding,
@@ -162,16 +163,12 @@ pub struct Config {
     pub(crate) max_backend_message_size: Option<usize>,
 }
 
-impl Default for Config {
-    fn default() -> Config {
-        Config::new()
-    }
-}
-
 impl Config {
     /// Creates a new configuration.
-    pub fn new() -> Config {
+    pub fn new(host: String, port: u16) -> Config {
         Config {
+            host: Host::Tcp(host),
+            port,
             user: None,
             password: None,
             auth_keys: None,
@@ -179,8 +176,6 @@ impl Config {
             options: None,
             application_name: None,
             ssl_mode: SslMode::Prefer,
-            host: vec![],
-            port: vec![],
             connect_timeout: None,
             target_session_attrs: TargetSessionAttrs::Any,
             channel_binding: ChannelBinding::Prefer,
@@ -283,32 +278,14 @@ impl Config {
         self.ssl_mode
     }
 
-    /// Adds a host to the configuration.
-    ///
-    /// Multiple hosts can be specified by calling this method multiple times, and each will be tried in order.
-    pub fn host(&mut self, host: &str) -> &mut Config {
-        self.host.push(Host::Tcp(host.to_string()));
-        self
-    }
-
     /// Gets the hosts that have been added to the configuration with `host`.
-    pub fn get_hosts(&self) -> &[Host] {
+    pub fn get_host(&self) -> &Host {
         &self.host
     }
 
-    /// Adds a port to the configuration.
-    ///
-    /// Multiple ports can be specified by calling this method multiple times. There must either be no ports, in which
-    /// case the default of 5432 is used, a single port, in which it is used for all hosts, or the same number of ports
-    /// as hosts.
-    pub fn port(&mut self, port: u16) -> &mut Config {
-        self.port.push(port);
-        self
-    }
-
     /// Gets the ports that have been added to the configuration with `port`.
-    pub fn get_ports(&self) -> &[u16] {
-        &self.port
+    pub fn get_port(&self) -> u16 {
+        self.port
     }
 
     /// Sets the timeout applied to socket-level connection attempts.
diff --git a/libs/proxy/tokio-postgres2/src/connect.rs b/libs/proxy/tokio-postgres2/src/connect.rs
index 98067d91f9..75a58e6eac 100644
--- a/libs/proxy/tokio-postgres2/src/connect.rs
+++ b/libs/proxy/tokio-postgres2/src/connect.rs
@@ -19,38 +19,18 @@ pub async fn connect<T>(
 where
     T: MakeTlsConnect<TcpStream>,
 {
-    if config.host.is_empty() {
-        return Err(Error::config("host missing".into()));
+    let hostname = match &config.host {
+        Host::Tcp(host) => host.as_str(),
+    };
+
+    let tls = tls
+        .make_tls_connect(hostname)
+        .map_err(|e| Error::tls(e.into()))?;
+
+    match connect_once(&config.host, config.port, tls, config).await {
+        Ok((client, connection)) => Ok((client, connection)),
+        Err(e) => Err(e),
     }
-
-    if config.port.len() > 1 && config.port.len() != config.host.len() {
-        return Err(Error::config("invalid number of ports".into()));
-    }
-
-    let mut error = None;
-    for (i, host) in config.host.iter().enumerate() {
-        let port = config
-            .port
-            .get(i)
-            .or_else(|| config.port.first())
-            .copied()
-            .unwrap_or(5432);
-
-        let hostname = match host {
-            Host::Tcp(host) => host.as_str(),
-        };
-
-        let tls = tls
-            .make_tls_connect(hostname)
-            .map_err(|e| Error::tls(e.into()))?;
-
-        match connect_once(host, port, tls, config).await {
-            Ok((client, connection)) => return Ok((client, connection)),
-            Err(e) => error = Some(e),
-        }
-    }
-
-    Err(error.unwrap())
 }
 
 async fn connect_once<T>(
diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs
index 619c7b4ef1..575d60be85 100644
--- a/proxy/src/auth/backend/console_redirect.rs
+++ b/proxy/src/auth/backend/console_redirect.rs
@@ -161,12 +161,8 @@ async fn authenticate(
 
     // This config should be self-contained, because we won't
     // take username or dbname from client's startup message.
-    let mut config = compute::ConnCfg::new();
-    config
-        .host(&db_info.host)
-        .port(db_info.port)
-        .dbname(&db_info.dbname)
-        .user(&db_info.user);
+    let mut config = compute::ConnCfg::new(db_info.host.to_string(), db_info.port);
+    config.dbname(&db_info.dbname).user(&db_info.user);
 
     ctx.set_dbname(db_info.dbname.into());
     ctx.set_user(db_info.user.into());
diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs
index 32e0f53615..d4273fb521 100644
--- a/proxy/src/auth/backend/local.rs
+++ b/proxy/src/auth/backend/local.rs
@@ -29,12 +29,7 @@ impl LocalBackend {
                 api: http::Endpoint::new(compute_ctl, http::new_client()),
             },
             node_info: NodeInfo {
-                config: {
-                    let mut cfg = ConnCfg::new();
-                    cfg.host(&postgres_addr.ip().to_string());
-                    cfg.port(postgres_addr.port());
-                    cfg
-                },
+                config: ConnCfg::new(postgres_addr.ip().to_string(), postgres_addr.port()),
                 // TODO(conrad): make this better reflect compute info rather than endpoint info.
                 aux: MetricsAuxInfo {
                     endpoint_id: EndpointIdTag::get_interner().get_or_intern("local"),
diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index 06bc71c559..ab0ff4b795 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -104,13 +104,13 @@ pub(crate) type ScramKeys = postgres_client::config::ScramKeys<32>;
 /// A config for establishing a connection to compute node.
 /// Eventually, `postgres_client` will be replaced with something better.
 /// Newtype allows us to implement methods on top of it.
-#[derive(Clone, Default)]
+#[derive(Clone)]
 pub(crate) struct ConnCfg(Box<postgres_client::Config>);
 
 /// Creation and initialization routines.
 impl ConnCfg {
-    pub(crate) fn new() -> Self {
-        Self::default()
+    pub(crate) fn new(host: String, port: u16) -> Self {
+        Self(Box::new(postgres_client::Config::new(host, port)))
     }
 
     /// Reuse password or auth keys from the other config.
@@ -124,13 +124,9 @@ impl ConnCfg {
         }
     }
 
-    pub(crate) fn get_host(&self) -> Result<Host, WakeComputeError> {
-        match self.0.get_hosts() {
-            [postgres_client::config::Host::Tcp(s)] => Ok(s.into()),
-            // we should not have multiple address or unix addresses.
-            _ => Err(WakeComputeError::BadComputeAddress(
-                "invalid compute address".into(),
-            )),
+    pub(crate) fn get_host(&self) -> Host {
+        match self.0.get_host() {
+            postgres_client::config::Host::Tcp(s) => s.into(),
         }
     }
 
@@ -227,43 +223,20 @@ impl ConnCfg {
         // We can't reuse connection establishing logic from `postgres_client` here,
         // because it has no means for extracting the underlying socket which we
         // require for our business.
-        let mut connection_error = None;
-        let ports = self.0.get_ports();
-        let hosts = self.0.get_hosts();
-        // the ports array is supposed to have 0 entries, 1 entry, or as many entries as in the hosts array
-        if ports.len() > 1 && ports.len() != hosts.len() {
-            return Err(io::Error::new(
-                io::ErrorKind::Other,
-                format!(
-                    "bad compute config, \
-                     ports and hosts entries' count does not match: {:?}",
-                    self.0
-                ),
-            ));
-        }
+        let port = self.0.get_port();
+        let host = self.0.get_host();
 
-        for (i, host) in hosts.iter().enumerate() {
-            let port = ports.get(i).or_else(|| ports.first()).unwrap_or(&5432);
-            let host = match host {
-                Host::Tcp(host) => host.as_str(),
-            };
+        let host = match host {
+            Host::Tcp(host) => host.as_str(),
+        };
 
-            match connect_once(host, *port).await {
-                Ok((sockaddr, stream)) => return Ok((sockaddr, stream, host)),
-                Err(err) => {
-                    // We can't throw an error here, as there might be more hosts to try.
-                    warn!("couldn't connect to compute node at {host}:{port}: {err}");
-                    connection_error = Some(err);
-                }
+        match connect_once(host, port).await {
+            Ok((sockaddr, stream)) => Ok((sockaddr, stream, host)),
+            Err(err) => {
+                warn!("couldn't connect to compute node at {host}:{port}: {err}");
+                Err(err)
             }
         }
-
-        Err(connection_error.unwrap_or_else(|| {
-            io::Error::new(
-                io::ErrorKind::Other,
-                format!("bad compute config: {:?}", self.0),
-            )
-        }))
     }
 }
 
diff --git a/proxy/src/control_plane/client/mock.rs b/proxy/src/control_plane/client/mock.rs
index 4d55f96ca1..eaf692ab27 100644
--- a/proxy/src/control_plane/client/mock.rs
+++ b/proxy/src/control_plane/client/mock.rs
@@ -160,11 +160,11 @@ impl MockControlPlane {
     }
 
     async fn do_wake_compute(&self) -> Result<NodeInfo, WakeComputeError> {
-        let mut config = compute::ConnCfg::new();
-        config
-            .host(self.endpoint.host_str().unwrap_or("localhost"))
-            .port(self.endpoint.port().unwrap_or(5432))
-            .ssl_mode(postgres_client::config::SslMode::Disable);
+        let mut config = compute::ConnCfg::new(
+            self.endpoint.host_str().unwrap_or("localhost").to_owned(),
+            self.endpoint.port().unwrap_or(5432),
+        );
+        config.ssl_mode(postgres_client::config::SslMode::Disable);
 
         let node = NodeInfo {
             config,
diff --git a/proxy/src/control_plane/client/neon.rs b/proxy/src/control_plane/client/neon.rs
index 5a78ec9d32..5c204ae1d7 100644
--- a/proxy/src/control_plane/client/neon.rs
+++ b/proxy/src/control_plane/client/neon.rs
@@ -241,8 +241,8 @@ impl NeonControlPlaneClient {
             // Don't set anything but host and port! This config will be cached.
             // We'll set username and such later using the startup message.
             // TODO: add more type safety (in progress).
-            let mut config = compute::ConnCfg::new();
-            config.host(host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes.
+            let mut config = compute::ConnCfg::new(host.to_owned(), port);
+            config.ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes.
 
             let node = NodeInfo {
                 config,
diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs
index 2e759b0894..585dce7bae 100644
--- a/proxy/src/proxy/connect_compute.rs
+++ b/proxy/src/proxy/connect_compute.rs
@@ -86,7 +86,7 @@ impl ConnectMechanism for TcpMechanism<'_> {
         node_info: &control_plane::CachedNodeInfo,
         timeout: time::Duration,
     ) -> Result<PostgresConnection, Self::Error> {
-        let host = node_info.config.get_host()?;
+        let host = node_info.config.get_host();
         let permit = self.locks.get_permit(&host).await?;
         permit.release_result(node_info.connect(ctx, timeout).await)
     }
diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs
index ef351f3b54..d72331c7bf 100644
--- a/proxy/src/proxy/tests/mitm.rs
+++ b/proxy/src/proxy/tests/mitm.rs
@@ -158,7 +158,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> {
         Scram::new("password").await?,
     ));
 
-    let _client_err = postgres_client::Config::new()
+    let _client_err = postgres_client::Config::new("test".to_owned(), 5432)
         .channel_binding(postgres_client::config::ChannelBinding::Disable)
         .user("user")
         .dbname("db")
@@ -241,7 +241,7 @@ async fn connect_failure(
         Scram::new("password").await?,
     ));
 
-    let _client_err = postgres_client::Config::new()
+    let _client_err = postgres_client::Config::new("test".to_owned(), 5432)
         .channel_binding(channel_binding)
         .user("user")
         .dbname("db")
diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs
index c8b742b3ff..53345431e3 100644
--- a/proxy/src/proxy/tests/mod.rs
+++ b/proxy/src/proxy/tests/mod.rs
@@ -204,7 +204,7 @@ async fn handshake_tls_is_enforced_by_proxy() -> anyhow::Result<()> {
     let (_, server_config) = generate_tls_config("generic-project-name.localhost", "localhost")?;
     let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth));
 
-    let client_err = postgres_client::Config::new()
+    let client_err = postgres_client::Config::new("test".to_owned(), 5432)
         .user("john_doe")
         .dbname("earth")
         .ssl_mode(SslMode::Disable)
@@ -233,7 +233,7 @@ async fn handshake_tls() -> anyhow::Result<()> {
         generate_tls_config("generic-project-name.localhost", "localhost")?;
     let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth));
 
-    let _conn = postgres_client::Config::new()
+    let _conn = postgres_client::Config::new("test".to_owned(), 5432)
         .user("john_doe")
         .dbname("earth")
         .ssl_mode(SslMode::Require)
@@ -249,7 +249,7 @@ async fn handshake_raw() -> anyhow::Result<()> {
 
     let proxy = tokio::spawn(dummy_proxy(client, None, NoAuth));
 
-    let _conn = postgres_client::Config::new()
+    let _conn = postgres_client::Config::new("test".to_owned(), 5432)
         .user("john_doe")
         .dbname("earth")
         .options("project=generic-project-name")
@@ -296,7 +296,7 @@ async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> {
         Scram::new(password).await?,
     ));
 
-    let _conn = postgres_client::Config::new()
+    let _conn = postgres_client::Config::new("test".to_owned(), 5432)
         .channel_binding(postgres_client::config::ChannelBinding::Require)
         .user("user")
         .dbname("db")
@@ -320,7 +320,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> {
         Scram::new("password").await?,
     ));
 
-    let _conn = postgres_client::Config::new()
+    let _conn = postgres_client::Config::new("test".to_owned(), 5432)
         .channel_binding(postgres_client::config::ChannelBinding::Disable)
         .user("user")
         .dbname("db")
@@ -348,7 +348,7 @@ async fn scram_auth_mock() -> anyhow::Result<()> {
         .map(char::from)
         .collect();
 
-    let _client_err = postgres_client::Config::new()
+    let _client_err = postgres_client::Config::new("test".to_owned(), 5432)
         .user("user")
         .dbname("db")
         .password(&password) // no password will match the mocked secret
@@ -546,7 +546,7 @@ impl TestControlPlaneClient for TestConnectMechanism {
 
 fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo {
     let node = NodeInfo {
-        config: compute::ConnCfg::new(),
+        config: compute::ConnCfg::new("test".to_owned(), 5432),
         aux: MetricsAuxInfo {
             endpoint_id: (&EndpointId::from("endpoint")).into(),
             project_id: (&ProjectId::from("project")).into(),
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index 8c7931907d..55d2e47fd3 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -499,7 +499,7 @@ impl ConnectMechanism for TokioMechanism {
         node_info: &CachedNodeInfo,
         timeout: Duration,
     ) -> Result<Self::Connection, Self::ConnectError> {
-        let host = node_info.config.get_host()?;
+        let host = node_info.config.get_host();
         let permit = self.locks.get_permit(&host).await?;
 
         let mut config = (*node_info.config).clone();
@@ -549,16 +549,12 @@ impl ConnectMechanism for HyperMechanism {
         node_info: &CachedNodeInfo,
         timeout: Duration,
     ) -> Result<Self::Connection, Self::ConnectError> {
-        let host = node_info.config.get_host()?;
+        let host = node_info.config.get_host();
         let permit = self.locks.get_permit(&host).await?;
 
         let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
 
-        let port = *node_info.config.get_ports().first().ok_or_else(|| {
-            HttpConnError::WakeCompute(WakeComputeError::BadComputeAddress(
-                "local-proxy port missing on compute address".into(),
-            ))
-        })?;
+        let port = node_info.config.get_port();
         let res = connect_http2(&host, port, timeout).await;
         drop(pause);
         let (client, connection) = permit.release_result(res)?;

From ca85f364ba3fd0ed41c2be9995722725cbaee78f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Tue, 3 Dec 2024 21:39:10 +0100
Subject: [PATCH 047/117] Support tenant manifests in the scrubber (#9942)

Support tenant manifests in the storage scrubber:

* list the manifests, order them by generation
* delete all manifests except for the two most recent generations
* for the latest manifest: try parsing it.

I've tested this patch by running the against a staging bucket and it
successfully deleted stuff (and avoided deleting the latest two
generations).

In follow-up work, we might want to also check some invariants of the
manifest, as mentioned in #8088.

Part of #9386
Part of #8088

---------

Co-authored-by: Christian Schwarz <christian@neon.tech>
---
 .../src/tenant/remote_timeline_client.rs      |   4 +-
 .../tenant/remote_timeline_client/manifest.rs |   2 +-
 storage_scrubber/src/checks.rs                | 135 ++++++++-
 .../src/pageserver_physical_gc.rs             | 258 ++++++++++++++----
 test_runner/regress/test_timeline_archive.py  | 114 ++++++++
 5 files changed, 459 insertions(+), 54 deletions(-)

diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 007bd3eef0..4bb1bbf3cf 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -2564,9 +2564,9 @@ pub fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
 }
 
 /// Given the key of a tenant manifest, parse out the generation number
-pub(crate) fn parse_remote_tenant_manifest_path(path: RemotePath) -> Option<Generation> {
+pub fn parse_remote_tenant_manifest_path(path: RemotePath) -> Option<Generation> {
     static RE: OnceLock<Regex> = OnceLock::new();
-    let re = RE.get_or_init(|| Regex::new(r".+tenant-manifest-([0-9a-f]{8}).json").unwrap());
+    let re = RE.get_or_init(|| Regex::new(r".*tenant-manifest-([0-9a-f]{8}).json").unwrap());
     re.captures(path.get_path().as_str())
         .and_then(|c| c.get(1))
         .and_then(|m| Generation::parse_suffix(m.as_str()))
diff --git a/pageserver/src/tenant/remote_timeline_client/manifest.rs b/pageserver/src/tenant/remote_timeline_client/manifest.rs
index c4382cb648..2029847a12 100644
--- a/pageserver/src/tenant/remote_timeline_client/manifest.rs
+++ b/pageserver/src/tenant/remote_timeline_client/manifest.rs
@@ -43,7 +43,7 @@ impl TenantManifest {
             offloaded_timelines: vec![],
         }
     }
-    pub(crate) fn from_json_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
+    pub fn from_json_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
         serde_json::from_slice::<Self>(bytes)
     }
 
diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs
index 8d855d263c..1b4ff01a17 100644
--- a/storage_scrubber/src/checks.rs
+++ b/storage_scrubber/src/checks.rs
@@ -4,17 +4,21 @@ use itertools::Itertools;
 use pageserver::tenant::checks::check_valid_layermap;
 use pageserver::tenant::layer_map::LayerMap;
 use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
+use pageserver::tenant::remote_timeline_client::manifest::TenantManifest;
 use pageserver_api::shard::ShardIndex;
 use tokio_util::sync::CancellationToken;
 use tracing::{info, warn};
 use utils::generation::Generation;
 use utils::id::TimelineId;
+use utils::shard::TenantShardId;
 
 use crate::cloud_admin_api::BranchData;
 use crate::metadata_stream::stream_listing;
 use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId};
 use futures_util::StreamExt;
-use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path};
+use pageserver::tenant::remote_timeline_client::{
+    parse_remote_index_path, parse_remote_tenant_manifest_path, remote_layer_path,
+};
 use pageserver::tenant::storage_layer::LayerName;
 use pageserver::tenant::IndexPart;
 use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath};
@@ -527,3 +531,132 @@ async fn list_timeline_blobs_impl(
         unknown_keys,
     }))
 }
+
+pub(crate) struct RemoteTenantManifestInfo {
+    pub(crate) latest_generation: Option<Generation>,
+    pub(crate) manifests: Vec<(Generation, ListingObject)>,
+}
+
+pub(crate) enum ListTenantManifestResult {
+    WithErrors {
+        errors: Vec<(String, String)>,
+        #[allow(dead_code)]
+        unknown_keys: Vec<ListingObject>,
+    },
+    NoErrors(RemoteTenantManifestInfo),
+}
+
+/// Lists the tenant manifests in remote storage and parses the latest one, returning a [`ListTenantManifestResult`] object.
+pub(crate) async fn list_tenant_manifests(
+    remote_client: &GenericRemoteStorage,
+    tenant_id: TenantShardId,
+    root_target: &RootTarget,
+) -> anyhow::Result<ListTenantManifestResult> {
+    let mut errors = Vec::new();
+    let mut unknown_keys = Vec::new();
+
+    let mut tenant_root_target = root_target.tenant_root(&tenant_id);
+    let original_prefix = tenant_root_target.prefix_in_bucket.clone();
+    const TENANT_MANIFEST_STEM: &str = "tenant-manifest";
+    tenant_root_target.prefix_in_bucket += TENANT_MANIFEST_STEM;
+    tenant_root_target.delimiter = String::new();
+
+    let mut manifests: Vec<(Generation, ListingObject)> = Vec::new();
+
+    let prefix_str = &original_prefix
+        .strip_prefix("/")
+        .unwrap_or(&original_prefix);
+
+    let mut stream = std::pin::pin!(stream_listing(remote_client, &tenant_root_target));
+    'outer: while let Some(obj) = stream.next().await {
+        let (key, Some(obj)) = obj? else {
+            panic!("ListingObject not specified");
+        };
+
+        'err: {
+            // TODO a let chain would be nicer here.
+            let Some(name) = key.object_name() else {
+                break 'err;
+            };
+            if !name.starts_with(TENANT_MANIFEST_STEM) {
+                break 'err;
+            }
+            let Some(generation) = parse_remote_tenant_manifest_path(key.clone()) else {
+                break 'err;
+            };
+            tracing::debug!("tenant manifest {key}");
+            manifests.push((generation, obj));
+            continue 'outer;
+        }
+        tracing::info!("Listed an unknown key: {key}");
+        unknown_keys.push(obj);
+    }
+
+    if manifests.is_empty() {
+        tracing::debug!("No manifest for timeline.");
+
+        return Ok(ListTenantManifestResult::WithErrors {
+            errors,
+            unknown_keys,
+        });
+    }
+    if !unknown_keys.is_empty() {
+        errors.push(((*prefix_str).to_owned(), "unknown keys listed".to_string()));
+
+        return Ok(ListTenantManifestResult::WithErrors {
+            errors,
+            unknown_keys,
+        });
+    }
+
+    // Find the manifest with the highest generation
+    let (latest_generation, latest_listing_object) = manifests
+        .iter()
+        .max_by_key(|i| i.0)
+        .map(|(g, obj)| (*g, obj.clone()))
+        .unwrap();
+
+    let manifest_bytes =
+        match download_object_with_retries(remote_client, &latest_listing_object.key).await {
+            Ok(bytes) => bytes,
+            Err(e) => {
+                // It is possible that the tenant gets deleted in-between we list the objects
+                // and we download the manifest file.
+                errors.push((
+                    latest_listing_object.key.get_path().as_str().to_owned(),
+                    format!("failed to download tenant-manifest.json: {e}"),
+                ));
+                return Ok(ListTenantManifestResult::WithErrors {
+                    errors,
+                    unknown_keys,
+                });
+            }
+        };
+
+    match TenantManifest::from_json_bytes(&manifest_bytes) {
+        Ok(_manifest) => {
+            return Ok(ListTenantManifestResult::NoErrors(
+                RemoteTenantManifestInfo {
+                    latest_generation: Some(latest_generation),
+                    manifests,
+                },
+            ));
+        }
+        Err(parse_error) => errors.push((
+            latest_listing_object.key.get_path().as_str().to_owned(),
+            format!("tenant-manifest.json body parsing error: {parse_error}"),
+        )),
+    }
+
+    if errors.is_empty() {
+        errors.push((
+            (*prefix_str).to_owned(),
+            "Unexpected: no errors did not lead to a successfully parsed blob return".to_string(),
+        ));
+    }
+
+    Ok(ListTenantManifestResult::WithErrors {
+        errors,
+        unknown_keys,
+    })
+}
diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs
index 1e69ddbf15..20cb9c3633 100644
--- a/storage_scrubber/src/pageserver_physical_gc.rs
+++ b/storage_scrubber/src/pageserver_physical_gc.rs
@@ -2,12 +2,16 @@ use std::collections::{BTreeMap, BTreeSet, HashMap};
 use std::sync::Arc;
 use std::time::Duration;
 
-use crate::checks::{list_timeline_blobs, BlobDataParseResult};
+use crate::checks::{
+    list_tenant_manifests, list_timeline_blobs, BlobDataParseResult, ListTenantManifestResult,
+};
 use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
 use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, MAX_RETRIES};
 use futures_util::{StreamExt, TryStreamExt};
 use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
-use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path};
+use pageserver::tenant::remote_timeline_client::{
+    parse_remote_index_path, parse_remote_tenant_manifest_path, remote_layer_path,
+};
 use pageserver::tenant::storage_layer::LayerName;
 use pageserver::tenant::IndexPart;
 use pageserver_api::controller_api::TenantDescribeResponse;
@@ -25,6 +29,7 @@ use utils::id::{TenantId, TenantTimelineId};
 #[derive(Serialize, Default)]
 pub struct GcSummary {
     indices_deleted: usize,
+    tenant_manifests_deleted: usize,
     remote_storage_errors: usize,
     controller_api_errors: usize,
     ancestor_layers_deleted: usize,
@@ -34,12 +39,14 @@ impl GcSummary {
     fn merge(&mut self, other: Self) {
         let Self {
             indices_deleted,
+            tenant_manifests_deleted,
             remote_storage_errors,
             ancestor_layers_deleted,
             controller_api_errors,
         } = other;
 
         self.indices_deleted += indices_deleted;
+        self.tenant_manifests_deleted += tenant_manifests_deleted;
         self.remote_storage_errors += remote_storage_errors;
         self.ancestor_layers_deleted += ancestor_layers_deleted;
         self.controller_api_errors += controller_api_errors;
@@ -352,6 +359,69 @@ async fn maybe_delete_index(
     }
 }
 
+async fn maybe_delete_tenant_manifest(
+    remote_client: &GenericRemoteStorage,
+    min_age: &Duration,
+    latest_gen: Generation,
+    obj: &ListingObject,
+    mode: GcMode,
+    summary: &mut GcSummary,
+) {
+    // Validation: we will only delete things that parse cleanly
+    let basename = obj.key.get_path().file_name().unwrap();
+    let Some(candidate_generation) =
+        parse_remote_tenant_manifest_path(RemotePath::from_string(basename).unwrap())
+    else {
+        // A strange key: we will not delete this because we don't understand it.
+        tracing::warn!("Bad index key");
+        return;
+    };
+
+    // Validation: we will only delete manifests more than one generation old, and in fact we
+    // should never be called with such recent generations.
+    if candidate_generation >= latest_gen {
+        tracing::warn!("Deletion candidate is >= latest generation, this is a bug!");
+        return;
+    } else if candidate_generation.next() == latest_gen {
+        tracing::warn!("Deletion candidate is >= latest generation - 1, this is a bug!");
+        return;
+    }
+
+    if !is_old_enough(min_age, obj, summary) {
+        return;
+    }
+
+    if matches!(mode, GcMode::DryRun) {
+        tracing::info!("Dry run: would delete this key");
+        return;
+    }
+
+    // All validations passed: erase the object
+    let cancel = CancellationToken::new();
+    match backoff::retry(
+        || remote_client.delete(&obj.key, &cancel),
+        |_| false,
+        3,
+        MAX_RETRIES as u32,
+        "maybe_delete_tenant_manifest",
+        &cancel,
+    )
+    .await
+    {
+        None => {
+            unreachable!("Using a dummy cancellation token");
+        }
+        Some(Ok(_)) => {
+            tracing::info!("Successfully deleted tenant manifest");
+            summary.tenant_manifests_deleted += 1;
+        }
+        Some(Err(e)) => {
+            tracing::warn!("Failed to delete tenant manifest: {e}");
+            summary.remote_storage_errors += 1;
+        }
+    }
+}
+
 #[allow(clippy::too_many_arguments)]
 async fn gc_ancestor(
     remote_client: &GenericRemoteStorage,
@@ -451,13 +521,100 @@ async fn gc_ancestor(
     Ok(())
 }
 
+async fn gc_tenant_manifests(
+    remote_client: &GenericRemoteStorage,
+    min_age: Duration,
+    target: &RootTarget,
+    mode: GcMode,
+    tenant_shard_id: TenantShardId,
+) -> anyhow::Result<GcSummary> {
+    let mut gc_summary = GcSummary::default();
+    match list_tenant_manifests(remote_client, tenant_shard_id, target).await? {
+        ListTenantManifestResult::WithErrors {
+            errors,
+            unknown_keys: _,
+        } => {
+            for (_key, error) in errors {
+                tracing::warn!(%tenant_shard_id, "list_tenant_manifests: {error}");
+            }
+        }
+        ListTenantManifestResult::NoErrors(mut manifest_info) => {
+            let Some(latest_gen) = manifest_info.latest_generation else {
+                return Ok(gc_summary);
+            };
+            manifest_info
+                .manifests
+                .sort_by_key(|(generation, _obj)| *generation);
+            // skip the two latest generations (they don't neccessarily have to be 1 apart from each other)
+            let candidates = manifest_info.manifests.iter().rev().skip(2);
+            for (_generation, key) in candidates {
+                maybe_delete_tenant_manifest(
+                    remote_client,
+                    &min_age,
+                    latest_gen,
+                    key,
+                    mode,
+                    &mut gc_summary,
+                )
+                .instrument(
+                    info_span!("maybe_delete_tenant_manifest", %tenant_shard_id, ?latest_gen, %key.key),
+                )
+                .await;
+            }
+        }
+    }
+    Ok(gc_summary)
+}
+
+async fn gc_timeline(
+    remote_client: &GenericRemoteStorage,
+    min_age: &Duration,
+    target: &RootTarget,
+    mode: GcMode,
+    ttid: TenantShardTimelineId,
+    accumulator: &Arc<std::sync::Mutex<TenantRefAccumulator>>,
+) -> anyhow::Result<GcSummary> {
+    let mut summary = GcSummary::default();
+    let data = list_timeline_blobs(remote_client, ttid, target).await?;
+
+    let (index_part, latest_gen, candidates) = match &data.blob_data {
+        BlobDataParseResult::Parsed {
+            index_part,
+            index_part_generation,
+            s3_layers: _s3_layers,
+        } => (index_part, *index_part_generation, data.unused_index_keys),
+        BlobDataParseResult::Relic => {
+            // Post-deletion tenant location: don't try and GC it.
+            return Ok(summary);
+        }
+        BlobDataParseResult::Incorrect {
+            errors,
+            s3_layers: _,
+        } => {
+            // Our primary purpose isn't to report on bad data, but log this rather than skipping silently
+            tracing::warn!("Skipping timeline {ttid}, bad metadata: {errors:?}");
+            return Ok(summary);
+        }
+    };
+
+    accumulator.lock().unwrap().update(ttid, index_part);
+
+    for key in candidates {
+        maybe_delete_index(remote_client, min_age, latest_gen, &key, mode, &mut summary)
+            .instrument(info_span!("maybe_delete_index", %ttid, ?latest_gen, %key.key))
+            .await;
+    }
+
+    Ok(summary)
+}
+
 /// Physical garbage collection: removing unused S3 objects.
 ///
 /// This is distinct from the garbage collection done inside the pageserver, which operates at a higher level
 /// (keys, layers).  This type of garbage collection is about removing:
 /// - Objects that were uploaded but never referenced in the remote index (e.g. because of a shutdown between
 ///   uploading a layer and uploading an index)
-/// - Index objects from historic generations
+/// - Index objects and tenant manifests from historic generations
 ///
 /// This type of GC is not necessary for correctness: rather it serves to reduce wasted storage capacity, and
 /// make sure that object listings don't get slowed down by large numbers of garbage objects.
@@ -470,6 +627,7 @@ pub async fn pageserver_physical_gc(
 ) -> anyhow::Result<GcSummary> {
     let (remote_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?;
 
+    let remote_client = Arc::new(remote_client);
     let tenants = if tenant_shard_ids.is_empty() {
         futures::future::Either::Left(stream_tenants(&remote_client, &target))
     } else {
@@ -484,59 +642,59 @@ pub async fn pageserver_physical_gc(
     let accumulator = Arc::new(std::sync::Mutex::new(TenantRefAccumulator::default()));
 
     // Generate a stream of TenantTimelineId
-    let timelines = tenants.map_ok(|t| stream_tenant_timelines(&remote_client, &target, t));
-    let timelines = timelines.try_buffered(CONCURRENCY);
-    let timelines = timelines.try_flatten();
-
-    // Generate a stream of S3TimelineBlobData
-    async fn gc_timeline(
-        remote_client: &GenericRemoteStorage,
-        min_age: &Duration,
-        target: &RootTarget,
-        mode: GcMode,
-        ttid: TenantShardTimelineId,
-        accumulator: &Arc<std::sync::Mutex<TenantRefAccumulator>>,
-    ) -> anyhow::Result<GcSummary> {
-        let mut summary = GcSummary::default();
-        let data = list_timeline_blobs(remote_client, ttid, target).await?;
-
-        let (index_part, latest_gen, candidates) = match &data.blob_data {
-            BlobDataParseResult::Parsed {
-                index_part,
-                index_part_generation,
-                s3_layers: _s3_layers,
-            } => (index_part, *index_part_generation, data.unused_index_keys),
-            BlobDataParseResult::Relic => {
-                // Post-deletion tenant location: don't try and GC it.
-                return Ok(summary);
-            }
-            BlobDataParseResult::Incorrect {
-                errors,
-                s3_layers: _,
-            } => {
-                // Our primary purpose isn't to report on bad data, but log this rather than skipping silently
-                tracing::warn!("Skipping timeline {ttid}, bad metadata: {errors:?}");
-                return Ok(summary);
-            }
-        };
-
-        accumulator.lock().unwrap().update(ttid, index_part);
-
-        for key in candidates {
-            maybe_delete_index(remote_client, min_age, latest_gen, &key, mode, &mut summary)
-                .instrument(info_span!("maybe_delete_index", %ttid, ?latest_gen, %key.key))
-                .await;
-        }
-
-        Ok(summary)
+    enum GcSummaryOrContent<T> {
+        Content(T),
+        GcSummary(GcSummary),
     }
+    let timelines = tenants.map_ok(|tenant_shard_id| {
+        let target_ref = &target;
+        let remote_client_ref = &remote_client;
+        async move {
+            let summaries_from_manifests = match gc_tenant_manifests(
+                remote_client_ref,
+                min_age,
+                target_ref,
+                mode,
+                tenant_shard_id,
+            )
+            .await
+            {
+                Ok(gc_summary) => vec![Ok(GcSummaryOrContent::<TenantShardTimelineId>::GcSummary(
+                    gc_summary,
+                ))],
+                Err(e) => {
+                    tracing::warn!(%tenant_shard_id, "Error in gc_tenant_manifests: {e}");
+                    Vec::new()
+                }
+            };
+            stream_tenant_timelines(remote_client_ref, target_ref, tenant_shard_id)
+                .await
+                .map(|stream| {
+                    stream
+                        .map_ok(GcSummaryOrContent::Content)
+                        .chain(futures::stream::iter(summaries_from_manifests.into_iter()))
+                })
+        }
+    });
+    let timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY));
+    let timelines = timelines.try_flatten();
 
     let mut summary = GcSummary::default();
 
     // Drain futures for per-shard GC, populating accumulator as a side effect
     {
-        let timelines = timelines.map_ok(|ttid| {
-            gc_timeline(&remote_client, &min_age, &target, mode, ttid, &accumulator)
+        let timelines = timelines.map_ok(|summary_or_ttid| match summary_or_ttid {
+            GcSummaryOrContent::Content(ttid) => futures::future::Either::Left(gc_timeline(
+                &remote_client,
+                &min_age,
+                &target,
+                mode,
+                ttid,
+                &accumulator,
+            )),
+            GcSummaryOrContent::GcSummary(gc_summary) => {
+                futures::future::Either::Right(futures::future::ok(gc_summary))
+            }
         });
         let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY));
 
diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py
index 5a1e493bbe..e808dd1396 100644
--- a/test_runner/regress/test_timeline_archive.py
+++ b/test_runner/regress/test_timeline_archive.py
@@ -835,3 +835,117 @@ def test_timeline_retain_lsn(
         with env.endpoints.create_start("test_archived_branch", tenant_id=tenant_id) as endpoint:
             sum = endpoint.safe_psql("SELECT sum(key) from foo where v < 51200")
             assert sum == pre_branch_sum
+
+
+def test_timeline_offload_generations(neon_env_builder: NeonEnvBuilder):
+    """
+    Test for scrubber deleting old generations of manifests
+    """
+    remote_storage_kind = s3_storage()
+    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+
+    env = neon_env_builder.init_start()
+    ps_http = env.pageserver.http_client()
+
+    # Turn off gc and compaction loops: we want to issue them manually for better reliability
+    tenant_id, root_timeline_id = env.create_tenant(
+        conf={
+            "gc_period": "0s",
+            "compaction_period": "0s",
+            "checkpoint_distance": f"{1024 ** 2}",
+        }
+    )
+
+    # Create a branch and archive it
+    child_timeline_id = env.create_branch("test_archived_branch_persisted", tenant_id)
+
+    with env.endpoints.create_start(
+        "test_archived_branch_persisted", tenant_id=tenant_id
+    ) as endpoint:
+        endpoint.safe_psql_many(
+            [
+                "CREATE TABLE foo(key serial primary key, t text default 'data_content')",
+                "INSERT INTO foo SELECT FROM generate_series(1,512)",
+            ]
+        )
+        sum = endpoint.safe_psql("SELECT sum(key) from foo where key % 3 = 2")
+        last_flush_lsn_upload(env, endpoint, tenant_id, child_timeline_id)
+
+    assert_prefix_not_empty(
+        neon_env_builder.pageserver_remote_storage,
+        prefix=f"tenants/{str(tenant_id)}/",
+    )
+    assert_prefix_empty(
+        neon_env_builder.pageserver_remote_storage,
+        prefix=f"tenants/{str(tenant_id)}/tenant-manifest",
+    )
+
+    ps_http.timeline_archival_config(
+        tenant_id,
+        child_timeline_id,
+        state=TimelineArchivalState.ARCHIVED,
+    )
+
+    def timeline_offloaded_api(timeline_id: TimelineId) -> bool:
+        # TODO add a proper API to check if a timeline has been offloaded or not
+        return not any(
+            timeline["timeline_id"] == str(timeline_id)
+            for timeline in ps_http.timeline_list(tenant_id=tenant_id)
+        )
+
+    def child_offloaded():
+        ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=child_timeline_id)
+        assert timeline_offloaded_api(child_timeline_id)
+
+    wait_until(child_offloaded)
+
+    assert timeline_offloaded_api(child_timeline_id)
+    assert not timeline_offloaded_api(root_timeline_id)
+
+    # Reboot the pageserver a bunch of times, do unoffloads, offloads
+    for i in range(5):
+        env.pageserver.stop()
+        env.pageserver.start()
+
+        assert timeline_offloaded_api(child_timeline_id)
+        assert not timeline_offloaded_api(root_timeline_id)
+
+        ps_http.timeline_archival_config(
+            tenant_id,
+            child_timeline_id,
+            state=TimelineArchivalState.UNARCHIVED,
+        )
+
+        assert not timeline_offloaded_api(child_timeline_id)
+
+        if i % 2 == 0:
+            with env.endpoints.create_start(
+                "test_archived_branch_persisted", tenant_id=tenant_id
+            ) as endpoint:
+                sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key % 3 = 2")
+                assert sum == sum_again
+
+        ps_http.timeline_archival_config(
+            tenant_id,
+            child_timeline_id,
+            state=TimelineArchivalState.ARCHIVED,
+        )
+        wait_until(child_offloaded)
+
+    #
+    # Now ensure that scrubber runs will clean up old generations' manifests.
+    #
+
+    # Sleep some amount larger than min_age_secs
+    time.sleep(3)
+
+    # Ensure that min_age_secs has a deletion impeding effect
+    gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=3600, mode="full")
+    assert gc_summary["remote_storage_errors"] == 0
+    assert gc_summary["indices_deleted"] == 0
+    assert gc_summary["tenant_manifests_deleted"] == 0
+
+    gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1, mode="full")
+    assert gc_summary["remote_storage_errors"] == 0
+    assert gc_summary["indices_deleted"] > 0
+    assert gc_summary["tenant_manifests_deleted"] > 0

From 944c1adc4ce5534b90306b217de3d71b282d07fe Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 3 Dec 2024 23:07:03 +0100
Subject: [PATCH 048/117] tests & benchmarks: unify the way we customize the
 default tenant config (#9992)

Before this PR, some override callbacks used `.default()`, others
used `.setdefault()`.

As of this PR, all callbacks use `.setdefault()` which I think is least
prone to failure.

Aligning on a single way will set the right example for future tests
that need such customization.

The `test_pageserver_getpage_throttle.py` technically is a change in
behavior: before, it replaced the `tenant_config` field, now it just
configures the throttle. This is what I believe is intended anyway.
---
 test_runner/performance/test_branch_creation.py         | 3 +--
 test_runner/regress/test_disk_usage_eviction.py         | 3 +--
 test_runner/regress/test_pageserver_getpage_throttle.py | 3 ++-
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py
index 3ce27d6cd3..cf2212d447 100644
--- a/test_runner/performance/test_branch_creation.py
+++ b/test_runner/performance/test_branch_creation.py
@@ -142,10 +142,9 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape:
     # start without gc so we can time compaction with less noise; use shorter
     # period for compaction so it starts earlier
     def patch_default_tenant_config(config):
-        tenant_config = config.get("tenant_config", {})
+        tenant_config = config.setdefault("tenant_config", {})
         tenant_config["compaction_period"] = "3s"
         tenant_config["gc_period"] = "0s"
-        config["tenant_config"] = tenant_config
 
     env.pageserver.edit_config_toml(patch_default_tenant_config)
     env.pageserver.start(
diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py
index 05956b5b93..954db914b9 100644
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -62,9 +62,8 @@ def test_min_resident_size_override_handling(
     if config_level_override is not None:
 
         def set_min_resident_size(config):
-            tenant_config = config.get("tenant_config", {})
+            tenant_config = config.setdefault("tenant_config", {})
             tenant_config["min_resident_size_override"] = config_level_override
-            config["tenant_config"] = tenant_config
 
         env.pageserver.edit_config_toml(set_min_resident_size)
     env.pageserver.stop()
diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py
index 6d0661f068..9644ebe3e2 100644
--- a/test_runner/regress/test_pageserver_getpage_throttle.py
+++ b/test_runner/regress/test_pageserver_getpage_throttle.py
@@ -183,7 +183,8 @@ def test_throttle_fair_config_is_settable_but_ignored_in_config_toml(
     """
 
     def set_tenant_config(ps_cfg):
-        ps_cfg["tenant_config"] = {"timeline_get_throttle": throttle_config_with_field_fair_set}
+        tenant_config = ps_cfg.setdefault("tenant_config", {})
+        tenant_config["timeline_get_throttle"] = throttle_config_with_field_fair_set
 
     neon_env_builder.pageserver_config_override = set_tenant_config
     env = neon_env_builder.init_start()

From 023821a80c68531a487d54353640d029d8b354f3 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Tue, 3 Dec 2024 22:46:18 +0000
Subject: [PATCH 049/117] test_page_service_batching: fix non-numeric metrics
 (#9998)

## Problem

```
2024-12-03T15:42:46.5978335Z + poetry run python /__w/neon/neon/scripts/ingest_perf_test_result.py --ingest /__w/neon/neon/test_runner/perf-report-local
2024-12-03T15:42:49.5325077Z Traceback (most recent call last):
2024-12-03T15:42:49.5325603Z   File "/__w/neon/neon/scripts/ingest_perf_test_result.py", line 165, in <module>
2024-12-03T15:42:49.5326029Z     main()
2024-12-03T15:42:49.5326316Z   File "/__w/neon/neon/scripts/ingest_perf_test_result.py", line 155, in main
2024-12-03T15:42:49.5326739Z     ingested = ingest_perf_test_result(cur, item, recorded_at_timestamp)
2024-12-03T15:42:49.5327488Z                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
2024-12-03T15:42:49.5327914Z   File "/__w/neon/neon/scripts/ingest_perf_test_result.py", line 99, in ingest_perf_test_result
2024-12-03T15:42:49.5328321Z     psycopg2.extras.execute_values(
2024-12-03T15:42:49.5328940Z   File "/github/home/.cache/pypoetry/virtualenvs/non-package-mode-_pxWMzVK-py3.11/lib/python3.11/site-packages/psycopg2/extras.py", line 1299, in execute_values
2024-12-03T15:42:49.5335618Z     cur.execute(b''.join(parts))
2024-12-03T15:42:49.5335967Z psycopg2.errors.InvalidTextRepresentation: invalid input syntax for type numeric: "concurrent-futures"
2024-12-03T15:42:49.5336287Z LINE 57:             'concurrent-futures',
2024-12-03T15:42:49.5336462Z                      ^
```

## Summary of changes
- `test_page_service_batching`: save non-numeric params as `labels`
- Add a runtime check that `metric_value` is NUMERIC
---
 test_runner/fixtures/benchmark_fixture.py             | 10 ++++++++++
 .../pageserver/test_page_service_batching.py          | 11 ++++-------
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py
index bb8e75902e..fa3747c08f 100644
--- a/test_runner/fixtures/benchmark_fixture.py
+++ b/test_runner/fixtures/benchmark_fixture.py
@@ -266,6 +266,16 @@ class NeonBenchmarker:
         name = f"{self.PROPERTY_PREFIX}_{metric_name}"
         if labels is None:
             labels = {}
+
+        # Sometimes mypy can't catch non-numeric values,
+        # so adding a check here
+        try:
+            float(metric_value)
+        except ValueError as e:
+            raise ValueError(
+                f"`metric_value` (`{metric_value}`) must be a NUMERIC-friendly data type"
+            ) from e
+
         self.property_recorder(
             name,
             {
diff --git a/test_runner/performance/pageserver/test_page_service_batching.py b/test_runner/performance/pageserver/test_page_service_batching.py
index 562094a059..2c27368001 100644
--- a/test_runner/performance/pageserver/test_page_service_batching.py
+++ b/test_runner/performance/pageserver/test_page_service_batching.py
@@ -116,21 +116,18 @@ def test_throughput(
             # name is not a metric, we just use it to identify the test easily in the `test_...[...]`` notation
         }
     )
-    params.update(
-        {
-            f"pipelining_config.{k}": (v, {})
-            for k, v in dataclasses.asdict(pipelining_config).items()
-        }
-    )
+    # For storing configuration as a metric, insert a fake 0 with labels with actual data
+    params.update({"pipelining_config": (0, {"labels": dataclasses.asdict(pipelining_config)})})
 
     log.info("params: %s", params)
 
     for param, (value, kwargs) in params.items():
         zenbenchmark.record(
             param,
-            metric_value=value,
+            metric_value=float(value),
             unit=kwargs.pop("unit", ""),
             report=MetricReport.TEST_PARAM,
+            labels=kwargs.pop("labels", None),
             **kwargs,
         )
 

From 8d93d02c2f3215226efbcbd65d71de82e0ade023 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 4 Dec 2024 01:07:49 +0100
Subject: [PATCH 050/117] page_service: enable batching in Rust & Python Tests
 + Python benchmarks (#9993)

This is the first step towards batching rollout.

Refs

- rollout plan: https://github.com/neondatabase/cloud/issues/20620
- task https://github.com/neondatabase/neon/issues/9377
- uber-epic: https://github.com/neondatabase/neon/issues/9376
---
 libs/pageserver_api/src/config.rs     |  9 ++++++++-
 test_runner/fixtures/neon_fixtures.py | 11 +++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index e49d15ba87..09cfbc55fd 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -442,7 +442,14 @@ impl Default for ConfigToml {
             tenant_config: TenantConfigToml::default(),
             no_sync: None,
             wal_receiver_protocol: DEFAULT_WAL_RECEIVER_PROTOCOL,
-            page_service_pipelining: PageServicePipeliningConfig::Serial,
+            page_service_pipelining: if !cfg!(test) {
+                PageServicePipeliningConfig::Serial
+            } else {
+                PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined {
+                    max_batch_size: NonZeroUsize::new(32).unwrap(),
+                    execution: PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures,
+                })
+            },
         }
     }
 }
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index f55f06bebc..9c579373e8 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1095,6 +1095,17 @@ class NeonEnv:
                 # the pageserver taking a long time to start up due to syncfs flushing other tests' data
                 "no_sync": True,
             }
+
+            # Batching (https://github.com/neondatabase/neon/issues/9377):
+            # enable batching by default in tests and benchmarks.
+            # Compat tests are exempt because old versions fail to parse the new config.
+            if not config.compatibility_neon_binpath:
+                ps_cfg["page_service_pipelining"] = {
+                    "mode": "pipelined",
+                    "execution": "concurrent-futures",
+                    "max_batch_size": 32,
+                }
+
             if self.pageserver_virtual_file_io_engine is not None:
                 ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine
             if config.pageserver_default_tenant_config_compaction_algorithm is not None:

From 68205c48edab32f2d08523332a5a25af80eb0770 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Wed, 4 Dec 2024 09:25:29 +0000
Subject: [PATCH 051/117] storcon: return an error for drain attempts while
 paused (#9997)

## Problem

We currently allow drain operations to proceed while the node policy is
paused.

## Summary of changes

Return a precondition failed error in such cases. The orchestrator is
updated in https://github.com/neondatabase/infra/pull/2544 to skip drain
and fills if the pageserver is paused.

Closes: https://github.com/neondatabase/neon/issues/9907
---
 storage_controller/src/service.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 741d3dc2b4..92ec58cb4d 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -5681,7 +5681,7 @@ impl Service {
         }
 
         match node_policy {
-            NodeSchedulingPolicy::Active | NodeSchedulingPolicy::Pause => {
+            NodeSchedulingPolicy::Active => {
                 self.node_configure(node_id, None, Some(NodeSchedulingPolicy::Draining))
                     .await?;
 

From 1b3558df7a4fa3b6a44d7e5c5fd4c18fe4cd9acd Mon Sep 17 00:00:00 2001
From: Peter Bendel <peterbendel@neon.tech>
Date: Wed, 4 Dec 2024 12:07:22 +0100
Subject: [PATCH 052/117] optimize parms for ingest bench (#9999)

## Problem

we tried different parallelism settings for ingest bench

## Summary of changes

the following settings seem optimal after merging
- SK side Wal filtering
- batched getpages

Settings:
- effective_io_concurrency 100
- concurrency limit 200 (different from Prod!)
- jobs 4, maintenance workers 7
- 10 GB chunk size
---
 .github/workflows/ingest_benchmark.yml                 |  1 +
 .../performance/test_perf_ingest_using_pgcopydb.py     | 10 +++++-----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/ingest_benchmark.yml b/.github/workflows/ingest_benchmark.yml
index 1033dc6489..a5810e91a4 100644
--- a/.github/workflows/ingest_benchmark.yml
+++ b/.github/workflows/ingest_benchmark.yml
@@ -26,6 +26,7 @@ concurrency:
 jobs:
   ingest:
     strategy:
+      fail-fast: false # allow other variants to continue even if one fails
       matrix:
         target_project: [new_empty_project, large_existing_project]  
     permissions:
diff --git a/test_runner/performance/test_perf_ingest_using_pgcopydb.py b/test_runner/performance/test_perf_ingest_using_pgcopydb.py
index 37f2e9db50..2f4574ba88 100644
--- a/test_runner/performance/test_perf_ingest_using_pgcopydb.py
+++ b/test_runner/performance/test_perf_ingest_using_pgcopydb.py
@@ -60,13 +60,13 @@ def build_pgcopydb_command(pgcopydb_filter_file: Path, test_output_dir: Path):
         "--no-acl",
         "--skip-db-properties",
         "--table-jobs",
-        "8",
+        "4",
         "--index-jobs",
-        "8",
+        "4",
         "--restore-jobs",
-        "8",
+        "4",
         "--split-tables-larger-than",
-        "5GB",
+        "10GB",
         "--skip-extensions",
         "--use-copy-binary",
         "--filters",
@@ -136,7 +136,7 @@ def run_command_and_log_output(command, log_file_path: Path):
         "LD_LIBRARY_PATH": f"{os.getenv('PGCOPYDB_LIB_PATH')}:{os.getenv('PG_16_LIB_PATH')}",
         "PGCOPYDB_SOURCE_PGURI": cast(str, os.getenv("BENCHMARK_INGEST_SOURCE_CONNSTR")),
         "PGCOPYDB_TARGET_PGURI": cast(str, os.getenv("BENCHMARK_INGEST_TARGET_CONNSTR")),
-        "PGOPTIONS": "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=16",
+        "PGOPTIONS": "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7",
     }
     # Combine the current environment with custom variables
     env = os.environ.copy()

From 9d75218ba7ad6340abfaa9a9bfbbe6fa443841f0 Mon Sep 17 00:00:00 2001
From: Peter Bendel <peterbendel@neon.tech>
Date: Wed, 4 Dec 2024 12:37:24 +0100
Subject: [PATCH 053/117] fix parsing human time output like "50m37s" (#10001)

## Problem

In ingest_benchmark.yml workflow we use pgcopydb tool to migrate
project.
pgcopydb logs human time.

Our parsing of the human time doesn't work for times like "50m37s".

[Example
workflow](https://github.com/neondatabase/neon/actions/runs/12145539948/job/33867418065#step:10:479)

contains "57m45s"

but we
[reported](https://github.com/neondatabase/neon/actions/runs/12145539948/job/33867418065#step:10:500)
only the seconds part:
45.000 s


## Summary of changes

add a regex pattern for Minute/Second combination
---
 test_runner/performance/test_perf_ingest_using_pgcopydb.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_runner/performance/test_perf_ingest_using_pgcopydb.py b/test_runner/performance/test_perf_ingest_using_pgcopydb.py
index 2f4574ba88..f0a0c1f5a2 100644
--- a/test_runner/performance/test_perf_ingest_using_pgcopydb.py
+++ b/test_runner/performance/test_perf_ingest_using_pgcopydb.py
@@ -184,7 +184,7 @@ def parse_log_and_report_metrics(
             for metric_name, pattern in metric_patterns.items():
                 if pattern.search(line):
                     # Extract duration and convert it to seconds
-                    duration_match = re.search(r"\d+h\d+m|\d+s|\d+ms|\d+\.\d+s", line)
+                    duration_match = re.search(r"\d+h\d+m|\d+m\d+s|\d+s|\d+ms|\d+\.\d+s", line)
                     if duration_match:
                         duration_str = duration_match.group(0)
                         parts = re.findall(r"\d+[a-zA-Z]+", duration_str)

From 7b18e33997b861ce92bce9192007f87ab45708a9 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Wed, 4 Dec 2024 13:53:52 +0100
Subject: [PATCH 054/117] pageserver: return proper status code for
 heatmap_upload errors (#9991)

## Problem

During deploys, we see a lot of 500 errors due to heapmap uploads for
inactive tenants. These should be 503s instead.

Resolves #9574.

## Summary of changes

Make the secondary tenant scheduler use `ApiError` rather than
`anyhow::Error`, to propagate the tenant error and convert it to an
appropriate status code.
---
 pageserver/src/http/routes.rs                 | 28 ++++++++++++----
 pageserver/src/tenant.rs                      |  2 +-
 pageserver/src/tenant/mgr.rs                  |  5 ++-
 pageserver/src/tenant/secondary.rs            | 33 +++++++++++++++----
 pageserver/src/tenant/secondary/downloader.rs | 13 ++++----
 .../src/tenant/secondary/heatmap_uploader.rs  | 10 +++---
 pageserver/src/tenant/secondary/scheduler.rs  |  4 +--
 7 files changed, 68 insertions(+), 27 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index e127871549..e04f1460a8 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -279,7 +279,10 @@ impl From<TenantStateError> for ApiError {
 impl From<GetTenantError> for ApiError {
     fn from(tse: GetTenantError) -> ApiError {
         match tse {
-            GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
+            GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {tid}").into()),
+            GetTenantError::ShardNotFound(tid) => {
+                ApiError::NotFound(anyhow!("tenant {tid}").into())
+            }
             GetTenantError::NotActive(_) => {
                 // Why is this not `ApiError::NotFound`?
                 // Because we must be careful to never return 404 for a tenant if it does
@@ -387,6 +390,16 @@ impl From<crate::tenant::mgr::DeleteTenantError> for ApiError {
     }
 }
 
+impl From<crate::tenant::secondary::SecondaryTenantError> for ApiError {
+    fn from(ste: crate::tenant::secondary::SecondaryTenantError) -> ApiError {
+        use crate::tenant::secondary::SecondaryTenantError;
+        match ste {
+            SecondaryTenantError::GetTenant(gte) => gte.into(),
+            SecondaryTenantError::ShuttingDown => ApiError::ShuttingDown,
+        }
+    }
+}
+
 // Helper function to construct a TimelineInfo struct for a timeline
 async fn build_timeline_info(
     timeline: &Arc<Timeline>,
@@ -1047,9 +1060,11 @@ async fn timeline_delete_handler(
             match e {
                 // GetTenantError has a built-in conversion to ApiError, but in this context we don't
                 // want to treat missing tenants as 404, to avoid ambiguity with successful deletions.
-                GetTenantError::NotFound(_) => ApiError::PreconditionFailed(
-                    "Requested tenant is missing".to_string().into_boxed_str(),
-                ),
+                GetTenantError::NotFound(_) | GetTenantError::ShardNotFound(_) => {
+                    ApiError::PreconditionFailed(
+                        "Requested tenant is missing".to_string().into_boxed_str(),
+                    )
+                }
                 e => e.into(),
             }
         })?;
@@ -2462,8 +2477,7 @@ async fn secondary_upload_handler(
     state
         .secondary_controller
         .upload_tenant(tenant_shard_id)
-        .await
-        .map_err(ApiError::InternalServerError)?;
+        .await?;
 
     json_response(StatusCode::OK, ())
 }
@@ -2578,7 +2592,7 @@ async fn secondary_download_handler(
         // Edge case: downloads aren't usually fallible: things like a missing heatmap are considered
         // okay.  We could get an error here in the unlikely edge case that the tenant
         // was detached between our check above and executing the download job.
-        Ok(Err(e)) => return Err(ApiError::InternalServerError(e)),
+        Ok(Err(e)) => return Err(e.into()),
         // A timeout is not an error: we have started the download, we're just not done
         // yet.  The caller will get a response body indicating status.
         Err(_) => StatusCode::ACCEPTED,
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index ada5c4a977..5a9e398586 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3422,7 +3422,7 @@ impl Tenant {
                             r.map_err(
                             |_e: tokio::sync::watch::error::RecvError|
                                 // Tenant existed but was dropped: report it as non-existent
-                                GetActiveTenantError::NotFound(GetTenantError::NotFound(self.tenant_shard_id.tenant_id))
+                                GetActiveTenantError::NotFound(GetTenantError::ShardNotFound(self.tenant_shard_id))
                         )?
                         }
                         Err(TimeoutCancellableError::Cancelled) => {
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 45481c4ed4..e8b0d1d4dd 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -894,7 +894,7 @@ impl TenantManager {
             Some(TenantSlot::Attached(tenant)) => Ok(Arc::clone(tenant)),
             Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)),
             None | Some(TenantSlot::Secondary(_)) => {
-                Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
+                Err(GetTenantError::ShardNotFound(tenant_shard_id))
             }
         }
     }
@@ -2258,6 +2258,9 @@ pub(crate) enum GetTenantError {
     #[error("Tenant {0} not found")]
     NotFound(TenantId),
 
+    #[error("Tenant {0} not found")]
+    ShardNotFound(TenantShardId),
+
     #[error("Tenant {0} is not active")]
     NotActive(TenantShardId),
 
diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs
index 3df89a928c..4bc208331b 100644
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -22,6 +22,7 @@ use super::{
     mgr::TenantManager,
     span::debug_assert_current_span_has_tenant_id,
     storage_layer::LayerName,
+    GetTenantError,
 };
 
 use crate::metrics::SECONDARY_RESIDENT_PHYSICAL_SIZE;
@@ -66,7 +67,21 @@ struct CommandRequest<T> {
 }
 
 struct CommandResponse {
-    result: anyhow::Result<()>,
+    result: Result<(), SecondaryTenantError>,
+}
+
+#[derive(thiserror::Error, Debug)]
+pub(crate) enum SecondaryTenantError {
+    #[error("{0}")]
+    GetTenant(GetTenantError),
+    #[error("shutting down")]
+    ShuttingDown,
+}
+
+impl From<GetTenantError> for SecondaryTenantError {
+    fn from(gte: GetTenantError) -> Self {
+        Self::GetTenant(gte)
+    }
 }
 
 // Whereas [`Tenant`] represents an attached tenant, this type represents the work
@@ -285,7 +300,7 @@ impl SecondaryController {
         &self,
         queue: &tokio::sync::mpsc::Sender<CommandRequest<T>>,
         payload: T,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), SecondaryTenantError> {
         let (response_tx, response_rx) = tokio::sync::oneshot::channel();
 
         queue
@@ -294,20 +309,26 @@ impl SecondaryController {
                 response_tx,
             })
             .await
-            .map_err(|_| anyhow::anyhow!("Receiver shut down"))?;
+            .map_err(|_| SecondaryTenantError::ShuttingDown)?;
 
         let response = response_rx
             .await
-            .map_err(|_| anyhow::anyhow!("Request dropped"))?;
+            .map_err(|_| SecondaryTenantError::ShuttingDown)?;
 
         response.result
     }
 
-    pub async fn upload_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
+    pub(crate) async fn upload_tenant(
+        &self,
+        tenant_shard_id: TenantShardId,
+    ) -> Result<(), SecondaryTenantError> {
         self.dispatch(&self.upload_req_tx, UploadCommand::Upload(tenant_shard_id))
             .await
     }
-    pub async fn download_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
+    pub(crate) async fn download_tenant(
+        &self,
+        tenant_shard_id: TenantShardId,
+    ) -> Result<(), SecondaryTenantError> {
         self.dispatch(
             &self.download_req_tx,
             DownloadCommand::Download(tenant_shard_id),
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index 8d771dc405..701e4cf04b 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -35,7 +35,7 @@ use super::{
         self, period_jitter, period_warmup, Completion, JobGenerator, SchedulingResult,
         TenantBackgroundJobs,
     },
-    SecondaryTenant,
+    GetTenantError, SecondaryTenant, SecondaryTenantError,
 };
 
 use crate::tenant::{
@@ -470,15 +470,16 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
         result
     }
 
-    fn on_command(&mut self, command: DownloadCommand) -> anyhow::Result<PendingDownload> {
+    fn on_command(
+        &mut self,
+        command: DownloadCommand,
+    ) -> Result<PendingDownload, SecondaryTenantError> {
         let tenant_shard_id = command.get_tenant_shard_id();
 
         let tenant = self
             .tenant_manager
-            .get_secondary_tenant_shard(*tenant_shard_id);
-        let Some(tenant) = tenant else {
-            return Err(anyhow::anyhow!("Not found or not in Secondary mode"));
-        };
+            .get_secondary_tenant_shard(*tenant_shard_id)
+            .ok_or(GetTenantError::ShardNotFound(*tenant_shard_id))?;
 
         Ok(PendingDownload {
             target_time: None,
diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs
index e680fd705b..c5e5e04945 100644
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -28,7 +28,7 @@ use super::{
         self, period_jitter, period_warmup, JobGenerator, RunningJob, SchedulingResult,
         TenantBackgroundJobs,
     },
-    CommandRequest, UploadCommand,
+    CommandRequest, SecondaryTenantError, UploadCommand,
 };
 use tokio_util::sync::CancellationToken;
 use tracing::{info_span, instrument, Instrument};
@@ -279,7 +279,10 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
         }.instrument(info_span!(parent: None, "heatmap_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
     }
 
-    fn on_command(&mut self, command: UploadCommand) -> anyhow::Result<UploadPending> {
+    fn on_command(
+        &mut self,
+        command: UploadCommand,
+    ) -> Result<UploadPending, SecondaryTenantError> {
         let tenant_shard_id = command.get_tenant_shard_id();
 
         tracing::info!(
@@ -287,8 +290,7 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
             "Starting heatmap write on command");
         let tenant = self
             .tenant_manager
-            .get_attached_tenant_shard(*tenant_shard_id)
-            .map_err(|e| anyhow::anyhow!(e))?;
+            .get_attached_tenant_shard(*tenant_shard_id)?;
         if !tenant.is_active() {
             return Err(GetTenantError::NotActive(*tenant_shard_id).into());
         }
diff --git a/pageserver/src/tenant/secondary/scheduler.rs b/pageserver/src/tenant/secondary/scheduler.rs
index 28cf2125df..e963c722b9 100644
--- a/pageserver/src/tenant/secondary/scheduler.rs
+++ b/pageserver/src/tenant/secondary/scheduler.rs
@@ -12,7 +12,7 @@ use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
 use utils::{completion::Barrier, yielding_loop::yielding_loop};
 
-use super::{CommandRequest, CommandResponse};
+use super::{CommandRequest, CommandResponse, SecondaryTenantError};
 
 /// Scheduling interval is the time between calls to JobGenerator::schedule.
 /// When we schedule jobs, the job generator may provide a hint of its preferred
@@ -112,7 +112,7 @@ where
 
     /// Called when a command is received.  A job will be spawned immediately if the return
     /// value is Some, ignoring concurrency limits and the pending queue.
-    fn on_command(&mut self, cmd: CMD) -> anyhow::Result<PJ>;
+    fn on_command(&mut self, cmd: CMD) -> Result<PJ, SecondaryTenantError>;
 }
 
 /// [`JobGenerator`] returns this to provide pending jobs, and hints about scheduling

From dcd016bbfc10666e10fcd9f9f2bce93a2ec2f1f9 Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Wed, 4 Dec 2024 13:58:31 +0100
Subject: [PATCH 055/117] Assign /libs/proxy/ to proxy team (#10003)

---
 CODEOWNERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CODEOWNERS b/CODEOWNERS
index 21b0e7c51f..f41462c98b 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -2,6 +2,7 @@
 /compute_tools/ @neondatabase/control-plane @neondatabase/compute
 /libs/pageserver_api/ @neondatabase/storage
 /libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage
+/libs/proxy/ @neondatabase/proxy
 /libs/remote_storage/ @neondatabase/storage
 /libs/safekeeper_api/ @neondatabase/storage
 /libs/vm_monitor/ @neondatabase/autoscaling

From bd52822e14cd1c62ad4f39ac599964a645c4aa32 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Wed, 4 Dec 2024 12:58:35 +0000
Subject: [PATCH 056/117] feat(proxy): add option to forward startup params
 (#9979)

(stacked on #9990 and #9995)

Partially fixes #1287 with a custom option field to enable the fixed
behaviour. This allows us to gradually roll out the fix without silently
changing the observed behaviour for our customers.

related to https://github.com/neondatabase/cloud/issues/15284
---
 Cargo.lock                                    |   4 +-
 Cargo.toml                                    |   2 +-
 libs/pq_proto/src/lib.rs                      |   2 +-
 .../src/authentication/sasl.rs                |   4 +-
 .../src/message/frontend.rs                   |  28 ++-
 libs/proxy/tokio-postgres2/src/codec.rs       |  13 +-
 libs/proxy/tokio-postgres2/src/config.rs      | 193 +++---------------
 libs/proxy/tokio-postgres2/src/connect.rs     |  49 +----
 libs/proxy/tokio-postgres2/src/connect_raw.rs |  31 +--
 proxy/src/cancellation.rs                     |  11 +-
 proxy/src/compute.rs                          |  96 ++++-----
 proxy/src/console_redirect_proxy.rs           |   1 +
 proxy/src/proxy/connect_compute.rs            |   4 +-
 proxy/src/proxy/mod.rs                        |  40 +++-
 proxy/src/proxy/tests/mitm.rs                 |   8 +-
 proxy/src/proxy/tests/mod.rs                  |   2 +-
 proxy/src/serverless/backend.rs               |  11 +-
 test_runner/fixtures/neon_fixtures.py         |   2 +-
 test_runner/regress/test_proxy.py             |  19 ++
 19 files changed, 180 insertions(+), 340 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 5b80ec5e93..38158b7aec 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1031,9 +1031,9 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
 
 [[package]]
 name = "bytes"
-version = "1.5.0"
+version = "1.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223"
+checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b"
 dependencies = [
  "serde",
 ]
diff --git a/Cargo.toml b/Cargo.toml
index 91fa6a2607..a35823e0c2 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -74,7 +74,7 @@ bindgen = "0.70"
 bit_field = "0.10.2"
 bstr = "1.0"
 byteorder = "1.4"
-bytes = "1.0"
+bytes = "1.9"
 camino = "1.1.6"
 cfg-if = "1.0.0"
 chrono = { version = "0.4", default-features = false, features = ["clock"] }
diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs
index 43dfbc22a4..94714359a3 100644
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -100,7 +100,7 @@ impl StartupMessageParamsBuilder {
 
 #[derive(Debug, Clone, Default)]
 pub struct StartupMessageParams {
-    params: Bytes,
+    pub params: Bytes,
 }
 
 impl StartupMessageParams {
diff --git a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs
index 19aa3c1e9a..f2200a40ce 100644
--- a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs
+++ b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs
@@ -117,7 +117,7 @@ enum Credentials<const N: usize> {
     /// A regular password as a vector of bytes.
     Password(Vec<u8>),
     /// A precomputed pair of keys.
-    Keys(Box<ScramKeys<N>>),
+    Keys(ScramKeys<N>),
 }
 
 enum State {
@@ -176,7 +176,7 @@ impl ScramSha256 {
 
     /// Constructs a new instance which will use the provided key pair for authentication.
     pub fn new_with_keys(keys: ScramKeys<32>, channel_binding: ChannelBinding) -> ScramSha256 {
-        let password = Credentials::Keys(keys.into());
+        let password = Credentials::Keys(keys);
         ScramSha256::new_inner(password, channel_binding, nonce())
     }
 
diff --git a/libs/proxy/postgres-protocol2/src/message/frontend.rs b/libs/proxy/postgres-protocol2/src/message/frontend.rs
index 5d0a8ff8c8..bc6168f337 100644
--- a/libs/proxy/postgres-protocol2/src/message/frontend.rs
+++ b/libs/proxy/postgres-protocol2/src/message/frontend.rs
@@ -255,22 +255,34 @@ pub fn ssl_request(buf: &mut BytesMut) {
 }
 
 #[inline]
-pub fn startup_message<'a, I>(parameters: I, buf: &mut BytesMut) -> io::Result<()>
-where
-    I: IntoIterator<Item = (&'a str, &'a str)>,
-{
+pub fn startup_message(parameters: &StartupMessageParams, buf: &mut BytesMut) -> io::Result<()> {
     write_body(buf, |buf| {
         // postgres protocol version 3.0(196608) in bigger-endian
         buf.put_i32(0x00_03_00_00);
-        for (key, value) in parameters {
-            write_cstr(key.as_bytes(), buf)?;
-            write_cstr(value.as_bytes(), buf)?;
-        }
+        buf.put_slice(&parameters.params);
         buf.put_u8(0);
         Ok(())
     })
 }
 
+#[derive(Debug, Clone, Default, PartialEq, Eq)]
+pub struct StartupMessageParams {
+    pub params: BytesMut,
+}
+
+impl StartupMessageParams {
+    /// Set parameter's value by its name.
+    pub fn insert(&mut self, name: &str, value: &str) {
+        if name.contains('\0') || value.contains('\0') {
+            panic!("startup parameter name or value contained a null")
+        }
+        self.params.put_slice(name.as_bytes());
+        self.params.put_u8(0);
+        self.params.put_slice(value.as_bytes());
+        self.params.put_u8(0);
+    }
+}
+
 #[inline]
 pub fn sync(buf: &mut BytesMut) {
     buf.put_u8(b'S');
diff --git a/libs/proxy/tokio-postgres2/src/codec.rs b/libs/proxy/tokio-postgres2/src/codec.rs
index 7412db785b..0ec46198ce 100644
--- a/libs/proxy/tokio-postgres2/src/codec.rs
+++ b/libs/proxy/tokio-postgres2/src/codec.rs
@@ -35,9 +35,7 @@ impl FallibleIterator for BackendMessages {
     }
 }
 
-pub struct PostgresCodec {
-    pub max_message_size: Option<usize>,
-}
+pub struct PostgresCodec;
 
 impl Encoder<FrontendMessage> for PostgresCodec {
     type Error = io::Error;
@@ -66,15 +64,6 @@ impl Decoder for PostgresCodec {
                 break;
             }
 
-            if let Some(max) = self.max_message_size {
-                if len > max {
-                    return Err(io::Error::new(
-                        io::ErrorKind::InvalidInput,
-                        "message too large",
-                    ));
-                }
-            }
-
             match header.tag() {
                 backend::NOTICE_RESPONSE_TAG
                 | backend::NOTIFICATION_RESPONSE_TAG
diff --git a/libs/proxy/tokio-postgres2/src/config.rs b/libs/proxy/tokio-postgres2/src/config.rs
index fd10ef6f20..11a361a81b 100644
--- a/libs/proxy/tokio-postgres2/src/config.rs
+++ b/libs/proxy/tokio-postgres2/src/config.rs
@@ -6,6 +6,7 @@ use crate::connect_raw::RawConnection;
 use crate::tls::MakeTlsConnect;
 use crate::tls::TlsConnect;
 use crate::{Client, Connection, Error};
+use postgres_protocol2::message::frontend::StartupMessageParams;
 use std::fmt;
 use std::str;
 use std::time::Duration;
@@ -14,16 +15,6 @@ use tokio::io::{AsyncRead, AsyncWrite};
 pub use postgres_protocol2::authentication::sasl::ScramKeys;
 use tokio::net::TcpStream;
 
-/// Properties required of a session.
-#[derive(Debug, Copy, Clone, PartialEq, Eq)]
-#[non_exhaustive]
-pub enum TargetSessionAttrs {
-    /// No special properties are required.
-    Any,
-    /// The session must allow writes.
-    ReadWrite,
-}
-
 /// TLS configuration.
 #[derive(Debug, Copy, Clone, PartialEq, Eq)]
 #[non_exhaustive]
@@ -73,94 +64,20 @@ pub enum AuthKeys {
 }
 
 /// Connection configuration.
-///
-/// Configuration can be parsed from libpq-style connection strings. These strings come in two formats:
-///
-/// # Key-Value
-///
-/// This format consists of space-separated key-value pairs. Values which are either the empty string or contain
-/// whitespace should be wrapped in `'`. `'` and `\` characters should be backslash-escaped.
-///
-/// ## Keys
-///
-/// * `user` - The username to authenticate with. Required.
-/// * `password` - The password to authenticate with.
-/// * `dbname` - The name of the database to connect to. Defaults to the username.
-/// * `options` - Command line options used to configure the server.
-/// * `application_name` - Sets the `application_name` parameter on the server.
-/// * `sslmode` - Controls usage of TLS. If set to `disable`, TLS will not be used. If set to `prefer`, TLS will be used
-///     if available, but not used otherwise. If set to `require`, TLS will be forced to be used. Defaults to `prefer`.
-/// * `host` - The host to connect to. On Unix platforms, if the host starts with a `/` character it is treated as the
-///     path to the directory containing Unix domain sockets. Otherwise, it is treated as a hostname. Multiple hosts
-///     can be specified, separated by commas. Each host will be tried in turn when connecting. Required if connecting
-///     with the `connect` method.
-/// * `port` - The port to connect to. Multiple ports can be specified, separated by commas. The number of ports must be
-///     either 1, in which case it will be used for all hosts, or the same as the number of hosts. Defaults to 5432 if
-///     omitted or the empty string.
-/// * `connect_timeout` - The time limit in seconds applied to each socket-level connection attempt. Note that hostnames
-///     can resolve to multiple IP addresses, and this limit is applied to each address. Defaults to no timeout.
-/// * `target_session_attrs` - Specifies requirements of the session. If set to `read-write`, the client will check that
-///     the `transaction_read_write` session parameter is set to `on`. This can be used to connect to the primary server
-///     in a database cluster as opposed to the secondary read-only mirrors. Defaults to `all`.
-/// * `channel_binding` - Controls usage of channel binding in the authentication process. If set to `disable`, channel
-///     binding will not be used. If set to `prefer`, channel binding will be used if available, but not used otherwise.
-///     If set to `require`, the authentication process will fail if channel binding is not used. Defaults to `prefer`.
-///
-/// ## Examples
-///
-/// ```not_rust
-/// host=localhost user=postgres connect_timeout=10 keepalives=0
-/// ```
-///
-/// ```not_rust
-/// host=/var/lib/postgresql,localhost port=1234 user=postgres password='password with spaces'
-/// ```
-///
-/// ```not_rust
-/// host=host1,host2,host3 port=1234,,5678 user=postgres target_session_attrs=read-write
-/// ```
-///
-/// # Url
-///
-/// This format resembles a URL with a scheme of either `postgres://` or `postgresql://`. All components are optional,
-/// and the format accepts query parameters for all of the key-value pairs described in the section above. Multiple
-/// host/port pairs can be comma-separated. Unix socket paths in the host section of the URL should be percent-encoded,
-/// as the path component of the URL specifies the database name.
-///
-/// ## Examples
-///
-/// ```not_rust
-/// postgresql://user@localhost
-/// ```
-///
-/// ```not_rust
-/// postgresql://user:password@%2Fvar%2Flib%2Fpostgresql/mydb?connect_timeout=10
-/// ```
-///
-/// ```not_rust
-/// postgresql://user@host1:1234,host2,host3:5678?target_session_attrs=read-write
-/// ```
-///
-/// ```not_rust
-/// postgresql:///mydb?user=user&host=/var/lib/postgresql
-/// ```
 #[derive(Clone, PartialEq, Eq)]
 pub struct Config {
     pub(crate) host: Host,
     pub(crate) port: u16,
 
-    pub(crate) user: Option<String>,
     pub(crate) password: Option<Vec<u8>>,
     pub(crate) auth_keys: Option<Box<AuthKeys>>,
-    pub(crate) dbname: Option<String>,
-    pub(crate) options: Option<String>,
-    pub(crate) application_name: Option<String>,
     pub(crate) ssl_mode: SslMode,
     pub(crate) connect_timeout: Option<Duration>,
-    pub(crate) target_session_attrs: TargetSessionAttrs,
     pub(crate) channel_binding: ChannelBinding,
-    pub(crate) replication_mode: Option<ReplicationMode>,
-    pub(crate) max_backend_message_size: Option<usize>,
+    pub(crate) server_params: StartupMessageParams,
+
+    database: bool,
+    username: bool,
 }
 
 impl Config {
@@ -169,18 +86,15 @@ impl Config {
         Config {
             host: Host::Tcp(host),
             port,
-            user: None,
             password: None,
             auth_keys: None,
-            dbname: None,
-            options: None,
-            application_name: None,
             ssl_mode: SslMode::Prefer,
             connect_timeout: None,
-            target_session_attrs: TargetSessionAttrs::Any,
             channel_binding: ChannelBinding::Prefer,
-            replication_mode: None,
-            max_backend_message_size: None,
+            server_params: StartupMessageParams::default(),
+
+            database: false,
+            username: false,
         }
     }
 
@@ -188,14 +102,13 @@ impl Config {
     ///
     /// Required.
     pub fn user(&mut self, user: &str) -> &mut Config {
-        self.user = Some(user.to_string());
-        self
+        self.set_param("user", user)
     }
 
     /// Gets the user to authenticate with, if one has been configured with
     /// the `user` method.
-    pub fn get_user(&self) -> Option<&str> {
-        self.user.as_deref()
+    pub fn user_is_set(&self) -> bool {
+        self.username
     }
 
     /// Sets the password to authenticate with.
@@ -231,40 +144,26 @@ impl Config {
     ///
     /// Defaults to the user.
     pub fn dbname(&mut self, dbname: &str) -> &mut Config {
-        self.dbname = Some(dbname.to_string());
-        self
+        self.set_param("database", dbname)
     }
 
     /// Gets the name of the database to connect to, if one has been configured
     /// with the `dbname` method.
-    pub fn get_dbname(&self) -> Option<&str> {
-        self.dbname.as_deref()
+    pub fn db_is_set(&self) -> bool {
+        self.database
     }
 
-    /// Sets command line options used to configure the server.
-    pub fn options(&mut self, options: &str) -> &mut Config {
-        self.options = Some(options.to_string());
+    pub fn set_param(&mut self, name: &str, value: &str) -> &mut Config {
+        if name == "database" {
+            self.database = true;
+        } else if name == "user" {
+            self.username = true;
+        }
+
+        self.server_params.insert(name, value);
         self
     }
 
-    /// Gets the command line options used to configure the server, if the
-    /// options have been set with the `options` method.
-    pub fn get_options(&self) -> Option<&str> {
-        self.options.as_deref()
-    }
-
-    /// Sets the value of the `application_name` runtime parameter.
-    pub fn application_name(&mut self, application_name: &str) -> &mut Config {
-        self.application_name = Some(application_name.to_string());
-        self
-    }
-
-    /// Gets the value of the `application_name` runtime parameter, if it has
-    /// been set with the `application_name` method.
-    pub fn get_application_name(&self) -> Option<&str> {
-        self.application_name.as_deref()
-    }
-
     /// Sets the SSL configuration.
     ///
     /// Defaults to `prefer`.
@@ -303,23 +202,6 @@ impl Config {
         self.connect_timeout.as_ref()
     }
 
-    /// Sets the requirements of the session.
-    ///
-    /// This can be used to connect to the primary server in a clustered database rather than one of the read-only
-    /// secondary servers. Defaults to `Any`.
-    pub fn target_session_attrs(
-        &mut self,
-        target_session_attrs: TargetSessionAttrs,
-    ) -> &mut Config {
-        self.target_session_attrs = target_session_attrs;
-        self
-    }
-
-    /// Gets the requirements of the session.
-    pub fn get_target_session_attrs(&self) -> TargetSessionAttrs {
-        self.target_session_attrs
-    }
-
     /// Sets the channel binding behavior.
     ///
     /// Defaults to `prefer`.
@@ -333,28 +215,6 @@ impl Config {
         self.channel_binding
     }
 
-    /// Set replication mode.
-    pub fn replication_mode(&mut self, replication_mode: ReplicationMode) -> &mut Config {
-        self.replication_mode = Some(replication_mode);
-        self
-    }
-
-    /// Get replication mode.
-    pub fn get_replication_mode(&self) -> Option<ReplicationMode> {
-        self.replication_mode
-    }
-
-    /// Set limit for backend messages size.
-    pub fn max_backend_message_size(&mut self, max_backend_message_size: usize) -> &mut Config {
-        self.max_backend_message_size = Some(max_backend_message_size);
-        self
-    }
-
-    /// Get limit for backend messages size.
-    pub fn get_max_backend_message_size(&self) -> Option<usize> {
-        self.max_backend_message_size
-    }
-
     /// Opens a connection to a PostgreSQL database.
     ///
     /// Requires the `runtime` Cargo feature (enabled by default).
@@ -392,18 +252,13 @@ impl fmt::Debug for Config {
         }
 
         f.debug_struct("Config")
-            .field("user", &self.user)
             .field("password", &self.password.as_ref().map(|_| Redaction {}))
-            .field("dbname", &self.dbname)
-            .field("options", &self.options)
-            .field("application_name", &self.application_name)
             .field("ssl_mode", &self.ssl_mode)
             .field("host", &self.host)
             .field("port", &self.port)
             .field("connect_timeout", &self.connect_timeout)
-            .field("target_session_attrs", &self.target_session_attrs)
             .field("channel_binding", &self.channel_binding)
-            .field("replication", &self.replication_mode)
+            .field("server_params", &self.server_params)
             .finish()
     }
 }
diff --git a/libs/proxy/tokio-postgres2/src/connect.rs b/libs/proxy/tokio-postgres2/src/connect.rs
index 75a58e6eac..e0cb69748d 100644
--- a/libs/proxy/tokio-postgres2/src/connect.rs
+++ b/libs/proxy/tokio-postgres2/src/connect.rs
@@ -1,14 +1,11 @@
 use crate::client::SocketConfig;
 use crate::codec::BackendMessage;
-use crate::config::{Host, TargetSessionAttrs};
+use crate::config::Host;
 use crate::connect_raw::connect_raw;
 use crate::connect_socket::connect_socket;
 use crate::tls::{MakeTlsConnect, TlsConnect};
-use crate::{Client, Config, Connection, Error, RawConnection, SimpleQueryMessage};
-use futures_util::{future, pin_mut, Future, FutureExt, Stream};
+use crate::{Client, Config, Connection, Error, RawConnection};
 use postgres_protocol2::message::backend::Message;
-use std::io;
-use std::task::Poll;
 use tokio::net::TcpStream;
 use tokio::sync::mpsc;
 
@@ -72,47 +69,7 @@ where
         .map(|m| BackendMessage::Async(Message::NoticeResponse(m)))
         .collect();
 
-    let mut connection = Connection::new(stream, delayed, parameters, receiver);
-
-    if let TargetSessionAttrs::ReadWrite = config.target_session_attrs {
-        let rows = client.simple_query_raw("SHOW transaction_read_only");
-        pin_mut!(rows);
-
-        let rows = future::poll_fn(|cx| {
-            if connection.poll_unpin(cx)?.is_ready() {
-                return Poll::Ready(Err(Error::closed()));
-            }
-
-            rows.as_mut().poll(cx)
-        })
-        .await?;
-        pin_mut!(rows);
-
-        loop {
-            let next = future::poll_fn(|cx| {
-                if connection.poll_unpin(cx)?.is_ready() {
-                    return Poll::Ready(Some(Err(Error::closed())));
-                }
-
-                rows.as_mut().poll_next(cx)
-            });
-
-            match next.await.transpose()? {
-                Some(SimpleQueryMessage::Row(row)) => {
-                    if row.try_get(0)? == Some("on") {
-                        return Err(Error::connect(io::Error::new(
-                            io::ErrorKind::PermissionDenied,
-                            "database does not allow writes",
-                        )));
-                    } else {
-                        break;
-                    }
-                }
-                Some(_) => {}
-                None => return Err(Error::unexpected_message()),
-            }
-        }
-    }
+    let connection = Connection::new(stream, delayed, parameters, receiver);
 
     Ok((client, connection))
 }
diff --git a/libs/proxy/tokio-postgres2/src/connect_raw.rs b/libs/proxy/tokio-postgres2/src/connect_raw.rs
index 390f133002..66db85e07d 100644
--- a/libs/proxy/tokio-postgres2/src/connect_raw.rs
+++ b/libs/proxy/tokio-postgres2/src/connect_raw.rs
@@ -1,5 +1,5 @@
 use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec};
-use crate::config::{self, AuthKeys, Config, ReplicationMode};
+use crate::config::{self, AuthKeys, Config};
 use crate::connect_tls::connect_tls;
 use crate::maybe_tls_stream::MaybeTlsStream;
 use crate::tls::{TlsConnect, TlsStream};
@@ -96,12 +96,7 @@ where
     let stream = connect_tls(stream, config.ssl_mode, tls).await?;
 
     let mut stream = StartupStream {
-        inner: Framed::new(
-            stream,
-            PostgresCodec {
-                max_message_size: config.max_backend_message_size,
-            },
-        ),
+        inner: Framed::new(stream, PostgresCodec),
         buf: BackendMessages::empty(),
         delayed_notice: Vec::new(),
     };
@@ -124,28 +119,8 @@ where
     S: AsyncRead + AsyncWrite + Unpin,
     T: AsyncRead + AsyncWrite + Unpin,
 {
-    let mut params = vec![("client_encoding", "UTF8")];
-    if let Some(user) = &config.user {
-        params.push(("user", &**user));
-    }
-    if let Some(dbname) = &config.dbname {
-        params.push(("database", &**dbname));
-    }
-    if let Some(options) = &config.options {
-        params.push(("options", &**options));
-    }
-    if let Some(application_name) = &config.application_name {
-        params.push(("application_name", &**application_name));
-    }
-    if let Some(replication_mode) = &config.replication_mode {
-        match replication_mode {
-            ReplicationMode::Physical => params.push(("replication", "true")),
-            ReplicationMode::Logical => params.push(("replication", "database")),
-        }
-    }
-
     let mut buf = BytesMut::new();
-    frontend::startup_message(params, &mut buf).map_err(Error::encode)?;
+    frontend::startup_message(&config.server_params, &mut buf).map_err(Error::encode)?;
 
     stream
         .send(FrontendMessage::Raw(buf.freeze()))
diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs
index bcb0ef40bd..7bc5587a25 100644
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -70,11 +70,12 @@ impl ReportableError for CancelError {
 impl<P: CancellationPublisher> CancellationHandler<P> {
     /// Run async action within an ephemeral session identified by [`CancelKeyData`].
     pub(crate) fn get_session(self: Arc<Self>) -> Session<P> {
-        // HACK: We'd rather get the real backend_pid but postgres_client doesn't
-        // expose it and we don't want to do another roundtrip to query
-        // for it. The client will be able to notice that this is not the
-        // actual backend_pid, but backend_pid is not used for anything
-        // so it doesn't matter.
+        // we intentionally generate a random "backend pid" and "secret key" here.
+        // we use the corresponding u64 as an identifier for the
+        // actual endpoint+pid+secret for postgres/pgbouncer.
+        //
+        // if we forwarded the backend_pid from postgres to the client, there would be a lot
+        // of overlap between our computes as most pids are small (~100).
         let key = loop {
             let key = rand::random();
 
diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index ab0ff4b795..4113b5bb80 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -131,49 +131,37 @@ impl ConnCfg {
     }
 
     /// Apply startup message params to the connection config.
-    pub(crate) fn set_startup_params(&mut self, params: &StartupMessageParams) {
-        // Only set `user` if it's not present in the config.
-        // Console redirect auth flow takes username from the console's response.
-        if let (None, Some(user)) = (self.get_user(), params.get("user")) {
-            self.user(user);
+    pub(crate) fn set_startup_params(
+        &mut self,
+        params: &StartupMessageParams,
+        arbitrary_params: bool,
+    ) {
+        if !arbitrary_params {
+            self.set_param("client_encoding", "UTF8");
         }
-
-        // Only set `dbname` if it's not present in the config.
-        // Console redirect auth flow takes dbname from the console's response.
-        if let (None, Some(dbname)) = (self.get_dbname(), params.get("database")) {
-            self.dbname(dbname);
-        }
-
-        // Don't add `options` if they were only used for specifying a project.
-        // Connection pools don't support `options`, because they affect backend startup.
-        if let Some(options) = filtered_options(params) {
-            self.options(&options);
-        }
-
-        if let Some(app_name) = params.get("application_name") {
-            self.application_name(app_name);
-        }
-
-        // TODO: This is especially ugly...
-        if let Some(replication) = params.get("replication") {
-            use postgres_client::config::ReplicationMode;
-            match replication {
-                "true" | "on" | "yes" | "1" => {
-                    self.replication_mode(ReplicationMode::Physical);
+        for (k, v) in params.iter() {
+            match k {
+                // Only set `user` if it's not present in the config.
+                // Console redirect auth flow takes username from the console's response.
+                "user" if self.user_is_set() => continue,
+                "database" if self.db_is_set() => continue,
+                "options" => {
+                    if let Some(options) = filtered_options(v) {
+                        self.set_param(k, &options);
+                    }
                 }
-                "database" => {
-                    self.replication_mode(ReplicationMode::Logical);
+                "user" | "database" | "application_name" | "replication" => {
+                    self.set_param(k, v);
                 }
-                _other => {}
+
+                // if we allow arbitrary params, then we forward them through.
+                // this is a flag for a period of backwards compatibility
+                k if arbitrary_params => {
+                    self.set_param(k, v);
+                }
+                _ => {}
             }
         }
-
-        // TODO: extend the list of the forwarded startup parameters.
-        // Currently, tokio-postgres doesn't allow us to pass
-        // arbitrary parameters, but the ones above are a good start.
-        //
-        // This and the reverse params problem can be better addressed
-        // in a bespoke connection machinery (a new library for that sake).
     }
 }
 
@@ -347,10 +335,9 @@ impl ConnCfg {
 }
 
 /// Retrieve `options` from a startup message, dropping all proxy-secific flags.
-fn filtered_options(params: &StartupMessageParams) -> Option<String> {
+fn filtered_options(options: &str) -> Option<String> {
     #[allow(unstable_name_collisions)]
-    let options: String = params
-        .options_raw()?
+    let options: String = StartupMessageParams::parse_options_raw(options)
         .filter(|opt| parse_endpoint_param(opt).is_none() && neon_option(opt).is_none())
         .intersperse(" ") // TODO: use impl from std once it's stabilized
         .collect();
@@ -427,27 +414,24 @@ mod tests {
     #[test]
     fn test_filtered_options() {
         // Empty options is unlikely to be useful anyway.
-        let params = StartupMessageParams::new([("options", "")]);
-        assert_eq!(filtered_options(&params), None);
+        let params = "";
+        assert_eq!(filtered_options(params), None);
 
         // It's likely that clients will only use options to specify endpoint/project.
-        let params = StartupMessageParams::new([("options", "project=foo")]);
-        assert_eq!(filtered_options(&params), None);
+        let params = "project=foo";
+        assert_eq!(filtered_options(params), None);
 
         // Same, because unescaped whitespaces are no-op.
-        let params = StartupMessageParams::new([("options", " project=foo ")]);
-        assert_eq!(filtered_options(&params).as_deref(), None);
+        let params = " project=foo ";
+        assert_eq!(filtered_options(params).as_deref(), None);
 
-        let params = StartupMessageParams::new([("options", r"\  project=foo \ ")]);
-        assert_eq!(filtered_options(&params).as_deref(), Some(r"\  \ "));
+        let params = r"\  project=foo \ ";
+        assert_eq!(filtered_options(params).as_deref(), Some(r"\  \ "));
 
-        let params = StartupMessageParams::new([("options", "project = foo")]);
-        assert_eq!(filtered_options(&params).as_deref(), Some("project = foo"));
+        let params = "project = foo";
+        assert_eq!(filtered_options(params).as_deref(), Some("project = foo"));
 
-        let params = StartupMessageParams::new([(
-            "options",
-            "project = foo neon_endpoint_type:read_write   neon_lsn:0/2",
-        )]);
-        assert_eq!(filtered_options(&params).as_deref(), Some("project = foo"));
+        let params = "project = foo neon_endpoint_type:read_write   neon_lsn:0/2 neon_proxy_params_compat:true";
+        assert_eq!(filtered_options(params).as_deref(), Some("project = foo"));
     }
 }
diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs
index 8f78df1964..7db1179eea 100644
--- a/proxy/src/console_redirect_proxy.rs
+++ b/proxy/src/console_redirect_proxy.rs
@@ -206,6 +206,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     let mut node = connect_to_compute(
         ctx,
         &TcpMechanism {
+            params_compat: true,
             params: &params,
             locks: &config.connect_compute_locks,
         },
diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs
index 585dce7bae..a3027abd7c 100644
--- a/proxy/src/proxy/connect_compute.rs
+++ b/proxy/src/proxy/connect_compute.rs
@@ -66,6 +66,8 @@ pub(crate) trait ComputeConnectBackend {
 }
 
 pub(crate) struct TcpMechanism<'a> {
+    pub(crate) params_compat: bool,
+
     /// KV-dictionary with PostgreSQL connection params.
     pub(crate) params: &'a StartupMessageParams,
 
@@ -92,7 +94,7 @@ impl ConnectMechanism for TcpMechanism<'_> {
     }
 
     fn update_connect_config(&self, config: &mut compute::ConnCfg) {
-        config.set_startup_params(self.params);
+        config.set_startup_params(self.params, self.params_compat);
     }
 }
 
diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs
index af97fb3d71..f74eb5940f 100644
--- a/proxy/src/proxy/mod.rs
+++ b/proxy/src/proxy/mod.rs
@@ -338,9 +338,17 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
         }
     };
 
+    let params_compat = match &user_info {
+        auth::Backend::ControlPlane(_, info) => {
+            info.info.options.get(NeonOptions::PARAMS_COMPAT).is_some()
+        }
+        auth::Backend::Local(_) => false,
+    };
+
     let mut node = connect_to_compute(
         ctx,
         &TcpMechanism {
+            params_compat,
             params: &params,
             locks: &config.connect_compute_locks,
         },
@@ -409,19 +417,47 @@ pub(crate) async fn prepare_client_connection<P>(
 pub(crate) struct NeonOptions(Vec<(SmolStr, SmolStr)>);
 
 impl NeonOptions {
+    // proxy options:
+
+    /// `PARAMS_COMPAT` allows opting in to forwarding all startup parameters from client to compute.
+    const PARAMS_COMPAT: &str = "proxy_params_compat";
+
+    // cplane options:
+
+    /// `LSN` allows provisioning an ephemeral compute with time-travel to the provided LSN.
+    const LSN: &str = "lsn";
+
+    /// `ENDPOINT_TYPE` allows configuring an ephemeral compute to be read_only or read_write.
+    const ENDPOINT_TYPE: &str = "endpoint_type";
+
     pub(crate) fn parse_params(params: &StartupMessageParams) -> Self {
         params
             .options_raw()
             .map(Self::parse_from_iter)
             .unwrap_or_default()
     }
+
     pub(crate) fn parse_options_raw(options: &str) -> Self {
         Self::parse_from_iter(StartupMessageParams::parse_options_raw(options))
     }
 
+    pub(crate) fn get(&self, key: &str) -> Option<SmolStr> {
+        self.0
+            .iter()
+            .find_map(|(k, v)| (k == key).then_some(v))
+            .cloned()
+    }
+
     pub(crate) fn is_ephemeral(&self) -> bool {
-        // Currently, neon endpoint options are all reserved for ephemeral endpoints.
-        !self.0.is_empty()
+        self.0.iter().any(|(k, _)| match &**k {
+            // This is not a cplane option, we know it does not create ephemeral computes.
+            Self::PARAMS_COMPAT => false,
+            Self::LSN => true,
+            Self::ENDPOINT_TYPE => true,
+            // err on the side of caution. any cplane options we don't know about
+            // might lead to ephemeral computes.
+            _ => true,
+        })
     }
 
     fn parse_from_iter<'a>(options: impl Iterator<Item = &'a str>) -> Self {
diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs
index d72331c7bf..59c9ac27b8 100644
--- a/proxy/src/proxy/tests/mitm.rs
+++ b/proxy/src/proxy/tests/mitm.rs
@@ -55,7 +55,13 @@ async fn proxy_mitm(
 
         // give the end_server the startup parameters
         let mut buf = BytesMut::new();
-        frontend::startup_message(startup.iter(), &mut buf).unwrap();
+        frontend::startup_message(
+            &postgres_protocol::message::frontend::StartupMessageParams {
+                params: startup.params.into(),
+            },
+            &mut buf,
+        )
+        .unwrap();
         end_server.send(buf.freeze()).await.unwrap();
 
         // proxy messages between end_client and end_server
diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs
index 53345431e3..911b349416 100644
--- a/proxy/src/proxy/tests/mod.rs
+++ b/proxy/src/proxy/tests/mod.rs
@@ -252,7 +252,7 @@ async fn handshake_raw() -> anyhow::Result<()> {
     let _conn = postgres_client::Config::new("test".to_owned(), 5432)
         .user("john_doe")
         .dbname("earth")
-        .options("project=generic-project-name")
+        .set_param("options", "project=generic-project-name")
         .ssl_mode(SslMode::Prefer)
         .connect_raw(server, NoTls)
         .await?;
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index 55d2e47fd3..251aa47084 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -309,10 +309,13 @@ impl PoolingBackend {
             .config
             .user(&conn_info.user_info.user)
             .dbname(&conn_info.dbname)
-            .options(&format!(
-                "-c pg_session_jwt.jwk={}",
-                serde_json::to_string(&jwk).expect("serializing jwk to json should not fail")
-            ));
+            .set_param(
+                "options",
+                &format!(
+                    "-c pg_session_jwt.jwk={}",
+                    serde_json::to_string(&jwk).expect("serializing jwk to json should not fail")
+                ),
+            );
 
         let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
         let (client, connection) = config.connect(postgres_client::NoTls).await?;
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 9c579373e8..60c4a23936 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -269,7 +269,7 @@ class PgProtocol:
             for match in re.finditer(r"-c(\w*)=(\w*)", options):
                 key = match.group(1)
                 val = match.group(2)
-                if "server_options" in conn_options:
+                if "server_settings" in conn_options:
                     conn_options["server_settings"].update({key: val})
                 else:
                     conn_options["server_settings"] = {key: val}
diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py
index 5a01d90d85..d8df2efc78 100644
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -5,6 +5,7 @@ import json
 import subprocess
 import time
 import urllib.parse
+from contextlib import closing
 from typing import TYPE_CHECKING
 
 import psycopg2
@@ -131,6 +132,24 @@ def test_proxy_options(static_proxy: NeonProxy, option_name: str):
     assert out[0][0] == " str"
 
 
+@pytest.mark.asyncio
+async def test_proxy_arbitrary_params(static_proxy: NeonProxy):
+    with closing(
+        await static_proxy.connect_async(server_settings={"IntervalStyle": "iso_8601"})
+    ) as conn:
+        out = await conn.fetchval("select to_json('0 seconds'::interval)")
+        assert out == '"00:00:00"'
+
+    options = "neon_proxy_params_compat:true"
+    with closing(
+        await static_proxy.connect_async(
+            server_settings={"IntervalStyle": "iso_8601", "options": options}
+        )
+    ) as conn:
+        out = await conn.fetchval("select to_json('0 seconds'::interval)")
+        assert out == '"PT0S"'
+
+
 def test_auth_errors(static_proxy: NeonProxy):
     """
     Check that we throw very specific errors in some unsuccessful auth scenarios.

From 9a4157dadbf463ce479d68f3663824b4400d7f9a Mon Sep 17 00:00:00 2001
From: Alexey Kondratov <kondratov.aleksey@gmail.com>
Date: Wed, 4 Dec 2024 14:05:31 +0100
Subject: [PATCH 057/117] feat(compute): Set default application_name for
 pgbouncer connections (#9973)

## Problem

When client specifies `application_name`, pgbouncer propagates it to the
Postgres. Yet, if client doesn't do it, we have hard time figuring out
who opens a lot of Postgres connections (including the `cloud_admin`
ones).

See this investigation as an example:
https://neondb.slack.com/archives/C0836R0RZ0D

## Summary of changes

I haven't found this documented, but it looks like pgbouncer accepts
standard Postgres connstring parameters in the connstring in the
`[databases]` section, so put the default `application_name=pgbouncer`
there. That way, we will always see who opens Postgres connections. I
did tests, and if client specifies a `application_name`, pgbouncer
overrides this default, so it only works if it's not specified or set to
blank `&application_name=` in the connection string.

This is the last place we could potentially open some Postgres
connections without `application_name`. Everything else should be either
of two:
1. Direct client connections without `application_name`, but these
should be strictly non-`cloud_admin` ones
2. Some ad-hoc internal connections, so if we see spikes of unidentified
`cloud_admin` connections, we will need to investigate it again.

Fixes neondatabase/cloud#20948
---
 compute/etc/pgbouncer.ini | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/compute/etc/pgbouncer.ini b/compute/etc/pgbouncer.ini
index cb994f961c..abcd165636 100644
--- a/compute/etc/pgbouncer.ini
+++ b/compute/etc/pgbouncer.ini
@@ -1,5 +1,9 @@
 [databases]
-*=host=localhost port=5432 auth_user=cloud_admin
+;; pgbouncer propagates application_name (if it's specified) to the server, but some
+;; clients don't set it. We set default application_name=pgbouncer to make it
+;; easier to identify pgbouncer connections in Postgres. If client sets
+;; application_name, it will be used instead.
+*=host=localhost port=5432 auth_user=cloud_admin application_name=pgbouncer
 [pgbouncer]
 listen_port=6432
 listen_addr=0.0.0.0

From 699a213c5d8684d4a78bf78af47a790c00921384 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Wed, 4 Dec 2024 14:05:53 +0100
Subject: [PATCH 058/117] Display reqwest error source (#10004)

## Problem

Reqwest errors don't include details about the inner source error. This
means that we get opaque errors like:

```
receive body: error sending request for url (http://localhost:9898/v1/location_config)
```

Instead of the more helpful:

```
receive body: error sending request for url (http://localhost:9898/v1/location_config): operation timed out
```

Touches #9801.

## Summary of changes

Include the source error for `reqwest::Error` wherever it's displayed.
---
 control_plane/src/safekeeper.rs              |  3 ++-
 pageserver/client/src/mgmt_api.rs            |  6 +++---
 pageserver/src/consumption_metrics/upload.rs |  7 ++++++-
 safekeeper/src/http/client.rs                |  3 ++-
 storage_controller/src/compute_hook.rs       |  3 ++-
 storage_controller/src/peer_client.rs        | 11 ++++++++---
 storage_scrubber/src/cloud_admin_api.rs      | 14 ++++++++++----
 7 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs
index 7a019bce88..f0c3722925 100644
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -5,6 +5,7 @@
 //! ```text
 //!   .neon/safekeepers/<safekeeper id>
 //! ```
+use std::error::Error as _;
 use std::future::Future;
 use std::io::Write;
 use std::path::PathBuf;
@@ -26,7 +27,7 @@ use crate::{
 
 #[derive(Error, Debug)]
 pub enum SafekeeperHttpError {
-    #[error("Reqwest error: {0}")]
+    #[error("request error: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())]
     Transport(#[from] reqwest::Error),
 
     #[error("Error: {0}")]
diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index 4d76c66905..c3a1ef8140 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -1,4 +1,4 @@
-use std::collections::HashMap;
+use std::{collections::HashMap, error::Error as _};
 
 use bytes::Bytes;
 use detach_ancestor::AncestorDetached;
@@ -25,10 +25,10 @@ pub struct Client {
 
 #[derive(thiserror::Error, Debug)]
 pub enum Error {
-    #[error("send request: {0}")]
+    #[error("send request: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())]
     SendRequest(reqwest::Error),
 
-    #[error("receive body: {0}")]
+    #[error("receive body: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())]
     ReceiveBody(reqwest::Error),
 
     #[error("receive error body: {0}")]
diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs
index 1cb4e917c0..448bf47525 100644
--- a/pageserver/src/consumption_metrics/upload.rs
+++ b/pageserver/src/consumption_metrics/upload.rs
@@ -1,3 +1,4 @@
+use std::error::Error as _;
 use std::time::SystemTime;
 
 use chrono::{DateTime, Utc};
@@ -350,7 +351,11 @@ impl std::fmt::Display for UploadError {
 
         match self {
             Rejected(code) => write!(f, "server rejected the metrics with {code}"),
-            Reqwest(e) => write!(f, "request failed: {e}"),
+            Reqwest(e) => write!(
+                f,
+                "request failed: {e}{}",
+                e.source().map(|e| format!(": {e}")).unwrap_or_default()
+            ),
             Cancelled => write!(f, "cancelled"),
         }
     }
diff --git a/safekeeper/src/http/client.rs b/safekeeper/src/http/client.rs
index c56f7880d4..a166fc1ab9 100644
--- a/safekeeper/src/http/client.rs
+++ b/safekeeper/src/http/client.rs
@@ -8,6 +8,7 @@
 //! etc.
 
 use reqwest::{IntoUrl, Method, StatusCode};
+use std::error::Error as _;
 use utils::{
     http::error::HttpErrorBody,
     id::{NodeId, TenantId, TimelineId},
@@ -26,7 +27,7 @@ pub struct Client {
 #[derive(thiserror::Error, Debug)]
 pub enum Error {
     /// Failed to receive body (reqwest error).
-    #[error("receive body: {0}")]
+    #[error("receive body: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())]
     ReceiveBody(reqwest::Error),
 
     /// Status is not ok, but failed to parse body as `HttpErrorBody`.
diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs
index b63a322b87..2b2ece3f02 100644
--- a/storage_controller/src/compute_hook.rs
+++ b/storage_controller/src/compute_hook.rs
@@ -1,3 +1,4 @@
+use std::error::Error as _;
 use std::sync::Arc;
 use std::{collections::HashMap, time::Duration};
 
@@ -172,7 +173,7 @@ struct ComputeHookNotifyRequest {
 #[derive(thiserror::Error, Debug)]
 pub(crate) enum NotifyError {
     // Request was not send successfully, e.g. transport error
-    #[error("Sending request: {0}")]
+    #[error("Sending request: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())]
     Request(#[from] reqwest::Error),
     // Request could not be serviced right now due to ongoing Operation in control plane, but should be possible soon.
     #[error("Control plane tenant busy")]
diff --git a/storage_controller/src/peer_client.rs b/storage_controller/src/peer_client.rs
index 3f8520fe55..ee4eb55294 100644
--- a/storage_controller/src/peer_client.rs
+++ b/storage_controller/src/peer_client.rs
@@ -1,7 +1,9 @@
 use crate::tenant_shard::ObservedState;
 use pageserver_api::shard::TenantShardId;
 use serde::{Deserialize, Serialize};
-use std::{collections::HashMap, time::Duration};
+use std::collections::HashMap;
+use std::error::Error as _;
+use std::time::Duration;
 use tokio_util::sync::CancellationToken;
 
 use hyper::Uri;
@@ -17,11 +19,14 @@ pub(crate) struct PeerClient {
 
 #[derive(thiserror::Error, Debug)]
 pub(crate) enum StorageControllerPeerError {
-    #[error("failed to deserialize error response with status code {0} at {1}: {2}")]
+    #[error(
+        "failed to deserialize error response with status code {0} at {1}: {2}{}",
+        .2.source().map(|e| format!(": {e}")).unwrap_or_default()
+    )]
     DeserializationError(StatusCode, Url, reqwest::Error),
     #[error("storage controller peer API error ({0}): {1}")]
     ApiError(StatusCode, String),
-    #[error("failed to send HTTP request: {0}")]
+    #[error("failed to send HTTP request: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())]
     SendError(reqwest::Error),
     #[error("Cancelled")]
     Cancelled,
diff --git a/storage_scrubber/src/cloud_admin_api.rs b/storage_scrubber/src/cloud_admin_api.rs
index c9a62cd256..b1dfe3a53f 100644
--- a/storage_scrubber/src/cloud_admin_api.rs
+++ b/storage_scrubber/src/cloud_admin_api.rs
@@ -1,3 +1,5 @@
+use std::error::Error as _;
+
 use chrono::{DateTime, Utc};
 use futures::Future;
 use hex::FromHex;
@@ -30,14 +32,18 @@ impl std::fmt::Display for Error {
         match &self.kind {
             ErrorKind::RequestSend(e) => write!(
                 f,
-                "Failed to send a request. Context: {}, error: {}",
-                self.context, e
+                "Failed to send a request. Context: {}, error: {}{}",
+                self.context,
+                e,
+                e.source().map(|e| format!(": {e}")).unwrap_or_default()
             ),
             ErrorKind::BodyRead(e) => {
                 write!(
                     f,
-                    "Failed to read a request body. Context: {}, error: {}",
-                    self.context, e
+                    "Failed to read a request body. Context: {}, error: {}{}",
+                    self.context,
+                    e,
+                    e.source().map(|e| format!(": {e}")).unwrap_or_default()
                 )
             }
             ErrorKind::ResponseStatus(status) => {

From dec2e2fb2997225be9f99687a8829a1e9c473313 Mon Sep 17 00:00:00 2001
From: a-masterov <72613290+a-masterov@users.noreply.github.com>
Date: Wed, 4 Dec 2024 14:10:00 +0100
Subject: [PATCH 059/117] Create a branch for compute release (#9637)

## Problem
We practice a manual release flow for the compute module. This will
allow automation of the compute release process.

## Summary of changes
The workflow was modified to make a compute release automatically on the
branch release-compute.
## Checklist before requesting a review

- [x] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist
---
 .../actions/allure-report-generate/action.yml |  3 ++-
 .../actions/allure-report-store/action.yml    |  3 ++-
 .github/workflows/_create-release-pr.yml      |  2 +-
 .github/workflows/build_and_test.yml          | 23 +++++++++++--------
 .github/workflows/release.yml                 | 23 ++++++++++++++++---
 .github/workflows/trigger-e2e-tests.yml       |  2 ++
 6 files changed, 41 insertions(+), 15 deletions(-)

diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml
index d1d09223db..d6219c31b4 100644
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -43,7 +43,8 @@ runs:
         PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
         if [ "${PR_NUMBER}" != "null" ]; then
           BRANCH_OR_PR=pr-${PR_NUMBER}
-        elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then
+        elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || \
+             [ "${GITHUB_REF_NAME}" = "release-proxy" ] || [ "${GITHUB_REF_NAME}" = "release-compute" ]; then
           # Shortcut for special branches
           BRANCH_OR_PR=${GITHUB_REF_NAME}
         else
diff --git a/.github/actions/allure-report-store/action.yml b/.github/actions/allure-report-store/action.yml
index 9c376f420a..3c83656c89 100644
--- a/.github/actions/allure-report-store/action.yml
+++ b/.github/actions/allure-report-store/action.yml
@@ -23,7 +23,8 @@ runs:
         PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
         if [ "${PR_NUMBER}" != "null" ]; then
           BRANCH_OR_PR=pr-${PR_NUMBER}
-        elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then
+        elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || \
+             [ "${GITHUB_REF_NAME}" = "release-proxy" ] || [ "${GITHUB_REF_NAME}" = "release-compute" ]; then
           # Shortcut for special branches
           BRANCH_OR_PR=${GITHUB_REF_NAME}
         else
diff --git a/.github/workflows/_create-release-pr.yml b/.github/workflows/_create-release-pr.yml
index cc6994397f..3c130c8229 100644
--- a/.github/workflows/_create-release-pr.yml
+++ b/.github/workflows/_create-release-pr.yml
@@ -21,7 +21,7 @@ defaults:
     shell: bash -euo pipefail {0}
 
 jobs:
-  create-storage-release-branch:
+  create-release-branch:
     runs-on: ubuntu-22.04
 
     permissions:
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index e9e111e7bd..cb966f292e 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -6,6 +6,7 @@ on:
       - main
       - release
       - release-proxy
+      - release-compute
   pull_request:
 
 defaults:
@@ -70,8 +71,10 @@ jobs:
             echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
           elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
             echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
+          elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then
+            echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
           else
-            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
+            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release', 'release-proxy', 'release-compute'"
             echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT
           fi
         shell: bash
@@ -513,7 +516,7 @@ jobs:
             })
 
   trigger-e2e-tests:
-    if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' }}
+    if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' }}
     needs: [ check-permissions, promote-images, tag ]
     uses: ./.github/workflows/trigger-e2e-tests.yml
     secrets: inherit
@@ -934,7 +937,7 @@ jobs:
                                               neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}
 
       - name: Configure AWS-prod credentials
-        if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
+        if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
         uses: aws-actions/configure-aws-credentials@v4
         with:
           aws-region: eu-central-1
@@ -943,12 +946,12 @@ jobs:
 
       - name: Login to prod ECR
         uses: docker/login-action@v3
-        if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
+        if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
         with:
           registry: 093970136003.dkr.ecr.eu-central-1.amazonaws.com
 
       - name: Copy all images to prod ECR
-        if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
+        if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
         run: |
           for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16,v17}; do
             docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \
@@ -968,7 +971,7 @@ jobs:
       tenant_id: ${{ vars.AZURE_TENANT_ID }}
 
   push-to-acr-prod:
-    if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
+    if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
     needs: [ tag, promote-images ]
     uses: ./.github/workflows/_push-to-acr.yml
     with:
@@ -1056,7 +1059,7 @@ jobs:
   deploy:
     needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ]
     # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod`
-    if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy') && !failure() && !cancelled()
+    if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute') && !failure() && !cancelled()
 
     runs-on: [ self-hosted, small ]
     container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
@@ -1105,13 +1108,15 @@ jobs:
               -f deployProxyAuthBroker=true \
               -f branch=main \
               -f dockerTag=${{needs.tag.outputs.build-tag}}
+          elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then
+            gh workflow --repo neondatabase/infra run deploy-compute-dev.yml --ref main -f dockerTag=${{needs.tag.outputs.build-tag}}
           else
-            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
+            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main', 'release', 'release-proxy' or 'release-compute'"
             exit 1
           fi
 
       - name: Create git tag
-        if: github.ref_name == 'release' || github.ref_name == 'release-proxy'
+        if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
         uses: actions/github-script@v7
         with:
           # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 11f010b6d4..f0273b977f 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -15,6 +15,10 @@ on:
         type: boolean
         description: 'Create Proxy release PR'
         required: false
+      create-compute-release-branch:
+        type: boolean
+        description: 'Create Compute release PR'
+        required: false
 
 # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
 permissions: {}
@@ -25,20 +29,20 @@ defaults:
 
 jobs:
   create-storage-release-branch:
-    if: ${{ github.event.schedule == '0 6 * * MON' || format('{0}', inputs.create-storage-release-branch) == 'true' }}
+    if: ${{ github.event.schedule == '0 6 * * MON' || inputs.create-storage-release-branch }}
 
     permissions:
       contents: write
 
     uses: ./.github/workflows/_create-release-pr.yml
     with:
-      component-name: 'Storage & Compute'
+      component-name: 'Storage'
       release-branch: 'release'
     secrets:
       ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }}
 
   create-proxy-release-branch:
-    if: ${{ github.event.schedule == '0 6 * * THU' || format('{0}', inputs.create-proxy-release-branch) == 'true' }}
+    if: ${{ github.event.schedule == '0 6 * * THU' || inputs.create-proxy-release-branch }}
 
     permissions:
       contents: write
@@ -49,3 +53,16 @@ jobs:
       release-branch: 'release-proxy'
     secrets:
       ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }}
+
+  create-compute-release-branch:
+    if: inputs.create-compute-release-branch
+
+    permissions:
+      contents: write
+
+    uses: ./.github/workflows/_create-release-pr.yml
+    with:
+      component-name: 'Compute'
+      release-branch: 'release-compute'
+    secrets:
+      ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }}
diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml
index 1e7264c55a..70c2e8549f 100644
--- a/.github/workflows/trigger-e2e-tests.yml
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -51,6 +51,8 @@ jobs:
             echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT
           elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
             echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
+          elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then
+            echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
           else
             echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
             BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId')

From 60c0d19f57c160be46ae364e139a2063d7741522 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 4 Dec 2024 15:04:04 +0000
Subject: [PATCH 060/117] tests: make storcon scale test AZ-aware (#9952)

## Problem

We have a scale test for the storage controller which also acts as a
good stress test for scheduling stability. However, it created nodes
with no AZs set.

## Summary of changes

- Bump node count to 6 and set AZs on them.

This is a precursor to other AZ-related PRs, to make sure any new code
that's landed is getting scale tested in an AZ-aware environment.
---
 test_runner/performance/test_storage_controller_scale.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py
index 142bd3d669..49f41483ec 100644
--- a/test_runner/performance/test_storage_controller_scale.py
+++ b/test_runner/performance/test_storage_controller_scale.py
@@ -72,7 +72,7 @@ def test_storage_controller_many_tenants(
     we don't fall over for a thousand shards.
     """
 
-    neon_env_builder.num_pageservers = 5
+    neon_env_builder.num_pageservers = 6
     neon_env_builder.storage_controller_config = {
         # Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts.
         # TODO: tune this down as restarts get faster (https://github.com/neondatabase/neon/pull/7553), to
@@ -84,6 +84,11 @@ def test_storage_controller_many_tenants(
         compute_reconfigure_listener.control_plane_compute_hook_api
     )
 
+    AZS = ["alpha", "bravo", "charlie"]
+    neon_env_builder.pageserver_config_override = lambda ps_cfg: ps_cfg.update(
+        {"availability_zone": f"az-{AZS[ps_cfg['id'] % len(AZS)]}"}
+    )
+
     # A small sleep on each call into the notify hook, to simulate the latency of doing a database write
     compute_reconfigure_listener.register_on_notify(lambda body: time.sleep(0.01))
 

From e6cd5050fcf9f699275b7adb7509efac0e3cd1b5 Mon Sep 17 00:00:00 2001
From: Yuchen Liang <70461588+yliang412@users.noreply.github.com>
Date: Wed, 4 Dec 2024 11:54:56 -0500
Subject: [PATCH 061/117] pageserver: make `BufferedWriter` do double-buffering
 (#9693)

Closes #9387.

## Problem

`BufferedWriter` cannot proceed while the owned buffer is flushing to
disk. We want to implement double buffering so that the flush can happen
in the background. See #9387.

## Summary of changes

- Maintain two owned buffers in `BufferedWriter`.
- The writer is in charge of copying the data into owned, aligned
buffer, once full, submit it to the flush task.
- The flush background task is in charge of flushing the owned buffer to
disk, and returned the buffer to the writer for reuse.
- The writer and the flush background task communicate through a
bi-directional channel.

For in-memory layer, we also need to be able to read from the buffered
writer in `get_values_reconstruct_data`. To handle this case, we did the
following
- Use replace `VirtualFile::write_all` with `VirtualFile::write_all_at`,
and use `Arc` to share it between writer and background task.
- leverage `IoBufferMut::freeze` to get a cheaply clonable `IoBuffer`,
one clone will be submitted to the channel, the other clone will be
saved within the writer to serve reads. When we want to reuse the
buffer, we can invoke `IoBuffer::into_mut`, which gives us back the
mutable aligned buffer.
- InMemoryLayer reads is now aware of the maybe_flushed part of the
buffer.

**Caveat**

- We removed the owned version of write, because this interface does not
work well with buffer alignment. The result is that without direct IO
enabled,
[`download_object`](https://github.com/neondatabase/neon/blob/a439d57050dafd603d24e001215213eb5246a029/pageserver/src/tenant/remote_timeline_client/download.rs#L243)
does one more memcpy than before this PR due to the switch to use
`_borrowed` version of the write.
- "Bypass aligned part of write" could be implemented later to avoid
large amount of memcpy.

**Testing**
- use an oneshot channel based control mechanism to make flush behavior
deterministic in test.
- test reading from `EphemeralFile` when the last submitted buffer is
not flushed, in-progress, and done flushing to disk.


## Performance


We see performance improvement for small values, and regression on big
values, likely due to being CPU bound + disk write latency.


[Results](https://www.notion.so/neondatabase/Benchmarking-New-BufferedWriter-11-20-2024-143f189e0047805ba99acda89f984d51?pvs=4)


## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

---------

Signed-off-by: Yuchen Liang <yuchen@neon.tech>
Co-authored-by: Christian Schwarz <christian@neon.tech>
---
 libs/utils/src/sync.rs                        |   1 +
 libs/utils/src/sync/duplex.rs                 |   1 +
 libs/utils/src/sync/duplex/mpsc.rs            |  36 ++
 pageserver/benches/bench_ingest.rs            |   4 +-
 pageserver/src/tenant/ephemeral_file.rs       | 286 +++++++++-----
 .../src/tenant/remote_timeline_client.rs      |   2 +
 .../tenant/remote_timeline_client/download.rs |  44 ++-
 pageserver/src/tenant/secondary/downloader.rs |   1 +
 .../tenant/storage_layer/inmemory_layer.rs    |   5 +-
 pageserver/src/tenant/storage_layer/layer.rs  |   1 +
 pageserver/src/tenant/timeline.rs             |   3 +-
 .../src/tenant/timeline/layer_manager.rs      |  14 +-
 pageserver/src/virtual_file.rs                |  26 +-
 .../aligned_buffer/alignment.rs               |   4 +-
 .../owned_buffers_io/aligned_buffer/buffer.rs |  18 +-
 .../aligned_buffer/buffer_mut.rs              |  43 ++-
 .../owned_buffers_io/io_buf_aligned.rs        |  10 +-
 .../owned_buffers_io/io_buf_ext.rs            |  14 +
 .../util/size_tracking_writer.rs              |  50 ---
 .../virtual_file/owned_buffers_io/write.rs    | 358 +++++++++---------
 .../owned_buffers_io/write/flush.rs           | 314 +++++++++++++++
 21 files changed, 846 insertions(+), 389 deletions(-)
 create mode 100644 libs/utils/src/sync/duplex.rs
 create mode 100644 libs/utils/src/sync/duplex/mpsc.rs
 delete mode 100644 pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs
 create mode 100644 pageserver/src/virtual_file/owned_buffers_io/write/flush.rs

diff --git a/libs/utils/src/sync.rs b/libs/utils/src/sync.rs
index 7aa26e24bc..280637de8f 100644
--- a/libs/utils/src/sync.rs
+++ b/libs/utils/src/sync.rs
@@ -1,5 +1,6 @@
 pub mod heavier_once_cell;
 
+pub mod duplex;
 pub mod gate;
 
 pub mod spsc_fold;
diff --git a/libs/utils/src/sync/duplex.rs b/libs/utils/src/sync/duplex.rs
new file mode 100644
index 0000000000..fac79297a0
--- /dev/null
+++ b/libs/utils/src/sync/duplex.rs
@@ -0,0 +1 @@
+pub mod mpsc;
diff --git a/libs/utils/src/sync/duplex/mpsc.rs b/libs/utils/src/sync/duplex/mpsc.rs
new file mode 100644
index 0000000000..56b4e6d2b3
--- /dev/null
+++ b/libs/utils/src/sync/duplex/mpsc.rs
@@ -0,0 +1,36 @@
+use tokio::sync::mpsc;
+
+/// A bi-directional channel.
+pub struct Duplex<S, R> {
+    pub tx: mpsc::Sender<S>,
+    pub rx: mpsc::Receiver<R>,
+}
+
+/// Creates a bi-directional channel.
+///
+/// The channel will buffer up to the provided number of messages. Once the buffer is full,
+/// attempts to send new messages will wait until a message is received from the channel.
+/// The provided buffer capacity must be at least 1.
+pub fn channel<A: Send, B: Send>(buffer: usize) -> (Duplex<A, B>, Duplex<B, A>) {
+    let (tx_a, rx_a) = mpsc::channel::<A>(buffer);
+    let (tx_b, rx_b) = mpsc::channel::<B>(buffer);
+
+    (Duplex { tx: tx_a, rx: rx_b }, Duplex { tx: tx_b, rx: rx_a })
+}
+
+impl<S: Send, R: Send> Duplex<S, R> {
+    /// Sends a value, waiting until there is capacity.
+    ///
+    /// A successful send occurs when it is determined that the other end of the channel has not hung up already.
+    pub async fn send(&self, x: S) -> Result<(), mpsc::error::SendError<S>> {
+        self.tx.send(x).await
+    }
+
+    /// Receives the next value for this receiver.
+    ///
+    /// This method returns `None` if the channel has been closed and there are
+    /// no remaining messages in the channel's buffer.
+    pub async fn recv(&mut self) -> Option<R> {
+        self.rx.recv().await
+    }
+}
diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs
index caacd365b3..b67a9cc479 100644
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -62,10 +62,8 @@ async fn ingest(
     let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
 
     let gate = utils::sync::gate::Gate::default();
-    let entered = gate.enter().unwrap();
 
-    let layer =
-        InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?;
+    let layer = InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, &gate, &ctx).await?;
 
     let data = Value::Image(Bytes::from(vec![0u8; put_size]));
     let data_ser_size = data.serialized_size().unwrap() as usize;
diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs
index de0abab4c0..aaec8a4c31 100644
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -8,10 +8,8 @@ use crate::page_cache;
 use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File;
 use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut;
 use crate::virtual_file::owned_buffers_io::slice::SliceMutExt;
-use crate::virtual_file::owned_buffers_io::util::size_tracking_writer;
 use crate::virtual_file::owned_buffers_io::write::Buffer;
 use crate::virtual_file::{self, owned_buffers_io, IoBufferMut, VirtualFile};
-use bytes::BytesMut;
 use camino::Utf8PathBuf;
 use num_traits::Num;
 use pageserver_api::shard::TenantShardId;
@@ -20,6 +18,7 @@ use tracing::error;
 
 use std::io;
 use std::sync::atomic::AtomicU64;
+use std::sync::Arc;
 use utils::id::TimelineId;
 
 pub struct EphemeralFile {
@@ -27,10 +26,7 @@ pub struct EphemeralFile {
     _timeline_id: TimelineId,
     page_cache_file_id: page_cache::FileId,
     bytes_written: u64,
-    buffered_writer: owned_buffers_io::write::BufferedWriter<
-        BytesMut,
-        size_tracking_writer::Writer<VirtualFile>,
-    >,
+    buffered_writer: owned_buffers_io::write::BufferedWriter<IoBufferMut, VirtualFile>,
     /// Gate guard is held on as long as we need to do operations in the path (delete on drop)
     _gate_guard: utils::sync::gate::GateGuard,
 }
@@ -42,9 +38,9 @@ impl EphemeralFile {
         conf: &PageServerConf,
         tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
-        gate_guard: utils::sync::gate::GateGuard,
+        gate: &utils::sync::gate::Gate,
         ctx: &RequestContext,
-    ) -> Result<EphemeralFile, io::Error> {
+    ) -> anyhow::Result<EphemeralFile> {
         static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1);
         let filename_disambiguator =
             NEXT_FILENAME.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
@@ -55,15 +51,17 @@ impl EphemeralFile {
                 "ephemeral-{filename_disambiguator}"
             )));
 
-        let file = VirtualFile::open_with_options(
-            &filename,
-            virtual_file::OpenOptions::new()
-                .read(true)
-                .write(true)
-                .create(true),
-            ctx,
-        )
-        .await?;
+        let file = Arc::new(
+            VirtualFile::open_with_options_v2(
+                &filename,
+                virtual_file::OpenOptions::new()
+                    .read(true)
+                    .write(true)
+                    .create(true),
+                ctx,
+            )
+            .await?,
+        );
 
         let page_cache_file_id = page_cache::next_file_id(); // XXX get rid, we're not page-caching anymore
 
@@ -73,10 +71,12 @@ impl EphemeralFile {
             page_cache_file_id,
             bytes_written: 0,
             buffered_writer: owned_buffers_io::write::BufferedWriter::new(
-                size_tracking_writer::Writer::new(file),
-                BytesMut::with_capacity(TAIL_SZ),
+                file,
+                || IoBufferMut::with_capacity(TAIL_SZ),
+                gate.enter()?,
+                ctx,
             ),
-            _gate_guard: gate_guard,
+            _gate_guard: gate.enter()?,
         })
     }
 }
@@ -85,7 +85,7 @@ impl Drop for EphemeralFile {
     fn drop(&mut self) {
         // unlink the file
         // we are clear to do this, because we have entered a gate
-        let path = self.buffered_writer.as_inner().as_inner().path();
+        let path = self.buffered_writer.as_inner().path();
         let res = std::fs::remove_file(path);
         if let Err(e) = res {
             if e.kind() != std::io::ErrorKind::NotFound {
@@ -132,6 +132,18 @@ impl EphemeralFile {
         srcbuf: &[u8],
         ctx: &RequestContext,
     ) -> std::io::Result<u64> {
+        let (pos, control) = self.write_raw_controlled(srcbuf, ctx).await?;
+        if let Some(control) = control {
+            control.release().await;
+        }
+        Ok(pos)
+    }
+
+    async fn write_raw_controlled(
+        &mut self,
+        srcbuf: &[u8],
+        ctx: &RequestContext,
+    ) -> std::io::Result<(u64, Option<owned_buffers_io::write::FlushControl>)> {
         let pos = self.bytes_written;
 
         let new_bytes_written = pos.checked_add(srcbuf.len().into_u64()).ok_or_else(|| {
@@ -145,9 +157,9 @@ impl EphemeralFile {
         })?;
 
         // Write the payload
-        let nwritten = self
+        let (nwritten, control) = self
             .buffered_writer
-            .write_buffered_borrowed(srcbuf, ctx)
+            .write_buffered_borrowed_controlled(srcbuf, ctx)
             .await?;
         assert_eq!(
             nwritten,
@@ -157,7 +169,7 @@ impl EphemeralFile {
 
         self.bytes_written = new_bytes_written;
 
-        Ok(pos)
+        Ok((pos, control))
     }
 }
 
@@ -168,11 +180,12 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral
         dst: tokio_epoll_uring::Slice<B>,
         ctx: &'a RequestContext,
     ) -> std::io::Result<(tokio_epoll_uring::Slice<B>, usize)> {
-        let file_size_tracking_writer = self.buffered_writer.as_inner();
-        let flushed_offset = file_size_tracking_writer.bytes_written();
+        let submitted_offset = self.buffered_writer.bytes_submitted();
 
-        let buffer = self.buffered_writer.inspect_buffer();
-        let buffered = &buffer[0..buffer.pending()];
+        let mutable = self.buffered_writer.inspect_mutable();
+        let mutable = &mutable[0..mutable.pending()];
+
+        let maybe_flushed = self.buffered_writer.inspect_maybe_flushed();
 
         let dst_cap = dst.bytes_total().into_u64();
         let end = {
@@ -197,11 +210,42 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral
                 }
             }
         }
-        let written_range = Range(start, std::cmp::min(end, flushed_offset));
-        let buffered_range = Range(std::cmp::max(start, flushed_offset), end);
+
+        let (written_range, maybe_flushed_range) = {
+            if maybe_flushed.is_some() {
+                // [       written       ][ maybe_flushed ][    mutable    ]
+                //                        <-   TAIL_SZ   -><-   TAIL_SZ   ->
+                //                                         ^
+                //                                 `submitted_offset`
+                // <++++++ on disk +++++++????????????????>
+                (
+                    Range(
+                        start,
+                        std::cmp::min(end, submitted_offset.saturating_sub(TAIL_SZ as u64)),
+                    ),
+                    Range(
+                        std::cmp::max(start, submitted_offset.saturating_sub(TAIL_SZ as u64)),
+                        std::cmp::min(end, submitted_offset),
+                    ),
+                )
+            } else {
+                // [       written                        ][    mutable    ]
+                //                                         <-   TAIL_SZ   ->
+                //                                         ^
+                //                                 `submitted_offset`
+                // <++++++ on disk +++++++++++++++++++++++>
+                (
+                    Range(start, std::cmp::min(end, submitted_offset)),
+                    // zero len
+                    Range(submitted_offset, u64::MIN),
+                )
+            }
+        };
+
+        let mutable_range = Range(std::cmp::max(start, submitted_offset), end);
 
         let dst = if written_range.len() > 0 {
-            let file: &VirtualFile = file_size_tracking_writer.as_inner();
+            let file: &VirtualFile = self.buffered_writer.as_inner();
             let bounds = dst.bounds();
             let slice = file
                 .read_exact_at(dst.slice(0..written_range.len().into_usize()), start, ctx)
@@ -211,19 +255,21 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral
             dst
         };
 
-        let dst = if buffered_range.len() > 0 {
-            let offset_in_buffer = buffered_range
+        let dst = if maybe_flushed_range.len() > 0 {
+            let offset_in_buffer = maybe_flushed_range
                 .0
-                .checked_sub(flushed_offset)
+                .checked_sub(submitted_offset.saturating_sub(TAIL_SZ as u64))
                 .unwrap()
                 .into_usize();
-            let to_copy =
-                &buffered[offset_in_buffer..(offset_in_buffer + buffered_range.len().into_usize())];
+            // Checked previously the buffer is Some.
+            let maybe_flushed = maybe_flushed.unwrap();
+            let to_copy = &maybe_flushed
+                [offset_in_buffer..(offset_in_buffer + maybe_flushed_range.len().into_usize())];
             let bounds = dst.bounds();
             let mut view = dst.slice({
                 let start = written_range.len().into_usize();
                 let end = start
-                    .checked_add(buffered_range.len().into_usize())
+                    .checked_add(maybe_flushed_range.len().into_usize())
                     .unwrap();
                 start..end
             });
@@ -234,6 +280,28 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral
             dst
         };
 
+        let dst = if mutable_range.len() > 0 {
+            let offset_in_buffer = mutable_range
+                .0
+                .checked_sub(submitted_offset)
+                .unwrap()
+                .into_usize();
+            let to_copy =
+                &mutable[offset_in_buffer..(offset_in_buffer + mutable_range.len().into_usize())];
+            let bounds = dst.bounds();
+            let mut view = dst.slice({
+                let start =
+                    written_range.len().into_usize() + maybe_flushed_range.len().into_usize();
+                let end = start.checked_add(mutable_range.len().into_usize()).unwrap();
+                start..end
+            });
+            view.as_mut_rust_slice_full_zeroed()
+                .copy_from_slice(to_copy);
+            Slice::from_buf_bounds(Slice::into_inner(view), bounds)
+        } else {
+            dst
+        };
+
         // TODO: in debug mode, randomize the remaining bytes in `dst` to catch bugs
 
         Ok((dst, (end - start).into_usize()))
@@ -295,7 +363,7 @@ mod tests {
 
         let gate = utils::sync::gate::Gate::default();
 
-        let file = EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
+        let file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx)
             .await
             .unwrap();
 
@@ -326,14 +394,15 @@ mod tests {
 
         let gate = utils::sync::gate::Gate::default();
 
-        let mut file =
-            EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
-                .await
-                .unwrap();
+        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx)
+            .await
+            .unwrap();
 
-        let cap = file.buffered_writer.inspect_buffer().capacity();
+        let mutable = file.buffered_writer.inspect_mutable();
+        let cap = mutable.capacity();
+        let align = mutable.align();
 
-        let write_nbytes = cap + cap / 2;
+        let write_nbytes = cap * 2 + cap / 2;
 
         let content: Vec<u8> = rand::thread_rng()
             .sample_iter(rand::distributions::Standard)
@@ -341,30 +410,39 @@ mod tests {
             .collect();
 
         let mut value_offsets = Vec::new();
-        for i in 0..write_nbytes {
-            let off = file.write_raw(&content[i..i + 1], &ctx).await.unwrap();
+        for range in (0..write_nbytes)
+            .step_by(align)
+            .map(|start| start..(start + align).min(write_nbytes))
+        {
+            let off = file.write_raw(&content[range], &ctx).await.unwrap();
             value_offsets.push(off);
         }
 
-        assert!(file.len() as usize == write_nbytes);
-        for i in 0..write_nbytes {
-            assert_eq!(value_offsets[i], i.into_u64());
-            let buf = IoBufferMut::with_capacity(1);
+        assert_eq!(file.len() as usize, write_nbytes);
+        for (i, range) in (0..write_nbytes)
+            .step_by(align)
+            .map(|start| start..(start + align).min(write_nbytes))
+            .enumerate()
+        {
+            assert_eq!(value_offsets[i], range.start.into_u64());
+            let buf = IoBufferMut::with_capacity(range.len());
             let (buf_slice, nread) = file
-                .read_exact_at_eof_ok(i.into_u64(), buf.slice_full(), &ctx)
+                .read_exact_at_eof_ok(range.start.into_u64(), buf.slice_full(), &ctx)
                 .await
                 .unwrap();
             let buf = buf_slice.into_inner();
-            assert_eq!(nread, 1);
-            assert_eq!(&buf, &content[i..i + 1]);
+            assert_eq!(nread, range.len());
+            assert_eq!(&buf, &content[range]);
         }
 
-        let file_contents =
-            std::fs::read(file.buffered_writer.as_inner().as_inner().path()).unwrap();
-        assert_eq!(file_contents, &content[0..cap]);
+        let file_contents = std::fs::read(file.buffered_writer.as_inner().path()).unwrap();
+        assert!(file_contents == content[0..cap * 2]);
 
-        let buffer_contents = file.buffered_writer.inspect_buffer();
-        assert_eq!(buffer_contents, &content[cap..write_nbytes]);
+        let maybe_flushed_buffer_contents = file.buffered_writer.inspect_maybe_flushed().unwrap();
+        assert_eq!(&maybe_flushed_buffer_contents[..], &content[cap..cap * 2]);
+
+        let mutable_buffer_contents = file.buffered_writer.inspect_mutable();
+        assert_eq!(mutable_buffer_contents, &content[cap * 2..write_nbytes]);
     }
 
     #[tokio::test]
@@ -373,16 +451,16 @@ mod tests {
 
         let gate = utils::sync::gate::Gate::default();
 
-        let mut file =
-            EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
-                .await
-                .unwrap();
+        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx)
+            .await
+            .unwrap();
 
-        let cap = file.buffered_writer.inspect_buffer().capacity();
+        // mutable buffer and maybe_flushed buffer each has `cap` bytes.
+        let cap = file.buffered_writer.inspect_mutable().capacity();
 
         let content: Vec<u8> = rand::thread_rng()
             .sample_iter(rand::distributions::Standard)
-            .take(cap + cap / 2)
+            .take(cap * 2 + cap / 2)
             .collect();
 
         file.write_raw(&content, &ctx).await.unwrap();
@@ -390,23 +468,21 @@ mod tests {
         // assert the state is as this test expects it to be
         assert_eq!(
             &file.load_to_io_buf(&ctx).await.unwrap(),
-            &content[0..cap + cap / 2]
+            &content[0..cap * 2 + cap / 2]
         );
-        let md = file
-            .buffered_writer
-            .as_inner()
-            .as_inner()
-            .path()
-            .metadata()
-            .unwrap();
+        let md = file.buffered_writer.as_inner().path().metadata().unwrap();
         assert_eq!(
             md.len(),
-            cap.into_u64(),
-            "buffered writer does one write if we write 1.5x buffer capacity"
+            2 * cap.into_u64(),
+            "buffered writer requires one write to be flushed if we write 2.5x buffer capacity"
         );
         assert_eq!(
-            &file.buffered_writer.inspect_buffer()[0..cap / 2],
-            &content[cap..cap + cap / 2]
+            &file.buffered_writer.inspect_maybe_flushed().unwrap()[0..cap],
+            &content[cap..cap * 2]
+        );
+        assert_eq!(
+            &file.buffered_writer.inspect_mutable()[0..cap / 2],
+            &content[cap * 2..cap * 2 + cap / 2]
         );
     }
 
@@ -422,19 +498,19 @@ mod tests {
 
         let gate = utils::sync::gate::Gate::default();
 
-        let mut file =
-            EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
-                .await
-                .unwrap();
-
-        let cap = file.buffered_writer.inspect_buffer().capacity();
+        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx)
+            .await
+            .unwrap();
 
+        let mutable = file.buffered_writer.inspect_mutable();
+        let cap = mutable.capacity();
+        let align = mutable.align();
         let content: Vec<u8> = rand::thread_rng()
             .sample_iter(rand::distributions::Standard)
-            .take(cap + cap / 2)
+            .take(cap * 2 + cap / 2)
             .collect();
 
-        file.write_raw(&content, &ctx).await.unwrap();
+        let (_, control) = file.write_raw_controlled(&content, &ctx).await.unwrap();
 
         let test_read = |start: usize, len: usize| {
             let file = &file;
@@ -454,16 +530,38 @@ mod tests {
             }
         };
 
+        let test_read_all_offset_combinations = || {
+            async move {
+                test_read(align, align).await;
+                // border onto edge of file
+                test_read(cap - align, align).await;
+                // read across file and buffer
+                test_read(cap - align, 2 * align).await;
+                // stay from start of maybe flushed buffer
+                test_read(cap, align).await;
+                // completely within maybe flushed buffer
+                test_read(cap + align, align).await;
+                // border onto edge of maybe flushed buffer.
+                test_read(cap * 2 - align, align).await;
+                // read across maybe flushed and mutable buffer
+                test_read(cap * 2 - align, 2 * align).await;
+                // read across three segments
+                test_read(cap - align, cap + 2 * align).await;
+                // completely within mutable buffer
+                test_read(cap * 2 + align, align).await;
+            }
+        };
+
         // completely within the file range
-        assert!(20 < cap, "test assumption");
-        test_read(10, 10).await;
-        // border onto edge of file
-        test_read(cap - 10, 10).await;
-        // read across file and buffer
-        test_read(cap - 10, 20).await;
-        // stay from start of buffer
-        test_read(cap, 10).await;
-        // completely within buffer
-        test_read(cap + 10, 10).await;
+        assert!(align < cap, "test assumption");
+        assert!(cap % align == 0);
+
+        // test reads at different flush stages.
+        let not_started = control.unwrap().into_not_started();
+        test_read_all_offset_combinations().await;
+        let in_progress = not_started.ready_to_flush();
+        test_read_all_offset_combinations().await;
+        in_progress.wait_until_flush_is_done().await;
+        test_read_all_offset_combinations().await;
     }
 }
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 4bb1bbf3cf..89b935947d 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -681,6 +681,7 @@ impl RemoteTimelineClient {
         layer_file_name: &LayerName,
         layer_metadata: &LayerFileMetadata,
         local_path: &Utf8Path,
+        gate: &utils::sync::gate::Gate,
         cancel: &CancellationToken,
         ctx: &RequestContext,
     ) -> Result<u64, DownloadError> {
@@ -700,6 +701,7 @@ impl RemoteTimelineClient {
                 layer_file_name,
                 layer_metadata,
                 local_path,
+                gate,
                 cancel,
                 ctx,
             )
diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index 739615be9c..c5ae466f3a 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -6,6 +6,7 @@
 use std::collections::HashSet;
 use std::future::Future;
 use std::str::FromStr;
+use std::sync::Arc;
 use std::time::SystemTime;
 
 use anyhow::{anyhow, Context};
@@ -26,9 +27,7 @@ use crate::span::{
 use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
 use crate::tenant::storage_layer::LayerName;
 use crate::tenant::Generation;
-#[cfg_attr(target_os = "macos", allow(unused_imports))]
-use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
-use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
+use crate::virtual_file::{on_fatal_io_error, IoBufferMut, MaybeFatalIo, VirtualFile};
 use crate::TEMP_FILE_SUFFIX;
 use remote_storage::{
     DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath,
@@ -60,6 +59,7 @@ pub async fn download_layer_file<'a>(
     layer_file_name: &'a LayerName,
     layer_metadata: &'a LayerFileMetadata,
     local_path: &Utf8Path,
+    gate: &utils::sync::gate::Gate,
     cancel: &CancellationToken,
     ctx: &RequestContext,
 ) -> Result<u64, DownloadError> {
@@ -88,7 +88,9 @@ pub async fn download_layer_file<'a>(
     let temp_file_path = path_with_suffix_extension(local_path, TEMP_DOWNLOAD_EXTENSION);
 
     let bytes_amount = download_retry(
-        || async { download_object(storage, &remote_path, &temp_file_path, cancel, ctx).await },
+        || async {
+            download_object(storage, &remote_path, &temp_file_path, gate, cancel, ctx).await
+        },
         &format!("download {remote_path:?}"),
         cancel,
     )
@@ -148,6 +150,7 @@ async fn download_object<'a>(
     storage: &'a GenericRemoteStorage,
     src_path: &RemotePath,
     dst_path: &Utf8PathBuf,
+    gate: &utils::sync::gate::Gate,
     cancel: &CancellationToken,
     #[cfg_attr(target_os = "macos", allow(unused_variables))] ctx: &RequestContext,
 ) -> Result<u64, DownloadError> {
@@ -205,13 +208,16 @@ async fn download_object<'a>(
         }
         #[cfg(target_os = "linux")]
         crate::virtual_file::io_engine::IoEngine::TokioEpollUring => {
-            use crate::virtual_file::owned_buffers_io::{self, util::size_tracking_writer};
-            use bytes::BytesMut;
+            use crate::virtual_file::owned_buffers_io;
             async {
-                let destination_file = VirtualFile::create(dst_path, ctx)
-                    .await
-                    .with_context(|| format!("create a destination file for layer '{dst_path}'"))
-                    .map_err(DownloadError::Other)?;
+                let destination_file = Arc::new(
+                    VirtualFile::create(dst_path, ctx)
+                        .await
+                        .with_context(|| {
+                            format!("create a destination file for layer '{dst_path}'")
+                        })
+                        .map_err(DownloadError::Other)?,
+                );
 
                 let mut download = storage
                     .download(src_path, &DownloadOpts::default(), cancel)
@@ -219,14 +225,16 @@ async fn download_object<'a>(
 
                 pausable_failpoint!("before-downloading-layer-stream-pausable");
 
+                let mut buffered = owned_buffers_io::write::BufferedWriter::<IoBufferMut, _>::new(
+                    destination_file,
+                    || IoBufferMut::with_capacity(super::BUFFER_SIZE),
+                    gate.enter().map_err(|_| DownloadError::Cancelled)?,
+                    ctx,
+                );
+
                 // TODO: use vectored write (writev) once supported by tokio-epoll-uring.
                 // There's chunks_vectored() on the stream.
                 let (bytes_amount, destination_file) = async {
-                    let size_tracking = size_tracking_writer::Writer::new(destination_file);
-                    let mut buffered = owned_buffers_io::write::BufferedWriter::<BytesMut, _>::new(
-                        size_tracking,
-                        BytesMut::with_capacity(super::BUFFER_SIZE),
-                    );
                     while let Some(res) =
                         futures::StreamExt::next(&mut download.download_stream).await
                     {
@@ -234,10 +242,10 @@ async fn download_object<'a>(
                             Ok(chunk) => chunk,
                             Err(e) => return Err(e),
                         };
-                        buffered.write_buffered(chunk.slice_len(), ctx).await?;
+                        buffered.write_buffered_borrowed(&chunk, ctx).await?;
                     }
-                    let size_tracking = buffered.flush_and_into_inner(ctx).await?;
-                    Ok(size_tracking.into_inner())
+                    let inner = buffered.flush_and_into_inner(ctx).await?;
+                    Ok(inner)
                 }
                 .await?;
 
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index 701e4cf04b..395e34e404 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -1183,6 +1183,7 @@ impl<'a> TenantDownloader<'a> {
             &layer.name,
             &layer.metadata,
             &local_path,
+            &self.secondary_state.gate,
             &self.secondary_state.cancel,
             ctx,
         )
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
index af6112d535..71e53da20f 100644
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -555,13 +555,12 @@ impl InMemoryLayer {
         timeline_id: TimelineId,
         tenant_shard_id: TenantShardId,
         start_lsn: Lsn,
-        gate_guard: utils::sync::gate::GateGuard,
+        gate: &utils::sync::gate::Gate,
         ctx: &RequestContext,
     ) -> Result<InMemoryLayer> {
         trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
 
-        let file =
-            EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate_guard, ctx).await?;
+        let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate, ctx).await?;
         let key = InMemoryLayerFileId(file.page_cache_file_id());
 
         Ok(InMemoryLayer {
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index a9f1189b41..8933e8ceb1 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -1149,6 +1149,7 @@ impl LayerInner {
                 &self.desc.layer_name(),
                 &self.metadata(),
                 &self.path,
+                &timeline.gate,
                 &timeline.cancel,
                 ctx,
             )
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 1414bef0a5..fc741826ab 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -3455,7 +3455,6 @@ impl Timeline {
         ctx: &RequestContext,
     ) -> anyhow::Result<Arc<InMemoryLayer>> {
         let mut guard = self.layers.write().await;
-        let gate_guard = self.gate.enter().context("enter gate for inmem layer")?;
 
         let last_record_lsn = self.get_last_record_lsn();
         ensure!(
@@ -3472,7 +3471,7 @@ impl Timeline {
                 self.conf,
                 self.timeline_id,
                 self.tenant_shard_id,
-                gate_guard,
+                &self.gate,
                 ctx,
             )
             .await?;
diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs
index 4293a44dca..3888e7f86a 100644
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -182,7 +182,7 @@ impl OpenLayerManager {
         conf: &'static PageServerConf,
         timeline_id: TimelineId,
         tenant_shard_id: TenantShardId,
-        gate_guard: utils::sync::gate::GateGuard,
+        gate: &utils::sync::gate::Gate,
         ctx: &RequestContext,
     ) -> anyhow::Result<Arc<InMemoryLayer>> {
         ensure!(lsn.is_aligned());
@@ -212,15 +212,9 @@ impl OpenLayerManager {
                 lsn
             );
 
-            let new_layer = InMemoryLayer::create(
-                conf,
-                timeline_id,
-                tenant_shard_id,
-                start_lsn,
-                gate_guard,
-                ctx,
-            )
-            .await?;
+            let new_layer =
+                InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn, gate, ctx)
+                    .await?;
             let layer = Arc::new(new_layer);
 
             self.layer_map.open_layer = Some(layer.clone());
diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs
index b9f8c7ea20..8a7f4a4bf5 100644
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -20,7 +20,7 @@ use camino::{Utf8Path, Utf8PathBuf};
 use once_cell::sync::OnceCell;
 use owned_buffers_io::aligned_buffer::buffer::AlignedBuffer;
 use owned_buffers_io::aligned_buffer::{AlignedBufferMut, AlignedSlice, ConstAlign};
-use owned_buffers_io::io_buf_aligned::IoBufAlignedMut;
+use owned_buffers_io::io_buf_aligned::{IoBufAligned, IoBufAlignedMut};
 use owned_buffers_io::io_buf_ext::FullSlice;
 use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
 use pageserver_api::shard::TenantShardId;
@@ -63,9 +63,6 @@ pub(crate) mod owned_buffers_io {
     pub(crate) mod io_buf_ext;
     pub(crate) mod slice;
     pub(crate) mod write;
-    pub(crate) mod util {
-        pub(crate) mod size_tracking_writer;
-    }
 }
 
 #[derive(Debug)]
@@ -221,7 +218,7 @@ impl VirtualFile {
         self.inner.read_exact_at_page(page, offset, ctx).await
     }
 
-    pub async fn write_all_at<Buf: IoBuf + Send>(
+    pub async fn write_all_at<Buf: IoBufAligned + Send>(
         &self,
         buf: FullSlice<Buf>,
         offset: u64,
@@ -1325,14 +1322,14 @@ impl Drop for VirtualFileInner {
 }
 
 impl OwnedAsyncWriter for VirtualFile {
-    #[inline(always)]
-    async fn write_all<Buf: IoBuf + Send>(
-        &mut self,
+    async fn write_all_at<Buf: IoBufAligned + Send>(
+        &self,
         buf: FullSlice<Buf>,
+        offset: u64,
         ctx: &RequestContext,
-    ) -> std::io::Result<(usize, FullSlice<Buf>)> {
-        let (buf, res) = VirtualFile::write_all(self, buf, ctx).await;
-        res.map(move |v| (v, buf))
+    ) -> std::io::Result<FullSlice<Buf>> {
+        let (buf, res) = VirtualFile::write_all_at(self, buf, offset, ctx).await;
+        res.map(|_| buf)
     }
 }
 
@@ -1451,7 +1448,7 @@ mod tests {
                 }
             }
         }
-        async fn write_all_at<Buf: IoBuf + Send>(
+        async fn write_all_at<Buf: IoBufAligned + Send>(
             &self,
             buf: FullSlice<Buf>,
             offset: u64,
@@ -1594,6 +1591,7 @@ mod tests {
             &ctx,
         )
         .await?;
+
         file_a
             .write_all(b"foobar".to_vec().slice_len(), &ctx)
             .await?;
@@ -1652,10 +1650,10 @@ mod tests {
         )
         .await?;
         file_b
-            .write_all_at(b"BAR".to_vec().slice_len(), 3, &ctx)
+            .write_all_at(IoBuffer::from(b"BAR").slice_len(), 3, &ctx)
             .await?;
         file_b
-            .write_all_at(b"FOO".to_vec().slice_len(), 0, &ctx)
+            .write_all_at(IoBuffer::from(b"FOO").slice_len(), 0, &ctx)
             .await?;
 
         assert_eq!(file_b.read_string_at(2, 3, &ctx).await?, "OBA");
diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs
index 933b78a13b..6b9992643f 100644
--- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs
@@ -4,7 +4,7 @@ pub trait Alignment: std::marker::Unpin + 'static {
 }
 
 /// Alignment at compile time.
-#[derive(Debug)]
+#[derive(Debug, Clone, Copy)]
 pub struct ConstAlign<const A: usize>;
 
 impl<const A: usize> Alignment for ConstAlign<A> {
@@ -14,7 +14,7 @@ impl<const A: usize> Alignment for ConstAlign<A> {
 }
 
 /// Alignment at run time.
-#[derive(Debug)]
+#[derive(Debug, Clone, Copy)]
 pub struct RuntimeAlign {
     align: usize,
 }
diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs
index 2fba6d699b..a5c26cd746 100644
--- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs
@@ -3,9 +3,10 @@ use std::{
     sync::Arc,
 };
 
-use super::{alignment::Alignment, raw::RawAlignedBuffer};
+use super::{alignment::Alignment, raw::RawAlignedBuffer, AlignedBufferMut, ConstAlign};
 
 /// An shared, immutable aligned buffer type.
+#[derive(Clone, Debug)]
 pub struct AlignedBuffer<A: Alignment> {
     /// Shared raw buffer.
     raw: Arc<RawAlignedBuffer<A>>,
@@ -86,6 +87,13 @@ impl<A: Alignment> AlignedBuffer<A> {
             range: begin..end,
         }
     }
+
+    /// Returns the mutable aligned buffer, if the immutable aligned buffer
+    /// has exactly one strong reference. Otherwise returns `None`.
+    pub fn into_mut(self) -> Option<AlignedBufferMut<A>> {
+        let raw = Arc::into_inner(self.raw)?;
+        Some(AlignedBufferMut::from_raw(raw))
+    }
 }
 
 impl<A: Alignment> Deref for AlignedBuffer<A> {
@@ -108,6 +116,14 @@ impl<A: Alignment> PartialEq<[u8]> for AlignedBuffer<A> {
     }
 }
 
+impl<const A: usize, const N: usize> From<&[u8; N]> for AlignedBuffer<ConstAlign<A>> {
+    fn from(value: &[u8; N]) -> Self {
+        let mut buf = AlignedBufferMut::with_capacity(N);
+        buf.extend_from_slice(value);
+        buf.freeze()
+    }
+}
+
 /// SAFETY: the underlying buffer references a stable memory region.
 unsafe impl<A: Alignment> tokio_epoll_uring::IoBuf for AlignedBuffer<A> {
     fn stable_ptr(&self) -> *const u8 {
diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs
index b3675d1aea..d2f5e206bb 100644
--- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs
@@ -1,4 +1,7 @@
-use std::ops::{Deref, DerefMut};
+use std::{
+    mem::MaybeUninit,
+    ops::{Deref, DerefMut},
+};
 
 use super::{
     alignment::{Alignment, ConstAlign},
@@ -46,6 +49,11 @@ impl<const A: usize> AlignedBufferMut<ConstAlign<A>> {
 }
 
 impl<A: Alignment> AlignedBufferMut<A> {
+    /// Constructs a mutable aligned buffer from raw.
+    pub(super) fn from_raw(raw: RawAlignedBuffer<A>) -> Self {
+        AlignedBufferMut { raw }
+    }
+
     /// Returns the total number of bytes the buffer can hold.
     #[inline]
     pub fn capacity(&self) -> usize {
@@ -128,6 +136,39 @@ impl<A: Alignment> AlignedBufferMut<A> {
         let len = self.len();
         AlignedBuffer::from_raw(self.raw, 0..len)
     }
+
+    /// Clones and appends all elements in a slice to the buffer. Reserves additional capacity as needed.
+    #[inline]
+    pub fn extend_from_slice(&mut self, extend: &[u8]) {
+        let cnt = extend.len();
+        self.reserve(cnt);
+
+        // SAFETY: we already reserved additional `cnt` bytes, safe to perform memcpy.
+        unsafe {
+            let dst = self.spare_capacity_mut();
+            // Reserved above
+            debug_assert!(dst.len() >= cnt);
+
+            core::ptr::copy_nonoverlapping(extend.as_ptr(), dst.as_mut_ptr().cast(), cnt);
+        }
+        // SAFETY: We do have at least `cnt` bytes remaining before advance.
+        unsafe {
+            bytes::BufMut::advance_mut(self, cnt);
+        }
+    }
+
+    /// Returns the remaining spare capacity of the vector as a slice of `MaybeUninit<u8>`.
+    #[inline]
+    fn spare_capacity_mut(&mut self) -> &mut [MaybeUninit<u8>] {
+        // SAFETY: we guarantees that the `Self::capacity()` bytes from
+        // `Self::as_mut_ptr()` are allocated.
+        unsafe {
+            let ptr = self.as_mut_ptr().add(self.len());
+            let len = self.capacity() - self.len();
+
+            core::slice::from_raw_parts_mut(ptr.cast(), len)
+        }
+    }
 }
 
 impl<A: Alignment> Deref for AlignedBufferMut<A> {
diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs
index dba695196e..4ea6b17744 100644
--- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs
@@ -1,9 +1,15 @@
-use tokio_epoll_uring::IoBufMut;
+use tokio_epoll_uring::{IoBuf, IoBufMut};
 
-use crate::virtual_file::{IoBufferMut, PageWriteGuardBuf};
+use crate::virtual_file::{IoBuffer, IoBufferMut, PageWriteGuardBuf};
 
+/// A marker trait for a mutable aligned buffer type.
 pub trait IoBufAlignedMut: IoBufMut {}
 
+/// A marker trait for an aligned buffer type.
+pub trait IoBufAligned: IoBuf {}
+
 impl IoBufAlignedMut for IoBufferMut {}
 
+impl IoBufAligned for IoBuffer {}
+
 impl IoBufAlignedMut for PageWriteGuardBuf {}
diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs
index c3940cf6ce..525f447b6d 100644
--- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs
@@ -5,6 +5,8 @@ use bytes::{Bytes, BytesMut};
 use std::ops::{Deref, Range};
 use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
 
+use super::write::CheapCloneForRead;
+
 /// The true owned equivalent for Rust [`slice`]. Use this for the write path.
 ///
 /// Unlike [`tokio_epoll_uring::Slice`], which we unfortunately inherited from `tokio-uring`,
@@ -43,6 +45,18 @@ where
     }
 }
 
+impl<B> CheapCloneForRead for FullSlice<B>
+where
+    B: IoBuf + CheapCloneForRead,
+{
+    fn cheap_clone(&self) -> Self {
+        let bounds = self.slice.bounds();
+        let clone = self.slice.get_ref().cheap_clone();
+        let slice = clone.slice(bounds);
+        Self { slice }
+    }
+}
+
 pub(crate) trait IoBufExt {
     /// Get a [`FullSlice`] for the entire buffer, i.e., `self[..]` or `self[0..self.len()]`.
     fn slice_len(self) -> FullSlice<Self>
diff --git a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs
deleted file mode 100644
index efcb61ba65..0000000000
--- a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs
+++ /dev/null
@@ -1,50 +0,0 @@
-use crate::{
-    context::RequestContext,
-    virtual_file::owned_buffers_io::{io_buf_ext::FullSlice, write::OwnedAsyncWriter},
-};
-use tokio_epoll_uring::IoBuf;
-
-pub struct Writer<W> {
-    dst: W,
-    bytes_amount: u64,
-}
-
-impl<W> Writer<W> {
-    pub fn new(dst: W) -> Self {
-        Self {
-            dst,
-            bytes_amount: 0,
-        }
-    }
-
-    pub fn bytes_written(&self) -> u64 {
-        self.bytes_amount
-    }
-
-    pub fn as_inner(&self) -> &W {
-        &self.dst
-    }
-
-    /// Returns the wrapped `VirtualFile` object as well as the number
-    /// of bytes that were written to it through this object.
-    #[cfg_attr(target_os = "macos", allow(dead_code))]
-    pub fn into_inner(self) -> (u64, W) {
-        (self.bytes_amount, self.dst)
-    }
-}
-
-impl<W> OwnedAsyncWriter for Writer<W>
-where
-    W: OwnedAsyncWriter,
-{
-    #[inline(always)]
-    async fn write_all<Buf: IoBuf + Send>(
-        &mut self,
-        buf: FullSlice<Buf>,
-        ctx: &RequestContext,
-    ) -> std::io::Result<(usize, FullSlice<Buf>)> {
-        let (nwritten, buf) = self.dst.write_all(buf, ctx).await?;
-        self.bytes_amount += u64::try_from(nwritten).unwrap();
-        Ok((nwritten, buf))
-    }
-}
diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs
index 568cf62e56..20bf878123 100644
--- a/pageserver/src/virtual_file/owned_buffers_io/write.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs
@@ -1,55 +1,88 @@
-use bytes::BytesMut;
+mod flush;
+use std::sync::Arc;
+
+use flush::FlushHandle;
 use tokio_epoll_uring::IoBuf;
 
-use crate::context::RequestContext;
+use crate::{
+    context::RequestContext,
+    virtual_file::{IoBuffer, IoBufferMut},
+};
 
-use super::io_buf_ext::{FullSlice, IoBufExt};
+use super::{
+    io_buf_aligned::IoBufAligned,
+    io_buf_ext::{FullSlice, IoBufExt},
+};
+
+pub(crate) use flush::FlushControl;
+
+pub(crate) trait CheapCloneForRead {
+    /// Returns a cheap clone of the buffer.
+    fn cheap_clone(&self) -> Self;
+}
+
+impl CheapCloneForRead for IoBuffer {
+    fn cheap_clone(&self) -> Self {
+        // Cheap clone over an `Arc`.
+        self.clone()
+    }
+}
 
 /// A trait for doing owned-buffer write IO.
 /// Think [`tokio::io::AsyncWrite`] but with owned buffers.
+/// The owned buffers need to be aligned due to Direct IO requirements.
 pub trait OwnedAsyncWriter {
-    async fn write_all<Buf: IoBuf + Send>(
-        &mut self,
+    fn write_all_at<Buf: IoBufAligned + Send>(
+        &self,
         buf: FullSlice<Buf>,
+        offset: u64,
         ctx: &RequestContext,
-    ) -> std::io::Result<(usize, FullSlice<Buf>)>;
+    ) -> impl std::future::Future<Output = std::io::Result<FullSlice<Buf>>> + Send;
 }
 
 /// A wrapper aorund an [`OwnedAsyncWriter`] that uses a [`Buffer`] to batch
 /// small writes into larger writes of size [`Buffer::cap`].
-///
-/// # Passthrough Of Large Writers
-///
-/// Calls to [`BufferedWriter::write_buffered`] that are larger than [`Buffer::cap`]
-/// cause the internal buffer to be flushed prematurely so that the large
-/// buffered write is passed through to the underlying [`OwnedAsyncWriter`].
-///
-/// This pass-through is generally beneficial for throughput, but if
-/// the storage backend of the [`OwnedAsyncWriter`] is a shared resource,
-/// unlimited large writes may cause latency or fairness issues.
-///
-/// In such cases, a different implementation that always buffers in memory
-/// may be preferable.
-pub struct BufferedWriter<B, W> {
-    writer: W,
+// TODO(yuchen): For large write, implementing buffer bypass for aligned parts of the write could be beneficial to throughput,
+// since we would avoid copying majority of the data into the internal buffer.
+pub struct BufferedWriter<B: Buffer, W> {
+    writer: Arc<W>,
     /// invariant: always remains Some(buf) except
     /// - while IO is ongoing => goes back to Some() once the IO completed successfully
     /// - after an IO error => stays `None` forever
     ///
     /// In these exceptional cases, it's `None`.
-    buf: Option<B>,
+    mutable: Option<B>,
+    /// A handle to the background flush task for writting data to disk.
+    flush_handle: FlushHandle<B::IoBuf, W>,
+    /// The number of bytes submitted to the background task.
+    bytes_submitted: u64,
 }
 
 impl<B, Buf, W> BufferedWriter<B, W>
 where
-    B: Buffer<IoBuf = Buf> + Send,
-    Buf: IoBuf + Send,
-    W: OwnedAsyncWriter,
+    B: Buffer<IoBuf = Buf> + Send + 'static,
+    Buf: IoBufAligned + Send + Sync + CheapCloneForRead,
+    W: OwnedAsyncWriter + Send + Sync + 'static + std::fmt::Debug,
 {
-    pub fn new(writer: W, buf: B) -> Self {
+    /// Creates a new buffered writer.
+    ///
+    /// The `buf_new` function provides a way to initialize the owned buffers used by this writer.
+    pub fn new(
+        writer: Arc<W>,
+        buf_new: impl Fn() -> B,
+        gate_guard: utils::sync::gate::GateGuard,
+        ctx: &RequestContext,
+    ) -> Self {
         Self {
-            writer,
-            buf: Some(buf),
+            writer: writer.clone(),
+            mutable: Some(buf_new()),
+            flush_handle: FlushHandle::spawn_new(
+                writer,
+                buf_new(),
+                gate_guard,
+                ctx.attached_child(),
+            ),
+            bytes_submitted: 0,
         }
     }
 
@@ -57,87 +90,70 @@ where
         &self.writer
     }
 
+    /// Returns the number of bytes submitted to the background flush task.
+    pub fn bytes_submitted(&self) -> u64 {
+        self.bytes_submitted
+    }
+
     /// Panics if used after any of the write paths returned an error
-    pub fn inspect_buffer(&self) -> &B {
-        self.buf()
+    pub fn inspect_mutable(&self) -> &B {
+        self.mutable()
+    }
+
+    /// Gets a reference to the maybe flushed read-only buffer.
+    /// Returns `None` if the writer has not submitted any flush request.
+    pub fn inspect_maybe_flushed(&self) -> Option<&FullSlice<Buf>> {
+        self.flush_handle.maybe_flushed.as_ref()
     }
 
     #[cfg_attr(target_os = "macos", allow(dead_code))]
-    pub async fn flush_and_into_inner(mut self, ctx: &RequestContext) -> std::io::Result<W> {
+    pub async fn flush_and_into_inner(
+        mut self,
+        ctx: &RequestContext,
+    ) -> std::io::Result<(u64, Arc<W>)> {
         self.flush(ctx).await?;
 
-        let Self { buf, writer } = self;
+        let Self {
+            mutable: buf,
+            writer,
+            mut flush_handle,
+            bytes_submitted: bytes_amount,
+        } = self;
+        flush_handle.shutdown().await?;
         assert!(buf.is_some());
-        Ok(writer)
+        Ok((bytes_amount, writer))
     }
 
+    /// Gets a reference to the mutable in-memory buffer.
     #[inline(always)]
-    fn buf(&self) -> &B {
-        self.buf
+    fn mutable(&self) -> &B {
+        self.mutable
             .as_ref()
             .expect("must not use after we returned an error")
     }
 
-    /// Guarantees that if Ok() is returned, all bytes in `chunk` have been accepted.
-    #[cfg_attr(target_os = "macos", allow(dead_code))]
-    pub async fn write_buffered<S: IoBuf + Send>(
+    pub async fn write_buffered_borrowed(
         &mut self,
-        chunk: FullSlice<S>,
+        chunk: &[u8],
         ctx: &RequestContext,
-    ) -> std::io::Result<(usize, FullSlice<S>)> {
-        let chunk = chunk.into_raw_slice();
-
-        let chunk_len = chunk.len();
-        // avoid memcpy for the middle of the chunk
-        if chunk.len() >= self.buf().cap() {
-            self.flush(ctx).await?;
-            // do a big write, bypassing `buf`
-            assert_eq!(
-                self.buf
-                    .as_ref()
-                    .expect("must not use after an error")
-                    .pending(),
-                0
-            );
-            let (nwritten, chunk) = self
-                .writer
-                .write_all(FullSlice::must_new(chunk), ctx)
-                .await?;
-            assert_eq!(nwritten, chunk_len);
-            return Ok((nwritten, chunk));
+    ) -> std::io::Result<usize> {
+        let (len, control) = self.write_buffered_borrowed_controlled(chunk, ctx).await?;
+        if let Some(control) = control {
+            control.release().await;
         }
-        // in-memory copy the < BUFFER_SIZED tail of the chunk
-        assert!(chunk.len() < self.buf().cap());
-        let mut slice = &chunk[..];
-        while !slice.is_empty() {
-            let buf = self.buf.as_mut().expect("must not use after an error");
-            let need = buf.cap() - buf.pending();
-            let have = slice.len();
-            let n = std::cmp::min(need, have);
-            buf.extend_from_slice(&slice[..n]);
-            slice = &slice[n..];
-            if buf.pending() >= buf.cap() {
-                assert_eq!(buf.pending(), buf.cap());
-                self.flush(ctx).await?;
-            }
-        }
-        assert!(slice.is_empty(), "by now we should have drained the chunk");
-        Ok((chunk_len, FullSlice::must_new(chunk)))
+        Ok(len)
     }
 
-    /// Strictly less performant variant of [`Self::write_buffered`] that allows writing borrowed data.
-    ///
-    /// It is less performant because we always have to copy the borrowed data into the internal buffer
-    /// before we can do the IO. The [`Self::write_buffered`] can avoid this, which is more performant
-    /// for large writes.
-    pub async fn write_buffered_borrowed(
+    /// In addition to bytes submitted in this write, also returns a handle that can control the flush behavior.
+    pub(crate) async fn write_buffered_borrowed_controlled(
         &mut self,
         mut chunk: &[u8],
         ctx: &RequestContext,
-    ) -> std::io::Result<usize> {
+    ) -> std::io::Result<(usize, Option<FlushControl>)> {
         let chunk_len = chunk.len();
+        let mut control: Option<FlushControl> = None;
         while !chunk.is_empty() {
-            let buf = self.buf.as_mut().expect("must not use after an error");
+            let buf = self.mutable.as_mut().expect("must not use after an error");
             let need = buf.cap() - buf.pending();
             let have = chunk.len();
             let n = std::cmp::min(need, have);
@@ -145,26 +161,27 @@ where
             chunk = &chunk[n..];
             if buf.pending() >= buf.cap() {
                 assert_eq!(buf.pending(), buf.cap());
-                self.flush(ctx).await?;
+                if let Some(control) = control.take() {
+                    control.release().await;
+                }
+                control = self.flush(ctx).await?;
             }
         }
-        Ok(chunk_len)
+        Ok((chunk_len, control))
     }
 
-    async fn flush(&mut self, ctx: &RequestContext) -> std::io::Result<()> {
-        let buf = self.buf.take().expect("must not use after an error");
+    #[must_use = "caller must explcitly check the flush control"]
+    async fn flush(&mut self, _ctx: &RequestContext) -> std::io::Result<Option<FlushControl>> {
+        let buf = self.mutable.take().expect("must not use after an error");
         let buf_len = buf.pending();
         if buf_len == 0 {
-            self.buf = Some(buf);
-            return Ok(());
+            self.mutable = Some(buf);
+            return Ok(None);
         }
-        let slice = buf.flush();
-        let (nwritten, slice) = self.writer.write_all(slice, ctx).await?;
-        assert_eq!(nwritten, buf_len);
-        self.buf = Some(Buffer::reuse_after_flush(
-            slice.into_raw_slice().into_inner(),
-        ));
-        Ok(())
+        let (recycled, flush_control) = self.flush_handle.flush(buf, self.bytes_submitted).await?;
+        self.bytes_submitted += u64::try_from(buf_len).unwrap();
+        self.mutable = Some(recycled);
+        Ok(Some(flush_control))
     }
 }
 
@@ -192,64 +209,77 @@ pub trait Buffer {
     fn reuse_after_flush(iobuf: Self::IoBuf) -> Self;
 }
 
-impl Buffer for BytesMut {
-    type IoBuf = BytesMut;
+impl Buffer for IoBufferMut {
+    type IoBuf = IoBuffer;
 
-    #[inline(always)]
     fn cap(&self) -> usize {
         self.capacity()
     }
 
     fn extend_from_slice(&mut self, other: &[u8]) {
-        BytesMut::extend_from_slice(self, other)
+        if self.len() + other.len() > self.cap() {
+            panic!("Buffer capacity exceeded");
+        }
+
+        IoBufferMut::extend_from_slice(self, other);
     }
 
-    #[inline(always)]
     fn pending(&self) -> usize {
         self.len()
     }
 
-    fn flush(self) -> FullSlice<BytesMut> {
-        self.slice_len()
+    fn flush(self) -> FullSlice<Self::IoBuf> {
+        self.freeze().slice_len()
     }
 
-    fn reuse_after_flush(mut iobuf: BytesMut) -> Self {
-        iobuf.clear();
-        iobuf
-    }
-}
-
-impl OwnedAsyncWriter for Vec<u8> {
-    async fn write_all<Buf: IoBuf + Send>(
-        &mut self,
-        buf: FullSlice<Buf>,
-        _: &RequestContext,
-    ) -> std::io::Result<(usize, FullSlice<Buf>)> {
-        self.extend_from_slice(&buf[..]);
-        Ok((buf.len(), buf))
+    /// Caller should make sure that `iobuf` only have one strong reference before invoking this method.
+    fn reuse_after_flush(iobuf: Self::IoBuf) -> Self {
+        let mut recycled = iobuf
+            .into_mut()
+            .expect("buffer should only have one strong reference");
+        recycled.clear();
+        recycled
     }
 }
 
 #[cfg(test)]
 mod tests {
-    use bytes::BytesMut;
+    use std::sync::Mutex;
 
     use super::*;
     use crate::context::{DownloadBehavior, RequestContext};
     use crate::task_mgr::TaskKind;
 
-    #[derive(Default)]
+    #[derive(Default, Debug)]
     struct RecorderWriter {
-        writes: Vec<Vec<u8>>,
+        /// record bytes and write offsets.
+        writes: Mutex<Vec<(Vec<u8>, u64)>>,
     }
+
+    impl RecorderWriter {
+        /// Gets recorded bytes and write offsets.
+        fn get_writes(&self) -> Vec<Vec<u8>> {
+            self.writes
+                .lock()
+                .unwrap()
+                .iter()
+                .map(|(buf, _)| buf.clone())
+                .collect()
+        }
+    }
+
     impl OwnedAsyncWriter for RecorderWriter {
-        async fn write_all<Buf: IoBuf + Send>(
-            &mut self,
+        async fn write_all_at<Buf: IoBufAligned + Send>(
+            &self,
             buf: FullSlice<Buf>,
+            offset: u64,
             _: &RequestContext,
-        ) -> std::io::Result<(usize, FullSlice<Buf>)> {
-            self.writes.push(Vec::from(&buf[..]));
-            Ok((buf.len(), buf))
+        ) -> std::io::Result<FullSlice<Buf>> {
+            self.writes
+                .lock()
+                .unwrap()
+                .push((Vec::from(&buf[..]), offset));
+            Ok(buf)
         }
     }
 
@@ -257,71 +287,21 @@ mod tests {
         RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error)
     }
 
-    macro_rules! write {
-        ($writer:ident, $data:literal) => {{
-            $writer
-                .write_buffered(::bytes::Bytes::from_static($data).slice_len(), &test_ctx())
-                .await?;
-        }};
-    }
-
     #[tokio::test]
-    async fn test_buffered_writes_only() -> std::io::Result<()> {
-        let recorder = RecorderWriter::default();
-        let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
-        write!(writer, b"a");
-        write!(writer, b"b");
-        write!(writer, b"c");
-        write!(writer, b"d");
-        write!(writer, b"e");
-        let recorder = writer.flush_and_into_inner(&test_ctx()).await?;
-        assert_eq!(
-            recorder.writes,
-            vec![Vec::from(b"ab"), Vec::from(b"cd"), Vec::from(b"e")]
-        );
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn test_passthrough_writes_only() -> std::io::Result<()> {
-        let recorder = RecorderWriter::default();
-        let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
-        write!(writer, b"abc");
-        write!(writer, b"de");
-        write!(writer, b"");
-        write!(writer, b"fghijk");
-        let recorder = writer.flush_and_into_inner(&test_ctx()).await?;
-        assert_eq!(
-            recorder.writes,
-            vec![Vec::from(b"abc"), Vec::from(b"de"), Vec::from(b"fghijk")]
-        );
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn test_passthrough_write_with_nonempty_buffer() -> std::io::Result<()> {
-        let recorder = RecorderWriter::default();
-        let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
-        write!(writer, b"a");
-        write!(writer, b"bc");
-        write!(writer, b"d");
-        write!(writer, b"e");
-        let recorder = writer.flush_and_into_inner(&test_ctx()).await?;
-        assert_eq!(
-            recorder.writes,
-            vec![Vec::from(b"a"), Vec::from(b"bc"), Vec::from(b"de")]
-        );
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn test_write_all_borrowed_always_goes_through_buffer() -> std::io::Result<()> {
+    async fn test_write_all_borrowed_always_goes_through_buffer() -> anyhow::Result<()> {
         let ctx = test_ctx();
         let ctx = &ctx;
-        let recorder = RecorderWriter::default();
-        let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
+        let recorder = Arc::new(RecorderWriter::default());
+        let gate = utils::sync::gate::Gate::default();
+        let mut writer = BufferedWriter::<_, RecorderWriter>::new(
+            recorder,
+            || IoBufferMut::with_capacity(2),
+            gate.enter()?,
+            ctx,
+        );
 
         writer.write_buffered_borrowed(b"abc", ctx).await?;
+        writer.write_buffered_borrowed(b"", ctx).await?;
         writer.write_buffered_borrowed(b"d", ctx).await?;
         writer.write_buffered_borrowed(b"e", ctx).await?;
         writer.write_buffered_borrowed(b"fg", ctx).await?;
@@ -329,9 +309,9 @@ mod tests {
         writer.write_buffered_borrowed(b"j", ctx).await?;
         writer.write_buffered_borrowed(b"klmno", ctx).await?;
 
-        let recorder = writer.flush_and_into_inner(ctx).await?;
+        let (_, recorder) = writer.flush_and_into_inner(ctx).await?;
         assert_eq!(
-            recorder.writes,
+            recorder.get_writes(),
             {
                 let expect: &[&[u8]] = &[b"ab", b"cd", b"ef", b"gh", b"ij", b"kl", b"mn", b"o"];
                 expect
diff --git a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs
new file mode 100644
index 0000000000..9ce8b311bb
--- /dev/null
+++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs
@@ -0,0 +1,314 @@
+use std::sync::Arc;
+
+use utils::sync::duplex;
+
+use crate::{
+    context::RequestContext,
+    virtual_file::owned_buffers_io::{io_buf_aligned::IoBufAligned, io_buf_ext::FullSlice},
+};
+
+use super::{Buffer, CheapCloneForRead, OwnedAsyncWriter};
+
+/// A handle to the flush task.
+pub struct FlushHandle<Buf, W> {
+    inner: Option<FlushHandleInner<Buf, W>>,
+    /// Immutable buffer for serving tail reads.
+    /// `None` if no flush request has been submitted.
+    pub(super) maybe_flushed: Option<FullSlice<Buf>>,
+}
+
+pub struct FlushHandleInner<Buf, W> {
+    /// A bi-directional channel that sends (buffer, offset) for writes,
+    /// and receives recyled buffer.
+    channel: duplex::mpsc::Duplex<FlushRequest<Buf>, FullSlice<Buf>>,
+    /// Join handle for the background flush task.
+    join_handle: tokio::task::JoinHandle<std::io::Result<Arc<W>>>,
+}
+
+struct FlushRequest<Buf> {
+    slice: FullSlice<Buf>,
+    offset: u64,
+    #[cfg(test)]
+    ready_to_flush_rx: tokio::sync::oneshot::Receiver<()>,
+    #[cfg(test)]
+    done_flush_tx: tokio::sync::oneshot::Sender<()>,
+}
+
+/// Constructs a request and a control object for a new flush operation.
+#[cfg(not(test))]
+fn new_flush_op<Buf>(slice: FullSlice<Buf>, offset: u64) -> (FlushRequest<Buf>, FlushControl) {
+    let request = FlushRequest { slice, offset };
+    let control = FlushControl::untracked();
+
+    (request, control)
+}
+
+/// Constructs a request and a control object for a new flush operation.
+#[cfg(test)]
+fn new_flush_op<Buf>(slice: FullSlice<Buf>, offset: u64) -> (FlushRequest<Buf>, FlushControl) {
+    let (ready_to_flush_tx, ready_to_flush_rx) = tokio::sync::oneshot::channel();
+    let (done_flush_tx, done_flush_rx) = tokio::sync::oneshot::channel();
+    let control = FlushControl::not_started(ready_to_flush_tx, done_flush_rx);
+
+    let request = FlushRequest {
+        slice,
+        offset,
+        ready_to_flush_rx,
+        done_flush_tx,
+    };
+    (request, control)
+}
+
+/// A handle to a `FlushRequest` that allows unit tests precise control over flush behavior.
+#[cfg(test)]
+pub(crate) struct FlushControl {
+    not_started: FlushNotStarted,
+}
+
+#[cfg(not(test))]
+pub(crate) struct FlushControl;
+
+impl FlushControl {
+    #[cfg(test)]
+    fn not_started(
+        ready_to_flush_tx: tokio::sync::oneshot::Sender<()>,
+        done_flush_rx: tokio::sync::oneshot::Receiver<()>,
+    ) -> Self {
+        FlushControl {
+            not_started: FlushNotStarted {
+                ready_to_flush_tx,
+                done_flush_rx,
+            },
+        }
+    }
+
+    #[cfg(not(test))]
+    fn untracked() -> Self {
+        FlushControl
+    }
+
+    /// In tests, turn flush control into a not started state.
+    #[cfg(test)]
+    pub(crate) fn into_not_started(self) -> FlushNotStarted {
+        self.not_started
+    }
+
+    /// Release control to the submitted buffer.
+    ///
+    /// In `cfg(test)` environment, the buffer is guranteed to be flushed to disk after [`FlushControl::release`] is finishes execution.
+    pub async fn release(self) {
+        #[cfg(test)]
+        {
+            self.not_started
+                .ready_to_flush()
+                .wait_until_flush_is_done()
+                .await;
+        }
+    }
+}
+
+impl<Buf, W> FlushHandle<Buf, W>
+where
+    Buf: IoBufAligned + Send + Sync + CheapCloneForRead,
+    W: OwnedAsyncWriter + Send + Sync + 'static + std::fmt::Debug,
+{
+    /// Spawns a new background flush task and obtains a handle.
+    ///
+    /// Note: The background task so we do not need to explicitly maintain a queue of buffers.
+    pub fn spawn_new<B>(
+        file: Arc<W>,
+        buf: B,
+        gate_guard: utils::sync::gate::GateGuard,
+        ctx: RequestContext,
+    ) -> Self
+    where
+        B: Buffer<IoBuf = Buf> + Send + 'static,
+    {
+        // It is fine to buffer up to only 1 message. We only 1 message in-flight at a time.
+        let (front, back) = duplex::mpsc::channel(1);
+
+        let join_handle = tokio::spawn(async move {
+            FlushBackgroundTask::new(back, file, gate_guard, ctx)
+                .run(buf.flush())
+                .await
+        });
+
+        FlushHandle {
+            inner: Some(FlushHandleInner {
+                channel: front,
+                join_handle,
+            }),
+            maybe_flushed: None,
+        }
+    }
+
+    /// Submits a buffer to be flushed in the background task.
+    /// Returns a buffer that completed flushing for re-use, length reset to 0, capacity unchanged.
+    /// If `save_buf_for_read` is true, then we save the buffer in `Self::maybe_flushed`, otherwise
+    /// clear `maybe_flushed`.
+    pub async fn flush<B>(&mut self, buf: B, offset: u64) -> std::io::Result<(B, FlushControl)>
+    where
+        B: Buffer<IoBuf = Buf> + Send + 'static,
+    {
+        let slice = buf.flush();
+
+        // Saves a buffer for read while flushing. This also removes reference to the old buffer.
+        self.maybe_flushed = Some(slice.cheap_clone());
+
+        let (request, flush_control) = new_flush_op(slice, offset);
+
+        // Submits the buffer to the background task.
+        let submit = self.inner_mut().channel.send(request).await;
+        if submit.is_err() {
+            return self.handle_error().await;
+        }
+
+        // Wait for an available buffer from the background flush task.
+        // This is the BACKPRESSURE mechanism: if the flush task can't keep up,
+        // then the write path will eventually wait for it here.
+        let Some(recycled) = self.inner_mut().channel.recv().await else {
+            return self.handle_error().await;
+        };
+
+        // The only other place that could hold a reference to the recycled buffer
+        // is in `Self::maybe_flushed`, but we have already replace it with the new buffer.
+        let recycled = Buffer::reuse_after_flush(recycled.into_raw_slice().into_inner());
+        Ok((recycled, flush_control))
+    }
+
+    async fn handle_error<T>(&mut self) -> std::io::Result<T> {
+        Err(self
+            .shutdown()
+            .await
+            .expect_err("flush task only disconnects duplex if it exits with an error"))
+    }
+
+    /// Cleans up the channel, join the flush task.
+    pub async fn shutdown(&mut self) -> std::io::Result<Arc<W>> {
+        let handle = self
+            .inner
+            .take()
+            .expect("must not use after we returned an error");
+        drop(handle.channel.tx);
+        handle.join_handle.await.unwrap()
+    }
+
+    /// Gets a mutable reference to the inner handle. Panics if [`Self::inner`] is `None`.
+    /// This only happens if the handle is used after an error.
+    fn inner_mut(&mut self) -> &mut FlushHandleInner<Buf, W> {
+        self.inner
+            .as_mut()
+            .expect("must not use after we returned an error")
+    }
+}
+
+/// A background task for flushing data to disk.
+pub struct FlushBackgroundTask<Buf, W> {
+    /// A bi-directional channel that receives (buffer, offset) for writes,
+    /// and send back recycled buffer.
+    channel: duplex::mpsc::Duplex<FullSlice<Buf>, FlushRequest<Buf>>,
+    /// A writter for persisting data to disk.
+    writer: Arc<W>,
+    ctx: RequestContext,
+    /// Prevent timeline from shuting down until the flush background task finishes flushing all remaining buffers to disk.
+    _gate_guard: utils::sync::gate::GateGuard,
+}
+
+impl<Buf, W> FlushBackgroundTask<Buf, W>
+where
+    Buf: IoBufAligned + Send + Sync,
+    W: OwnedAsyncWriter + Sync + 'static,
+{
+    /// Creates a new background flush task.
+    fn new(
+        channel: duplex::mpsc::Duplex<FullSlice<Buf>, FlushRequest<Buf>>,
+        file: Arc<W>,
+        gate_guard: utils::sync::gate::GateGuard,
+        ctx: RequestContext,
+    ) -> Self {
+        FlushBackgroundTask {
+            channel,
+            writer: file,
+            _gate_guard: gate_guard,
+            ctx,
+        }
+    }
+
+    /// Runs the background flush task.
+    /// The passed in slice is immediately sent back to the flush handle through the duplex channel.
+    async fn run(mut self, slice: FullSlice<Buf>) -> std::io::Result<Arc<W>> {
+        // Sends the extra buffer back to the handle.
+        self.channel.send(slice).await.map_err(|_| {
+            std::io::Error::new(std::io::ErrorKind::BrokenPipe, "flush handle closed early")
+        })?;
+
+        //  Exit condition: channel is closed and there is no remaining buffer to be flushed
+        while let Some(request) = self.channel.recv().await {
+            #[cfg(test)]
+            {
+                // In test, wait for control to signal that we are ready to flush.
+                if request.ready_to_flush_rx.await.is_err() {
+                    tracing::debug!("control dropped");
+                }
+            }
+
+            // Write slice to disk at `offset`.
+            let slice = self
+                .writer
+                .write_all_at(request.slice, request.offset, &self.ctx)
+                .await?;
+
+            #[cfg(test)]
+            {
+                // In test, tell control we are done flushing buffer.
+                if request.done_flush_tx.send(()).is_err() {
+                    tracing::debug!("control dropped");
+                }
+            }
+
+            // Sends the buffer back to the handle for reuse. The handle is in charged of cleaning the buffer.
+            if self.channel.send(slice).await.is_err() {
+                // Although channel is closed. Still need to finish flushing the remaining buffers.
+                continue;
+            }
+        }
+
+        Ok(self.writer)
+    }
+}
+
+#[cfg(test)]
+pub(crate) struct FlushNotStarted {
+    ready_to_flush_tx: tokio::sync::oneshot::Sender<()>,
+    done_flush_rx: tokio::sync::oneshot::Receiver<()>,
+}
+
+#[cfg(test)]
+pub(crate) struct FlushInProgress {
+    done_flush_rx: tokio::sync::oneshot::Receiver<()>,
+}
+
+#[cfg(test)]
+pub(crate) struct FlushDone;
+
+#[cfg(test)]
+impl FlushNotStarted {
+    /// Signals the background task the buffer is ready to flush to disk.
+    pub fn ready_to_flush(self) -> FlushInProgress {
+        self.ready_to_flush_tx
+            .send(())
+            .map(|_| FlushInProgress {
+                done_flush_rx: self.done_flush_rx,
+            })
+            .unwrap()
+    }
+}
+
+#[cfg(test)]
+impl FlushInProgress {
+    /// Waits until background flush is done.
+    pub async fn wait_until_flush_is_done(self) -> FlushDone {
+        self.done_flush_rx.await.unwrap();
+        FlushDone
+    }
+}

From 0bab7e30863c7d41087decf351517c0fb5a2e1b5 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Wed, 4 Dec 2024 17:42:17 +0000
Subject: [PATCH 062/117] chore: update clap (#10009)

This updates clap to use a new version of anstream
---
 Cargo.lock | 62 +++++++++++++++++++++++++++++-------------------------
 1 file changed, 33 insertions(+), 29 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 38158b7aec..de8785f87e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -84,16 +84,16 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299"
 
 [[package]]
 name = "anstream"
-version = "0.3.2"
+version = "0.6.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0ca84f3628370c59db74ee214b3263d58f9aadd9b4fe7e711fd87dc452b7f163"
+checksum = "64e15c1ab1f89faffbf04a634d5e1962e9074f2741eef6d97f3c4e322426d526"
 dependencies = [
  "anstyle",
  "anstyle-parse",
  "anstyle-query",
  "anstyle-wincon",
  "colorchoice",
- "is-terminal",
+ "is_terminal_polyfill",
  "utf8parse",
 ]
 
@@ -123,12 +123,12 @@ dependencies = [
 
 [[package]]
 name = "anstyle-wincon"
-version = "1.0.1"
+version = "3.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "180abfa45703aebe0093f79badacc01b8fd4ea2e35118747e5811127f926e188"
+checksum = "5bf74e1b6e971609db8ca7a9ce79fd5768ab6ae46441c572e46cf596f59e57f8"
 dependencies = [
  "anstyle",
- "windows-sys 0.48.0",
+ "windows-sys 0.52.0",
 ]
 
 [[package]]
@@ -1167,35 +1167,33 @@ dependencies = [
 
 [[package]]
 name = "clap"
-version = "4.3.0"
+version = "4.5.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "93aae7a4192245f70fe75dd9157fc7b4a5bf53e88d30bd4396f7d8f9284d5acc"
+checksum = "69371e34337c4c984bbe322360c2547210bf632eb2814bbe78a6e87a2935bd2b"
 dependencies = [
  "clap_builder",
  "clap_derive",
- "once_cell",
 ]
 
 [[package]]
 name = "clap_builder"
-version = "4.3.0"
+version = "4.5.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4f423e341edefb78c9caba2d9c7f7687d0e72e89df3ce3394554754393ac3990"
+checksum = "6e24c1b4099818523236a8ca881d2b45db98dadfb4625cf6608c12069fcbbde1"
 dependencies = [
  "anstream",
  "anstyle",
- "bitflags 1.3.2",
  "clap_lex",
- "strsim",
+ "strsim 0.11.1",
 ]
 
 [[package]]
 name = "clap_derive"
-version = "4.3.0"
+version = "4.5.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "191d9573962933b4027f932c600cd252ce27a8ad5979418fe78e43c07996f27b"
+checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab"
 dependencies = [
- "heck 0.4.1",
+ "heck",
  "proc-macro2",
  "quote",
  "syn 2.0.90",
@@ -1203,9 +1201,9 @@ dependencies = [
 
 [[package]]
 name = "clap_lex"
-version = "0.5.0"
+version = "0.7.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b"
+checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7"
 
 [[package]]
 name = "colorchoice"
@@ -1614,7 +1612,7 @@ dependencies = [
  "ident_case",
  "proc-macro2",
  "quote",
- "strsim",
+ "strsim 0.10.0",
  "syn 2.0.90",
 ]
 
@@ -1812,7 +1810,7 @@ checksum = "0892a17df262a24294c382f0d5997571006e7a4348b4327557c4ff1cd4a8bccc"
 dependencies = [
  "darling",
  "either",
- "heck 0.5.0",
+ "heck",
  "proc-macro2",
  "quote",
  "syn 2.0.90",
@@ -2465,12 +2463,6 @@ dependencies = [
  "num-traits",
 ]
 
-[[package]]
-name = "heck"
-version = "0.4.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
-
 [[package]]
 name = "heck"
 version = "0.5.0"
@@ -2888,6 +2880,12 @@ dependencies = [
  "windows-sys 0.52.0",
 ]
 
+[[package]]
+name = "is_terminal_polyfill"
+version = "1.70.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
+
 [[package]]
 name = "itertools"
 version = "0.10.5"
@@ -3169,7 +3167,7 @@ version = "0.0.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94"
 dependencies = [
- "heck 0.5.0",
+ "heck",
  "proc-macro2",
  "quote",
  "syn 2.0.90",
@@ -4458,7 +4456,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15"
 dependencies = [
  "bytes",
- "heck 0.5.0",
+ "heck",
  "itertools 0.12.1",
  "log",
  "multimap",
@@ -6166,6 +6164,12 @@ version = "0.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
 
+[[package]]
+name = "strsim"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
+
 [[package]]
 name = "strum"
 version = "0.26.3"
@@ -6178,7 +6182,7 @@ version = "0.26.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be"
 dependencies = [
- "heck 0.5.0",
+ "heck",
  "proc-macro2",
  "quote",
  "rustversion",

From 131585eb6bd206907a969f8eab44017b282d1556 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Wed, 4 Dec 2024 21:07:44 +0000
Subject: [PATCH 063/117] chore: update rust-postgres (#10002)

Like #9931 but without rebasing upstream just yet, to try and minimise
the differences.

Removes all proxy-specific commits from the rust-postgres fork, now that
proxy no longer depends on them. Merging upstream changes to come later.
---
 Cargo.lock | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index de8785f87e..62f06d45bd 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4169,7 +4169,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473"
 dependencies = [
  "bytes",
  "fallible-iterator",
@@ -4182,7 +4182,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473"
 dependencies = [
  "base64 0.20.0",
  "byteorder",
@@ -4195,7 +4195,6 @@ dependencies = [
  "rand 0.8.5",
  "sha2",
  "stringprep",
- "tokio",
 ]
 
 [[package]]
@@ -4217,7 +4216,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473"
 dependencies = [
  "bytes",
  "fallible-iterator",
@@ -6547,7 +6546,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473"
 dependencies = [
  "async-trait",
  "byteorder",

From ed2d89211306ca892dce41159bc1cc8e9e1646a5 Mon Sep 17 00:00:00 2001
From: Yuchen Liang <70461588+yliang412@users.noreply.github.com>
Date: Wed, 4 Dec 2024 21:16:09 -0500
Subject: [PATCH 064/117] pageserver: fix buffered-writer on macos build
 (#10019)

## Problem

In https://github.com/neondatabase/neon/pull/9693, we forgot to check
macos build. The [CI
run](https://github.com/neondatabase/neon/actions/runs/12164541897/job/33926455468)
on main showed that macos build failed with unused variables and dead
code.

## Summary of changes

- add `allow(dead_code)` and `allow(unused_variables)` to the relevant
code that is not used on macos.

Signed-off-by: Yuchen Liang <yuchen@neon.tech>
---
 pageserver/src/tenant/remote_timeline_client/download.rs | 7 ++++---
 pageserver/src/virtual_file/owned_buffers_io/write.rs    | 1 +
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs
index c5ae466f3a..d15f161fb6 100644
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -6,7 +6,6 @@
 use std::collections::HashSet;
 use std::future::Future;
 use std::str::FromStr;
-use std::sync::Arc;
 use std::time::SystemTime;
 
 use anyhow::{anyhow, Context};
@@ -27,7 +26,7 @@ use crate::span::{
 use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
 use crate::tenant::storage_layer::LayerName;
 use crate::tenant::Generation;
-use crate::virtual_file::{on_fatal_io_error, IoBufferMut, MaybeFatalIo, VirtualFile};
+use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
 use crate::TEMP_FILE_SUFFIX;
 use remote_storage::{
     DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath,
@@ -150,7 +149,7 @@ async fn download_object<'a>(
     storage: &'a GenericRemoteStorage,
     src_path: &RemotePath,
     dst_path: &Utf8PathBuf,
-    gate: &utils::sync::gate::Gate,
+    #[cfg_attr(target_os = "macos", allow(unused_variables))] gate: &utils::sync::gate::Gate,
     cancel: &CancellationToken,
     #[cfg_attr(target_os = "macos", allow(unused_variables))] ctx: &RequestContext,
 ) -> Result<u64, DownloadError> {
@@ -209,6 +208,8 @@ async fn download_object<'a>(
         #[cfg(target_os = "linux")]
         crate::virtual_file::io_engine::IoEngine::TokioEpollUring => {
             use crate::virtual_file::owned_buffers_io;
+            use crate::virtual_file::IoBufferMut;
+            use std::sync::Arc;
             async {
                 let destination_file = Arc::new(
                     VirtualFile::create(dst_path, ctx)
diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs
index 20bf878123..7299d83703 100644
--- a/pageserver/src/virtual_file/owned_buffers_io/write.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs
@@ -132,6 +132,7 @@ where
             .expect("must not use after we returned an error")
     }
 
+    #[cfg_attr(target_os = "macos", allow(dead_code))]
     pub async fn write_buffered_borrowed(
         &mut self,
         chunk: &[u8],

From ffc9c33eb2383f9970a246bce8712772c7696080 Mon Sep 17 00:00:00 2001
From: Ivan Efremov <ivan@neon.tech>
Date: Thu, 5 Dec 2024 07:30:38 +0200
Subject: [PATCH 065/117] proxy: Present new auth backend cplane_proxy_v1
 (#10012)

Implement a new auth backend based on the current Neon backend to switch
to the new Proxy V1 cplane API.

Implements [#21048](https://github.com/neondatabase/cloud/issues/21048)
---
 proxy/src/auth/backend/mod.rs                 |   4 +
 proxy/src/bin/proxy.rs                        |  99 +++-
 .../control_plane/client/cplane_proxy_v1.rs   | 514 ++++++++++++++++++
 proxy/src/control_plane/client/mod.rs         |   7 +
 proxy/src/control_plane/client/neon.rs        |   2 +-
 proxy/src/control_plane/messages.rs           |  10 +
 6 files changed, 634 insertions(+), 2 deletions(-)
 create mode 100644 proxy/src/control_plane/client/cplane_proxy_v1.rs

diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs
index 84a572dcf9..1bad7b3086 100644
--- a/proxy/src/auth/backend/mod.rs
+++ b/proxy/src/auth/backend/mod.rs
@@ -70,6 +70,10 @@ impl std::fmt::Display for Backend<'_, ()> {
     fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
             Self::ControlPlane(api, ()) => match &**api {
+                ControlPlaneClient::ProxyV1(endpoint) => fmt
+                    .debug_tuple("ControlPlane::ProxyV1")
+                    .field(&endpoint.url())
+                    .finish(),
                 ControlPlaneClient::Neon(endpoint) => fmt
                     .debug_tuple("ControlPlane::Neon")
                     .field(&endpoint.url())
diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs
index c929b97d78..99144acef0 100644
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -46,6 +46,9 @@ enum AuthBackendType {
     #[value(name("console"), alias("cplane"))]
     ControlPlane,
 
+    #[value(name("cplane-v1"), alias("control-plane"))]
+    ControlPlaneV1,
+
     #[value(name("link"), alias("control-redirect"))]
     ConsoleRedirect,
 
@@ -518,6 +521,39 @@ async fn main() -> anyhow::Result<()> {
                         .instrument(span),
                 );
             }
+        } else if let proxy::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api {
+            match (redis_notifications_client, regional_redis_client.clone()) {
+                (None, None) => {}
+                (client1, client2) => {
+                    let cache = api.caches.project_info.clone();
+                    if let Some(client) = client1 {
+                        maintenance_tasks.spawn(notifications::task_main(
+                            client,
+                            cache.clone(),
+                            cancel_map.clone(),
+                            args.region.clone(),
+                        ));
+                    }
+                    if let Some(client) = client2 {
+                        maintenance_tasks.spawn(notifications::task_main(
+                            client,
+                            cache.clone(),
+                            cancel_map.clone(),
+                            args.region.clone(),
+                        ));
+                    }
+                    maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
+                }
+            }
+            if let Some(regional_redis_client) = regional_redis_client {
+                let cache = api.caches.endpoints_cache.clone();
+                let con = regional_redis_client;
+                let span = tracing::info_span!("endpoints_cache");
+                maintenance_tasks.spawn(
+                    async move { cache.do_read(con, cancellation_token.clone()).await }
+                        .instrument(span),
+                );
+            }
         }
     }
 
@@ -662,6 +698,65 @@ fn build_auth_backend(
     args: &ProxyCliArgs,
 ) -> anyhow::Result<Either<&'static auth::Backend<'static, ()>, &'static ConsoleRedirectBackend>> {
     match &args.auth_backend {
+        AuthBackendType::ControlPlaneV1 => {
+            let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
+            let project_info_cache_config: ProjectInfoCacheOptions =
+                args.project_info_cache.parse()?;
+            let endpoint_cache_config: config::EndpointCacheConfig =
+                args.endpoint_cache_config.parse()?;
+
+            info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}");
+            info!(
+                "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}"
+            );
+            info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}");
+            let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new(
+                wake_compute_cache_config,
+                project_info_cache_config,
+                endpoint_cache_config,
+            )));
+
+            let config::ConcurrencyLockOptions {
+                shards,
+                limiter,
+                epoch,
+                timeout,
+            } = args.wake_compute_lock.parse()?;
+            info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)");
+            let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new(
+                "wake_compute_lock",
+                limiter,
+                shards,
+                timeout,
+                epoch,
+                &Metrics::get().wake_compute_lock,
+            )?));
+            tokio::spawn(locks.garbage_collect_worker());
+
+            let url: proxy::url::ApiUrl = args.auth_endpoint.parse()?;
+
+            let endpoint = http::Endpoint::new(url, http::new_client());
+
+            let mut wake_compute_rps_limit = args.wake_compute_limit.clone();
+            RateBucketInfo::validate(&mut wake_compute_rps_limit)?;
+            let wake_compute_endpoint_rate_limiter =
+                Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit));
+
+            let api = control_plane::client::cplane_proxy_v1::NeonControlPlaneClient::new(
+                endpoint,
+                args.control_plane_token.clone(),
+                caches,
+                locks,
+                wake_compute_endpoint_rate_limiter,
+            );
+
+            let api = control_plane::client::ControlPlaneClient::ProxyV1(api);
+            let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ());
+            let config = Box::leak(Box::new(auth_backend));
+
+            Ok(Either::Left(config))
+        }
+
         AuthBackendType::ControlPlane => {
             let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
             let project_info_cache_config: ProjectInfoCacheOptions =
@@ -697,13 +792,15 @@ fn build_auth_backend(
             )?));
             tokio::spawn(locks.garbage_collect_worker());
 
-            let url = args.auth_endpoint.parse()?;
+            let url: proxy::url::ApiUrl = args.auth_endpoint.parse()?;
+
             let endpoint = http::Endpoint::new(url, http::new_client());
 
             let mut wake_compute_rps_limit = args.wake_compute_limit.clone();
             RateBucketInfo::validate(&mut wake_compute_rps_limit)?;
             let wake_compute_endpoint_rate_limiter =
                 Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit));
+
             let api = control_plane::client::neon::NeonControlPlaneClient::new(
                 endpoint,
                 args.control_plane_token.clone(),
diff --git a/proxy/src/control_plane/client/cplane_proxy_v1.rs b/proxy/src/control_plane/client/cplane_proxy_v1.rs
new file mode 100644
index 0000000000..e33a37f643
--- /dev/null
+++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs
@@ -0,0 +1,514 @@
+//! Production console backend.
+
+use std::sync::Arc;
+use std::time::Duration;
+
+use ::http::header::AUTHORIZATION;
+use ::http::HeaderName;
+use futures::TryFutureExt;
+use postgres_client::config::SslMode;
+use tokio::time::Instant;
+use tracing::{debug, info, info_span, warn, Instrument};
+
+use super::super::messages::{ControlPlaneErrorMessage, GetEndpointAccessControl, WakeCompute};
+use crate::auth::backend::jwt::AuthRule;
+use crate::auth::backend::ComputeUserInfo;
+use crate::cache::Cached;
+use crate::context::RequestContext;
+use crate::control_plane::caches::ApiCaches;
+use crate::control_plane::errors::{
+    ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError,
+};
+use crate::control_plane::locks::ApiLocks;
+use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason};
+use crate::control_plane::{
+    AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo,
+};
+use crate::metrics::{CacheOutcome, Metrics};
+use crate::rate_limiter::WakeComputeRateLimiter;
+use crate::types::{EndpointCacheKey, EndpointId};
+use crate::{compute, http, scram};
+
+const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id");
+
+#[derive(Clone)]
+pub struct NeonControlPlaneClient {
+    endpoint: http::Endpoint,
+    pub caches: &'static ApiCaches,
+    pub(crate) locks: &'static ApiLocks<EndpointCacheKey>,
+    pub(crate) wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
+    // put in a shared ref so we don't copy secrets all over in memory
+    jwt: Arc<str>,
+}
+
+impl NeonControlPlaneClient {
+    /// Construct an API object containing the auth parameters.
+    pub fn new(
+        endpoint: http::Endpoint,
+        jwt: Arc<str>,
+        caches: &'static ApiCaches,
+        locks: &'static ApiLocks<EndpointCacheKey>,
+        wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
+    ) -> Self {
+        Self {
+            endpoint,
+            caches,
+            locks,
+            wake_compute_endpoint_rate_limiter,
+            jwt,
+        }
+    }
+
+    pub(crate) fn url(&self) -> &str {
+        self.endpoint.url().as_str()
+    }
+
+    async fn do_get_auth_info(
+        &self,
+        ctx: &RequestContext,
+        user_info: &ComputeUserInfo,
+    ) -> Result<AuthInfo, GetAuthInfoError> {
+        if !self
+            .caches
+            .endpoints_cache
+            .is_valid(ctx, &user_info.endpoint.normalize())
+        {
+            // TODO: refactor this because it's weird
+            // this is a failure to authenticate but we return Ok.
+            info!("endpoint is not valid, skipping the request");
+            return Ok(AuthInfo::default());
+        }
+        let request_id = ctx.session_id().to_string();
+        let application_name = ctx.console_application_name();
+        async {
+            let request = self
+                .endpoint
+                .get_path("get_endpoint_access_control")
+                .header(X_REQUEST_ID, &request_id)
+                .header(AUTHORIZATION, format!("Bearer {}", &self.jwt))
+                .query(&[("session_id", ctx.session_id())])
+                .query(&[
+                    ("application_name", application_name.as_str()),
+                    ("endpointish", user_info.endpoint.as_str()),
+                    ("role", user_info.user.as_str()),
+                ])
+                .build()?;
+
+            debug!(url = request.url().as_str(), "sending http request");
+            let start = Instant::now();
+            let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
+            let response = self.endpoint.execute(request).await?;
+            drop(pause);
+            info!(duration = ?start.elapsed(), "received http response");
+            let body = match parse_body::<GetEndpointAccessControl>(response).await {
+                Ok(body) => body,
+                // Error 404 is special: it's ok not to have a secret.
+                // TODO(anna): retry
+                Err(e) => {
+                    return if e.get_reason().is_not_found() {
+                        // TODO: refactor this because it's weird
+                        // this is a failure to authenticate but we return Ok.
+                        Ok(AuthInfo::default())
+                    } else {
+                        Err(e.into())
+                    };
+                }
+            };
+
+            // Ivan: don't know where it will be used, so I leave it here
+            let _endpoint_vpc_ids = body.allowed_vpc_endpoint_ids.unwrap_or_default();
+
+            let secret = if body.role_secret.is_empty() {
+                None
+            } else {
+                let secret = scram::ServerSecret::parse(&body.role_secret)
+                    .map(AuthSecret::Scram)
+                    .ok_or(GetAuthInfoError::BadSecret)?;
+                Some(secret)
+            };
+            let allowed_ips = body.allowed_ips.unwrap_or_default();
+            Metrics::get()
+                .proxy
+                .allowed_ips_number
+                .observe(allowed_ips.len() as f64);
+            Ok(AuthInfo {
+                secret,
+                allowed_ips,
+                project_id: body.project_id,
+            })
+        }
+        .inspect_err(|e| tracing::debug!(error = ?e))
+        .instrument(info_span!("do_get_auth_info"))
+        .await
+    }
+
+    async fn do_get_endpoint_jwks(
+        &self,
+        ctx: &RequestContext,
+        endpoint: EndpointId,
+    ) -> Result<Vec<AuthRule>, GetEndpointJwksError> {
+        if !self
+            .caches
+            .endpoints_cache
+            .is_valid(ctx, &endpoint.normalize())
+        {
+            return Err(GetEndpointJwksError::EndpointNotFound);
+        }
+        let request_id = ctx.session_id().to_string();
+        async {
+            let request = self
+                .endpoint
+                .get_with_url(|url| {
+                    url.path_segments_mut()
+                        .push("endpoints")
+                        .push(endpoint.as_str())
+                        .push("jwks");
+                })
+                .header(X_REQUEST_ID, &request_id)
+                .header(AUTHORIZATION, format!("Bearer {}", &self.jwt))
+                .query(&[("session_id", ctx.session_id())])
+                .build()
+                .map_err(GetEndpointJwksError::RequestBuild)?;
+
+            debug!(url = request.url().as_str(), "sending http request");
+            let start = Instant::now();
+            let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
+            let response = self
+                .endpoint
+                .execute(request)
+                .await
+                .map_err(GetEndpointJwksError::RequestExecute)?;
+            drop(pause);
+            info!(duration = ?start.elapsed(), "received http response");
+
+            let body = parse_body::<EndpointJwksResponse>(response).await?;
+
+            let rules = body
+                .jwks
+                .into_iter()
+                .map(|jwks| AuthRule {
+                    id: jwks.id,
+                    jwks_url: jwks.jwks_url,
+                    audience: jwks.jwt_audience,
+                    role_names: jwks.role_names,
+                })
+                .collect();
+
+            Ok(rules)
+        }
+        .inspect_err(|e| tracing::debug!(error = ?e))
+        .instrument(info_span!("do_get_endpoint_jwks"))
+        .await
+    }
+
+    async fn do_wake_compute(
+        &self,
+        ctx: &RequestContext,
+        user_info: &ComputeUserInfo,
+    ) -> Result<NodeInfo, WakeComputeError> {
+        let request_id = ctx.session_id().to_string();
+        let application_name = ctx.console_application_name();
+        async {
+            let mut request_builder = self
+                .endpoint
+                .get_path("wake_compute")
+                .header("X-Request-ID", &request_id)
+                .header("Authorization", format!("Bearer {}", &self.jwt))
+                .query(&[("session_id", ctx.session_id())])
+                .query(&[
+                    ("application_name", application_name.as_str()),
+                    ("endpointish", user_info.endpoint.as_str()),
+                ]);
+
+            let options = user_info.options.to_deep_object();
+            if !options.is_empty() {
+                request_builder = request_builder.query(&options);
+            }
+
+            let request = request_builder.build()?;
+
+            debug!(url = request.url().as_str(), "sending http request");
+            let start = Instant::now();
+            let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
+            let response = self.endpoint.execute(request).await?;
+            drop(pause);
+            info!(duration = ?start.elapsed(), "received http response");
+            let body = parse_body::<WakeCompute>(response).await?;
+
+            // Unfortunately, ownership won't let us use `Option::ok_or` here.
+            let (host, port) = match parse_host_port(&body.address) {
+                None => return Err(WakeComputeError::BadComputeAddress(body.address)),
+                Some(x) => x,
+            };
+
+            // Don't set anything but host and port! This config will be cached.
+            // We'll set username and such later using the startup message.
+            // TODO: add more type safety (in progress).
+            let mut config = compute::ConnCfg::new(host.to_owned(), port);
+            config.ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes.
+
+            let node = NodeInfo {
+                config,
+                aux: body.aux,
+                allow_self_signed_compute: false,
+            };
+
+            Ok(node)
+        }
+        .inspect_err(|e| tracing::debug!(error = ?e))
+        .instrument(info_span!("do_wake_compute"))
+        .await
+    }
+}
+
+impl super::ControlPlaneApi for NeonControlPlaneClient {
+    #[tracing::instrument(skip_all)]
+    async fn get_role_secret(
+        &self,
+        ctx: &RequestContext,
+        user_info: &ComputeUserInfo,
+    ) -> Result<CachedRoleSecret, GetAuthInfoError> {
+        let normalized_ep = &user_info.endpoint.normalize();
+        let user = &user_info.user;
+        if let Some(role_secret) = self
+            .caches
+            .project_info
+            .get_role_secret(normalized_ep, user)
+        {
+            return Ok(role_secret);
+        }
+        let auth_info = self.do_get_auth_info(ctx, user_info).await?;
+        if let Some(project_id) = auth_info.project_id {
+            let normalized_ep_int = normalized_ep.into();
+            self.caches.project_info.insert_role_secret(
+                project_id,
+                normalized_ep_int,
+                user.into(),
+                auth_info.secret.clone(),
+            );
+            self.caches.project_info.insert_allowed_ips(
+                project_id,
+                normalized_ep_int,
+                Arc::new(auth_info.allowed_ips),
+            );
+            ctx.set_project_id(project_id);
+        }
+        // When we just got a secret, we don't need to invalidate it.
+        Ok(Cached::new_uncached(auth_info.secret))
+    }
+
+    async fn get_allowed_ips_and_secret(
+        &self,
+        ctx: &RequestContext,
+        user_info: &ComputeUserInfo,
+    ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
+        let normalized_ep = &user_info.endpoint.normalize();
+        if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) {
+            Metrics::get()
+                .proxy
+                .allowed_ips_cache_misses
+                .inc(CacheOutcome::Hit);
+            return Ok((allowed_ips, None));
+        }
+        Metrics::get()
+            .proxy
+            .allowed_ips_cache_misses
+            .inc(CacheOutcome::Miss);
+        let auth_info = self.do_get_auth_info(ctx, user_info).await?;
+        let allowed_ips = Arc::new(auth_info.allowed_ips);
+        let user = &user_info.user;
+        if let Some(project_id) = auth_info.project_id {
+            let normalized_ep_int = normalized_ep.into();
+            self.caches.project_info.insert_role_secret(
+                project_id,
+                normalized_ep_int,
+                user.into(),
+                auth_info.secret.clone(),
+            );
+            self.caches.project_info.insert_allowed_ips(
+                project_id,
+                normalized_ep_int,
+                allowed_ips.clone(),
+            );
+            ctx.set_project_id(project_id);
+        }
+        Ok((
+            Cached::new_uncached(allowed_ips),
+            Some(Cached::new_uncached(auth_info.secret)),
+        ))
+    }
+
+    #[tracing::instrument(skip_all)]
+    async fn get_endpoint_jwks(
+        &self,
+        ctx: &RequestContext,
+        endpoint: EndpointId,
+    ) -> Result<Vec<AuthRule>, GetEndpointJwksError> {
+        self.do_get_endpoint_jwks(ctx, endpoint).await
+    }
+
+    #[tracing::instrument(skip_all)]
+    async fn wake_compute(
+        &self,
+        ctx: &RequestContext,
+        user_info: &ComputeUserInfo,
+    ) -> Result<CachedNodeInfo, WakeComputeError> {
+        let key = user_info.endpoint_cache_key();
+
+        macro_rules! check_cache {
+            () => {
+                if let Some(cached) = self.caches.node_info.get(&key) {
+                    let (cached, info) = cached.take_value();
+                    let info = info.map_err(|c| {
+                        info!(key = &*key, "found cached wake_compute error");
+                        WakeComputeError::ControlPlane(ControlPlaneError::Message(Box::new(*c)))
+                    })?;
+
+                    debug!(key = &*key, "found cached compute node info");
+                    ctx.set_project(info.aux.clone());
+                    return Ok(cached.map(|()| info));
+                }
+            };
+        }
+
+        // Every time we do a wakeup http request, the compute node will stay up
+        // for some time (highly depends on the console's scale-to-zero policy);
+        // The connection info remains the same during that period of time,
+        // which means that we might cache it to reduce the load and latency.
+        check_cache!();
+
+        let permit = self.locks.get_permit(&key).await?;
+
+        // after getting back a permit - it's possible the cache was filled
+        // double check
+        if permit.should_check_cache() {
+            // TODO: if there is something in the cache, mark the permit as success.
+            check_cache!();
+        }
+
+        // check rate limit
+        if !self
+            .wake_compute_endpoint_rate_limiter
+            .check(user_info.endpoint.normalize_intern(), 1)
+        {
+            return Err(WakeComputeError::TooManyConnections);
+        }
+
+        let node = permit.release_result(self.do_wake_compute(ctx, user_info).await);
+        match node {
+            Ok(node) => {
+                ctx.set_project(node.aux.clone());
+                debug!(key = &*key, "created a cache entry for woken compute node");
+
+                let mut stored_node = node.clone();
+                // store the cached node as 'warm_cached'
+                stored_node.aux.cold_start_info = ColdStartInfo::WarmCached;
+
+                let (_, cached) = self.caches.node_info.insert_unit(key, Ok(stored_node));
+
+                Ok(cached.map(|()| node))
+            }
+            Err(err) => match err {
+                WakeComputeError::ControlPlane(ControlPlaneError::Message(err)) => {
+                    let Some(status) = &err.status else {
+                        return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
+                            err,
+                        )));
+                    };
+
+                    let reason = status
+                        .details
+                        .error_info
+                        .map_or(Reason::Unknown, |x| x.reason);
+
+                    // if we can retry this error, do not cache it.
+                    if reason.can_retry() {
+                        return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
+                            err,
+                        )));
+                    }
+
+                    // at this point, we should only have quota errors.
+                    debug!(
+                        key = &*key,
+                        "created a cache entry for the wake compute error"
+                    );
+
+                    self.caches.node_info.insert_ttl(
+                        key,
+                        Err(err.clone()),
+                        Duration::from_secs(30),
+                    );
+
+                    Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
+                        err,
+                    )))
+                }
+                err => return Err(err),
+            },
+        }
+    }
+}
+
+/// Parse http response body, taking status code into account.
+async fn parse_body<T: for<'a> serde::Deserialize<'a>>(
+    response: http::Response,
+) -> Result<T, ControlPlaneError> {
+    let status = response.status();
+    if status.is_success() {
+        // We shouldn't log raw body because it may contain secrets.
+        info!("request succeeded, processing the body");
+        return Ok(response.json().await?);
+    }
+    let s = response.bytes().await?;
+    // Log plaintext to be able to detect, whether there are some cases not covered by the error struct.
+    info!("response_error plaintext: {:?}", s);
+
+    // Don't throw an error here because it's not as important
+    // as the fact that the request itself has failed.
+    let mut body = serde_json::from_slice(&s).unwrap_or_else(|e| {
+        warn!("failed to parse error body: {e}");
+        ControlPlaneErrorMessage {
+            error: "reason unclear (malformed error message)".into(),
+            http_status_code: status,
+            status: None,
+        }
+    });
+    body.http_status_code = status;
+
+    warn!("console responded with an error ({status}): {body:?}");
+    Err(ControlPlaneError::Message(Box::new(body)))
+}
+
+fn parse_host_port(input: &str) -> Option<(&str, u16)> {
+    let (host, port) = input.rsplit_once(':')?;
+    let ipv6_brackets: &[_] = &['[', ']'];
+    Some((host.trim_matches(ipv6_brackets), port.parse().ok()?))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_parse_host_port_v4() {
+        let (host, port) = parse_host_port("127.0.0.1:5432").expect("failed to parse");
+        assert_eq!(host, "127.0.0.1");
+        assert_eq!(port, 5432);
+    }
+
+    #[test]
+    fn test_parse_host_port_v6() {
+        let (host, port) = parse_host_port("[2001:db8::1]:5432").expect("failed to parse");
+        assert_eq!(host, "2001:db8::1");
+        assert_eq!(port, 5432);
+    }
+
+    #[test]
+    fn test_parse_host_port_url() {
+        let (host, port) = parse_host_port("compute-foo-bar-1234.default.svc.cluster.local:5432")
+            .expect("failed to parse");
+        assert_eq!(host, "compute-foo-bar-1234.default.svc.cluster.local");
+        assert_eq!(port, 5432);
+    }
+}
diff --git a/proxy/src/control_plane/client/mod.rs b/proxy/src/control_plane/client/mod.rs
index f8f74372f0..7ef5a9c9fd 100644
--- a/proxy/src/control_plane/client/mod.rs
+++ b/proxy/src/control_plane/client/mod.rs
@@ -1,3 +1,4 @@
+pub mod cplane_proxy_v1;
 #[cfg(any(test, feature = "testing"))]
 pub mod mock;
 pub mod neon;
@@ -27,6 +28,8 @@ use crate::types::EndpointId;
 #[non_exhaustive]
 #[derive(Clone)]
 pub enum ControlPlaneClient {
+    /// New Proxy V1 control plane API
+    ProxyV1(cplane_proxy_v1::NeonControlPlaneClient),
     /// Current Management API (V2).
     Neon(neon::NeonControlPlaneClient),
     /// Local mock control plane.
@@ -45,6 +48,7 @@ impl ControlPlaneApi for ControlPlaneClient {
         user_info: &ComputeUserInfo,
     ) -> Result<CachedRoleSecret, errors::GetAuthInfoError> {
         match self {
+            Self::ProxyV1(api) => api.get_role_secret(ctx, user_info).await,
             Self::Neon(api) => api.get_role_secret(ctx, user_info).await,
             #[cfg(any(test, feature = "testing"))]
             Self::PostgresMock(api) => api.get_role_secret(ctx, user_info).await,
@@ -61,6 +65,7 @@ impl ControlPlaneApi for ControlPlaneClient {
         user_info: &ComputeUserInfo,
     ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError> {
         match self {
+            Self::ProxyV1(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
             Self::Neon(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
             #[cfg(any(test, feature = "testing"))]
             Self::PostgresMock(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
@@ -75,6 +80,7 @@ impl ControlPlaneApi for ControlPlaneClient {
         endpoint: EndpointId,
     ) -> Result<Vec<AuthRule>, errors::GetEndpointJwksError> {
         match self {
+            Self::ProxyV1(api) => api.get_endpoint_jwks(ctx, endpoint).await,
             Self::Neon(api) => api.get_endpoint_jwks(ctx, endpoint).await,
             #[cfg(any(test, feature = "testing"))]
             Self::PostgresMock(api) => api.get_endpoint_jwks(ctx, endpoint).await,
@@ -89,6 +95,7 @@ impl ControlPlaneApi for ControlPlaneClient {
         user_info: &ComputeUserInfo,
     ) -> Result<CachedNodeInfo, errors::WakeComputeError> {
         match self {
+            Self::ProxyV1(api) => api.wake_compute(ctx, user_info).await,
             Self::Neon(api) => api.wake_compute(ctx, user_info).await,
             #[cfg(any(test, feature = "testing"))]
             Self::PostgresMock(api) => api.wake_compute(ctx, user_info).await,
diff --git a/proxy/src/control_plane/client/neon.rs b/proxy/src/control_plane/client/neon.rs
index 5c204ae1d7..bf62c0d6ab 100644
--- a/proxy/src/control_plane/client/neon.rs
+++ b/proxy/src/control_plane/client/neon.rs
@@ -1,4 +1,4 @@
-//! Production console backend.
+//! Stale console backend, remove after migrating to Proxy V1 API (#15245).
 
 use std::sync::Arc;
 use std::time::Duration;
diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs
index 8762ba874b..2662ab85f9 100644
--- a/proxy/src/control_plane/messages.rs
+++ b/proxy/src/control_plane/messages.rs
@@ -230,6 +230,16 @@ pub(crate) struct GetRoleSecret {
     pub(crate) project_id: Option<ProjectIdInt>,
 }
 
+/// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`].
+/// Returned by the `/get_endpoint_access_control` API method.
+#[derive(Deserialize)]
+pub(crate) struct GetEndpointAccessControl {
+    pub(crate) role_secret: Box<str>,
+    pub(crate) allowed_ips: Option<Vec<IpPattern>>,
+    pub(crate) project_id: Option<ProjectIdInt>,
+    pub(crate) allowed_vpc_endpoint_ids: Option<Vec<EndpointIdInt>>,
+}
+
 // Manually implement debug to omit sensitive info.
 impl fmt::Debug for GetRoleSecret {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {

From db793044167d1b81cea7f2a2a57a189711d0d683 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Thu, 5 Dec 2024 18:29:21 +0100
Subject: [PATCH 066/117] storage_controller: increase shard scan timeout
 (#10000)

## Problem

The node shard scan timeout of 1 second is a bit too aggressive, and
we've seen this cause test failures. The scans are performed in parallel
across nodes, and the entire operation has a 15 second timeout.

Resolves #9801.

## Summary of changes

Increase the timeout to 5 seconds. This is still enough to time out on a
network failure and retry successfully within 15 seconds.
---
 storage_controller/src/service.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 92ec58cb4d..083c78233a 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -789,7 +789,7 @@ impl Service {
             node_list_futs.push({
                 async move {
                     tracing::info!("Scanning shards on node {node}...");
-                    let timeout = Duration::from_secs(1);
+                    let timeout = Duration::from_secs(5);
                     let response = node
                         .with_client_retries(
                             |client| async move { client.list_location_config().await },

From 13e810574029953ab4f5002724ad853fc2c39922 Mon Sep 17 00:00:00 2001
From: Alexey Kondratov <kondratov.aleksey@gmail.com>
Date: Thu, 5 Dec 2024 18:57:25 +0100
Subject: [PATCH 067/117] feat(compute): Allow specifying the reconfiguration
 concurrency (#10006)

## Problem

We need a higher concurrency during reconfiguration in case of many DBs,
but the instance is already running and used by the client. We can
easily get out of `max_connections` limit, and the current code won't
handle that.

## Summary of changes

Default to 1, but also allow control plane to override this value for
specific projects. It's also recommended to bump
`superuser_reserved_connections` += `reconfigure_concurrency` for such
projects to ensure that we always have enough spare connections for
reconfiguration process to succeed.

Quick workaround for neondatabase/cloud#17846
---
 compute_tools/src/compute.rs  |  7 +------
 control_plane/src/endpoint.rs |  1 +
 libs/compute_api/src/spec.rs  | 25 ++++++++++++++++++++++---
 3 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 0d1e6d680f..d72a04f2f9 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1243,12 +1243,7 @@ impl ComputeNode {
         let postgresql_conf_path = pgdata_path.join("postgresql.conf");
         config::write_postgres_conf(&postgresql_conf_path, &spec, self.http_port)?;
 
-        // TODO(ololobus): We need a concurrency during reconfiguration as well,
-        // but DB is already running and used by user. We can easily get out of
-        // `max_connections` limit, and the current code won't handle that.
-        // let compute_state = self.state.lock().unwrap().clone();
-        // let max_concurrent_connections = self.max_service_connections(&compute_state, &spec);
-        let max_concurrent_connections = 1;
+        let max_concurrent_connections = spec.reconfigure_concurrency;
 
         // Temporarily reset max_cluster_size in config
         // to avoid the possibility of hitting the limit, while we are reconfiguring:
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index 1ca6dc43c4..360857f365 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -618,6 +618,7 @@ impl Endpoint {
             pgbouncer_settings: None,
             shard_stripe_size: Some(shard_stripe_size),
             local_proxy_config: None,
+            reconfigure_concurrency: 1,
         };
         let spec_path = self.endpoint_path().join("spec.json");
         std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs
index 8a447563dc..6d9c353cda 100644
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -19,6 +19,10 @@ pub type PgIdent = String;
 /// String type alias representing Postgres extension version
 pub type ExtVersion = String;
 
+fn default_reconfigure_concurrency() -> usize {
+    1
+}
+
 /// Cluster spec or configuration represented as an optional number of
 /// delta operations + final cluster state description.
 #[derive(Clone, Debug, Default, Deserialize, Serialize)]
@@ -67,7 +71,7 @@ pub struct ComputeSpec {
     pub cluster: Cluster,
     pub delta_operations: Option<Vec<DeltaOp>>,
 
-    /// An optinal hint that can be passed to speed up startup time if we know
+    /// An optional hint that can be passed to speed up startup time if we know
     /// that no pg catalog mutations (like role creation, database creation,
     /// extension creation) need to be done on the actual database to start.
     #[serde(default)] // Default false
@@ -86,9 +90,7 @@ pub struct ComputeSpec {
     // etc. GUCs in cluster.settings. TODO: Once the control plane has been
     // updated to fill these fields, we can make these non optional.
     pub tenant_id: Option<TenantId>,
-
     pub timeline_id: Option<TimelineId>,
-
     pub pageserver_connstring: Option<String>,
 
     #[serde(default)]
@@ -113,6 +115,20 @@ pub struct ComputeSpec {
     /// Local Proxy configuration used for JWT authentication
     #[serde(default)]
     pub local_proxy_config: Option<LocalProxySpec>,
+
+    /// Number of concurrent connections during the parallel RunInEachDatabase
+    /// phase of the apply config process.
+    ///
+    /// We need a higher concurrency during reconfiguration in case of many DBs,
+    /// but instance is already running and used by client. We can easily get out of
+    /// `max_connections` limit, and the current code won't handle that.
+    ///
+    /// Default is 1, but also allow control plane to override this value for specific
+    /// projects. It's also recommended to bump `superuser_reserved_connections` +=
+    /// `reconfigure_concurrency` for such projects to ensure that we always have
+    /// enough spare connections for reconfiguration process to succeed.
+    #[serde(default = "default_reconfigure_concurrency")]
+    pub reconfigure_concurrency: usize,
 }
 
 /// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
@@ -315,6 +331,9 @@ mod tests {
 
         // Features list defaults to empty vector.
         assert!(spec.features.is_empty());
+
+        // Reconfigure concurrency defaults to 1.
+        assert_eq!(spec.reconfigure_concurrency, 1);
     }
 
     #[test]

From c0ba4169676300c72ec3b567996c2604be93b136 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Thu, 5 Dec 2024 13:04:33 -0600
Subject: [PATCH 068/117] Add compute_logical_snapshots_bytes metric (#9887)

This metric exposes the size of all non-temporary logical snapshot
files.

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 compute/etc/neon_collector.jsonnet              |  1 +
 .../compute_logical_snapshots_bytes.15.sql      |  7 +++++++
 .../compute_logical_snapshots_bytes.libsonnet   | 17 +++++++++++++++++
 .../compute_logical_snapshots_bytes.sql         |  9 +++++++++
 4 files changed, 34 insertions(+)
 create mode 100644 compute/etc/sql_exporter/compute_logical_snapshots_bytes.15.sql
 create mode 100644 compute/etc/sql_exporter/compute_logical_snapshots_bytes.libsonnet
 create mode 100644 compute/etc/sql_exporter/compute_logical_snapshots_bytes.sql

diff --git a/compute/etc/neon_collector.jsonnet b/compute/etc/neon_collector.jsonnet
index 75d69c7b68..aa6cc1cfc8 100644
--- a/compute/etc/neon_collector.jsonnet
+++ b/compute/etc/neon_collector.jsonnet
@@ -6,6 +6,7 @@
     import 'sql_exporter/compute_backpressure_throttling_seconds.libsonnet',
     import 'sql_exporter/compute_current_lsn.libsonnet',
     import 'sql_exporter/compute_logical_snapshot_files.libsonnet',
+    import 'sql_exporter/compute_logical_snapshots_bytes.libsonnet',
     import 'sql_exporter/compute_max_connections.libsonnet',
     import 'sql_exporter/compute_receive_lsn.libsonnet',
     import 'sql_exporter/compute_subscriptions_count.libsonnet',
diff --git a/compute/etc/sql_exporter/compute_logical_snapshots_bytes.15.sql b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.15.sql
new file mode 100644
index 0000000000..73a9c11405
--- /dev/null
+++ b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.15.sql
@@ -0,0 +1,7 @@
+SELECT
+  (SELECT current_setting('neon.timeline_id')) AS timeline_id,
+  -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
+  -- These temporary snapshot files are renamed to the actual snapshot files
+  -- after they are completely built. We only WAL-log the completely built
+  -- snapshot files
+  (SELECT COALESCE(sum(size), 0) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS logical_snapshots_bytes;
diff --git a/compute/etc/sql_exporter/compute_logical_snapshots_bytes.libsonnet b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.libsonnet
new file mode 100644
index 0000000000..8e1792d386
--- /dev/null
+++ b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.libsonnet
@@ -0,0 +1,17 @@
+local neon = import 'neon.libsonnet';
+
+local pg_ls_logicalsnapdir = importstr 'sql_exporter/compute_logical_snapshots_bytes.15.sql';
+local pg_ls_dir = importstr 'sql_exporter/compute_logical_snapshots_bytes.sql';
+
+{
+  metric_name: 'compute_logical_snapshots_bytes',
+  type: 'gauge',
+  help: 'Size of the pg_logical/snapshots directory, not including temporary files',
+  key_labels: [
+    'timeline_id',
+  ],
+  values: [
+    'logical_snapshots_bytes',
+  ],
+  query: if neon.PG_MAJORVERSION_NUM < 15 then pg_ls_dir else pg_ls_logicalsnapdir,
+}
diff --git a/compute/etc/sql_exporter/compute_logical_snapshots_bytes.sql b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.sql
new file mode 100644
index 0000000000..16da899de2
--- /dev/null
+++ b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.sql
@@ -0,0 +1,9 @@
+SELECT
+  (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
+  -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
+  -- These temporary snapshot files are renamed to the actual snapshot files
+  -- after they are completely built. We only WAL-log the completely built
+  -- snapshot files
+  (SELECT COALESCE(sum((pg_stat_file('pg_logical/snapshots/' || name, missing_ok => true)).size), 0)
+    FROM (SELECT * FROM pg_ls_dir('pg_logical/snapshots') WHERE pg_ls_dir LIKE '%.snap') AS name
+  ) AS logical_snapshots_bytes;

From 71f38d135467ef8691f062c62fa5d8f3bf49ea6d Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Thu, 5 Dec 2024 11:37:17 -0800
Subject: [PATCH 069/117] feat(pageserver): support schedule gc-compaction
 (#9809)

## Problem

part of https://github.com/neondatabase/neon/issues/9114

gc-compaction can take a long time. This patch adds support for
scheduling a gc-compaction job. The compaction loop will first handle
L0->L1 compaction, and then gc compaction. The scheduled jobs are stored
in a non-persistent queue within the tenant structure.

This will be the building block for the partial compaction trigger -- if
the system determines that we need to do a gc compaction, it will
partition the keyspace and schedule several jobs. Each of these jobs
will run for a short amount of time (i.e, 1 min). L0 compaction will be
prioritized over gc compaction.

## Summary of changes

* Add compaction scheduler in tenant.
* Run scheduled compaction in integration tests.
* Change the manual compaction API to allow schedule a compaction
instead of immediately doing it.
* Add LSN upper bound as gc-compaction parameter. If we schedule partial
compactions, gc_cutoff might move across different runs. Therefore, we
need to pass a pre-determined gc_cutoff beforehand. (TODO: support LSN
lower bound so that we can compact arbitrary "rectangle" in the layer
map)
* Refactor the gc_compaction internal interface.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
Co-authored-by: Christian Schwarz <christian@neon.tech>
---
 pageserver/src/http/routes.rs                |  66 +++++--
 pageserver/src/tenant.rs                     | 171 ++++++++++++++++---
 pageserver/src/tenant/timeline.rs            |  29 +++-
 pageserver/src/tenant/timeline/compaction.rs |  58 ++++---
 test_runner/regress/test_compaction.py       |  41 +++--
 5 files changed, 291 insertions(+), 74 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index e04f1460a8..b3981b4a8e 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -87,7 +87,7 @@ use crate::tenant::timeline::offload::offload_timeline;
 use crate::tenant::timeline::offload::OffloadError;
 use crate::tenant::timeline::CompactFlags;
 use crate::tenant::timeline::CompactOptions;
-use crate::tenant::timeline::CompactRange;
+use crate::tenant::timeline::CompactRequest;
 use crate::tenant::timeline::CompactionError;
 use crate::tenant::timeline::Timeline;
 use crate::tenant::GetTimelineError;
@@ -1978,6 +1978,26 @@ async fn timeline_gc_handler(
     json_response(StatusCode::OK, gc_result)
 }
 
+// Cancel scheduled compaction tasks
+async fn timeline_cancel_compact_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    let state = get_state(&request);
+    async {
+        let tenant = state
+            .tenant_manager
+            .get_attached_tenant_shard(tenant_shard_id)?;
+        tenant.cancel_scheduled_compaction(timeline_id);
+        json_response(StatusCode::OK, ())
+    }
+    .instrument(info_span!("timeline_cancel_compact", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
+    .await
+}
+
 // Run compaction immediately on given timeline.
 async fn timeline_compact_handler(
     mut request: Request<Body>,
@@ -1987,7 +2007,7 @@ async fn timeline_compact_handler(
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
 
-    let compact_range = json_request_maybe::<Option<CompactRange>>(&mut request).await?;
+    let compact_request = json_request_maybe::<Option<CompactRequest>>(&mut request).await?;
 
     let state = get_state(&request);
 
@@ -2012,22 +2032,42 @@ async fn timeline_compact_handler(
     let wait_until_uploaded =
         parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
 
+    let wait_until_scheduled_compaction_done =
+        parse_query_param::<_, bool>(&request, "wait_until_scheduled_compaction_done")?
+            .unwrap_or(false);
+
     let options = CompactOptions {
-        compact_range,
+        compact_range: compact_request
+            .as_ref()
+            .and_then(|r| r.compact_range.clone()),
+        compact_below_lsn: compact_request.as_ref().and_then(|r| r.compact_below_lsn),
         flags,
     };
 
+    let scheduled = compact_request.map(|r| r.scheduled).unwrap_or(false);
+
     async {
         let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
         let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
-        timeline
-            .compact_with_options(&cancel, options, &ctx)
-            .await
-            .map_err(|e| ApiError::InternalServerError(e.into()))?;
-        if wait_until_uploaded {
-            timeline.remote_client.wait_completion().await
-            // XXX map to correct ApiError for the cases where it's due to shutdown
-            .context("wait completion").map_err(ApiError::InternalServerError)?;
+        if scheduled {
+            let tenant = state
+                .tenant_manager
+                .get_attached_tenant_shard(tenant_shard_id)?;
+            let rx = tenant.schedule_compaction(timeline_id, options).await;
+            if wait_until_scheduled_compaction_done {
+                // It is possible that this will take a long time, dropping the HTTP request will not cancel the compaction.
+                rx.await.ok();
+            }
+        } else {
+            timeline
+                .compact_with_options(&cancel, options, &ctx)
+                .await
+                .map_err(|e| ApiError::InternalServerError(e.into()))?;
+            if wait_until_uploaded {
+                timeline.remote_client.wait_completion().await
+                // XXX map to correct ApiError for the cases where it's due to shutdown
+                .context("wait completion").map_err(ApiError::InternalServerError)?;
+            }
         }
         json_response(StatusCode::OK, ())
     }
@@ -3301,6 +3341,10 @@ pub fn make_router(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
             |r| api_handler(r, timeline_compact_handler),
         )
+        .delete(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
+            |r| api_handler(r, timeline_cancel_compact_handler),
+        )
         .put(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/offload",
             |r| testing_api_handler("attempt timeline offload", r, timeline_offload_handler),
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 5a9e398586..306ec9f548 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -37,14 +37,18 @@ use remote_timeline_client::manifest::{
 };
 use remote_timeline_client::UploadQueueNotReadyError;
 use std::collections::BTreeMap;
+use std::collections::VecDeque;
 use std::fmt;
 use std::future::Future;
 use std::sync::atomic::AtomicBool;
 use std::sync::Weak;
 use std::time::SystemTime;
 use storage_broker::BrokerClientChannel;
+use timeline::compaction::ScheduledCompactionTask;
 use timeline::import_pgdata;
 use timeline::offload::offload_timeline;
+use timeline::CompactFlags;
+use timeline::CompactOptions;
 use timeline::ShutdownMode;
 use tokio::io::BufReader;
 use tokio::sync::watch;
@@ -339,6 +343,11 @@ pub struct Tenant {
     /// Overhead of mutex is acceptable because compaction is done with a multi-second period.
     compaction_circuit_breaker: std::sync::Mutex<CircuitBreaker>,
 
+    /// Scheduled compaction tasks. Currently, this can only be populated by triggering
+    /// a manual gc-compaction from the manual compaction API.
+    scheduled_compaction_tasks:
+        std::sync::Mutex<HashMap<TimelineId, VecDeque<ScheduledCompactionTask>>>,
+
     /// If the tenant is in Activating state, notify this to encourage it
     /// to proceed to Active as soon as possible, rather than waiting for lazy
     /// background warmup.
@@ -2953,27 +2962,68 @@ impl Tenant {
 
         for (timeline_id, timeline, (can_compact, can_offload)) in &timelines_to_compact_or_offload
         {
+            // pending_task_left == None: cannot compact, maybe still pending tasks
+            // pending_task_left == Some(true): compaction task left
+            // pending_task_left == Some(false): no compaction task left
             let pending_task_left = if *can_compact {
-                Some(
-                    timeline
-                        .compact(cancel, EnumSet::empty(), ctx)
-                        .instrument(info_span!("compact_timeline", %timeline_id))
-                        .await
-                        .inspect_err(|e| match e {
-                            timeline::CompactionError::ShuttingDown => (),
-                            timeline::CompactionError::Offload(_) => {
-                                // Failures to offload timelines do not trip the circuit breaker, because
-                                // they do not do lots of writes the way compaction itself does: it is cheap
-                                // to retry, and it would be bad to stop all compaction because of an issue with offloading.
+                let has_pending_l0_compaction_task = timeline
+                    .compact(cancel, EnumSet::empty(), ctx)
+                    .instrument(info_span!("compact_timeline", %timeline_id))
+                    .await
+                    .inspect_err(|e| match e {
+                        timeline::CompactionError::ShuttingDown => (),
+                        timeline::CompactionError::Offload(_) => {
+                            // Failures to offload timelines do not trip the circuit breaker, because
+                            // they do not do lots of writes the way compaction itself does: it is cheap
+                            // to retry, and it would be bad to stop all compaction because of an issue with offloading.
+                        }
+                        timeline::CompactionError::Other(e) => {
+                            self.compaction_circuit_breaker
+                                .lock()
+                                .unwrap()
+                                .fail(&CIRCUIT_BREAKERS_BROKEN, e);
+                        }
+                    })?;
+                if has_pending_l0_compaction_task {
+                    Some(true)
+                } else {
+                    let has_pending_scheduled_compaction_task;
+                    let next_scheduled_compaction_task = {
+                        let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
+                        if let Some(tline_pending_tasks) = guard.get_mut(timeline_id) {
+                            let next_task = tline_pending_tasks.pop_front();
+                            has_pending_scheduled_compaction_task = !tline_pending_tasks.is_empty();
+                            next_task
+                        } else {
+                            has_pending_scheduled_compaction_task = false;
+                            None
+                        }
+                    };
+                    if let Some(mut next_scheduled_compaction_task) = next_scheduled_compaction_task
+                    {
+                        if !next_scheduled_compaction_task
+                            .options
+                            .flags
+                            .contains(CompactFlags::EnhancedGcBottomMostCompaction)
+                        {
+                            warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", next_scheduled_compaction_task.options);
+                        } else {
+                            let _ = timeline
+                                .compact_with_options(
+                                    cancel,
+                                    next_scheduled_compaction_task.options,
+                                    ctx,
+                                )
+                                .instrument(info_span!("scheduled_compact_timeline", %timeline_id))
+                                .await?;
+                            if let Some(tx) = next_scheduled_compaction_task.result_tx.take() {
+                                // TODO: we can send compaction statistics in the future
+                                tx.send(()).ok();
                             }
-                            timeline::CompactionError::Other(e) => {
-                                self.compaction_circuit_breaker
-                                    .lock()
-                                    .unwrap()
-                                    .fail(&CIRCUIT_BREAKERS_BROKEN, e);
-                            }
-                        })?,
-                )
+                        }
+                    }
+                    Some(has_pending_scheduled_compaction_task)
+                }
             } else {
                 None
             };
@@ -2993,6 +3043,36 @@ impl Tenant {
         Ok(has_pending_task)
     }
 
+    /// Cancel scheduled compaction tasks
+    pub(crate) fn cancel_scheduled_compaction(
+        &self,
+        timeline_id: TimelineId,
+    ) -> Vec<ScheduledCompactionTask> {
+        let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
+        if let Some(tline_pending_tasks) = guard.get_mut(&timeline_id) {
+            let current_tline_pending_tasks = std::mem::take(tline_pending_tasks);
+            current_tline_pending_tasks.into_iter().collect()
+        } else {
+            Vec::new()
+        }
+    }
+
+    /// Schedule a compaction task for a timeline.
+    pub(crate) async fn schedule_compaction(
+        &self,
+        timeline_id: TimelineId,
+        options: CompactOptions,
+    ) -> tokio::sync::oneshot::Receiver<()> {
+        let (tx, rx) = tokio::sync::oneshot::channel();
+        let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
+        let tline_pending_tasks = guard.entry(timeline_id).or_default();
+        tline_pending_tasks.push_back(ScheduledCompactionTask {
+            options,
+            result_tx: Some(tx),
+        });
+        rx
+    }
+
     // Call through to all timelines to freeze ephemeral layers if needed.  Usually
     // this happens during ingest: this background housekeeping is for freezing layers
     // that are open but haven't been written to for some time.
@@ -4005,6 +4085,7 @@ impl Tenant {
                 // use an extremely long backoff.
                 Some(Duration::from_secs(3600 * 24)),
             )),
+            scheduled_compaction_tasks: Mutex::new(Default::default()),
             activate_now_sem: tokio::sync::Semaphore::new(0),
             attach_wal_lag_cooldown: Arc::new(std::sync::OnceLock::new()),
             cancel: CancellationToken::default(),
@@ -9163,6 +9244,7 @@ mod tests {
                 CompactOptions {
                     flags: dryrun_flags,
                     compact_range: None,
+                    compact_below_lsn: None,
                 },
                 &ctx,
             )
@@ -9399,6 +9481,7 @@ mod tests {
                 CompactOptions {
                     flags: dryrun_flags,
                     compact_range: None,
+                    compact_below_lsn: None,
                 },
                 &ctx,
             )
@@ -9885,7 +9968,15 @@ mod tests {
 
         // Do a partial compaction on key range 0..2
         tline
-            .partial_compact_with_gc(get_key(0)..get_key(2), &cancel, EnumSet::new(), &ctx)
+            .compact_with_gc(
+                &cancel,
+                CompactOptions {
+                    flags: EnumSet::new(),
+                    compact_range: Some((get_key(0)..get_key(2)).into()),
+                    compact_below_lsn: None,
+                },
+                &ctx,
+            )
             .await
             .unwrap();
         let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
@@ -9924,7 +10015,15 @@ mod tests {
 
         // Do a partial compaction on key range 2..4
         tline
-            .partial_compact_with_gc(get_key(2)..get_key(4), &cancel, EnumSet::new(), &ctx)
+            .compact_with_gc(
+                &cancel,
+                CompactOptions {
+                    flags: EnumSet::new(),
+                    compact_range: Some((get_key(2)..get_key(4)).into()),
+                    compact_below_lsn: None,
+                },
+                &ctx,
+            )
             .await
             .unwrap();
         let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
@@ -9968,7 +10067,15 @@ mod tests {
 
         // Do a partial compaction on key range 4..9
         tline
-            .partial_compact_with_gc(get_key(4)..get_key(9), &cancel, EnumSet::new(), &ctx)
+            .compact_with_gc(
+                &cancel,
+                CompactOptions {
+                    flags: EnumSet::new(),
+                    compact_range: Some((get_key(4)..get_key(9)).into()),
+                    compact_below_lsn: None,
+                },
+                &ctx,
+            )
             .await
             .unwrap();
         let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
@@ -10011,7 +10118,15 @@ mod tests {
 
         // Do a partial compaction on key range 9..10
         tline
-            .partial_compact_with_gc(get_key(9)..get_key(10), &cancel, EnumSet::new(), &ctx)
+            .compact_with_gc(
+                &cancel,
+                CompactOptions {
+                    flags: EnumSet::new(),
+                    compact_range: Some((get_key(9)..get_key(10)).into()),
+                    compact_below_lsn: None,
+                },
+                &ctx,
+            )
             .await
             .unwrap();
         let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
@@ -10059,7 +10174,15 @@ mod tests {
 
         // Do a partial compaction on key range 0..10, all image layers below LSN 20 can be replaced with new ones.
         tline
-            .partial_compact_with_gc(get_key(0)..get_key(10), &cancel, EnumSet::new(), &ctx)
+            .compact_with_gc(
+                &cancel,
+                CompactOptions {
+                    flags: EnumSet::new(),
+                    compact_range: Some((get_key(0)..get_key(10)).into()),
+                    compact_below_lsn: None,
+                },
+                &ctx,
+            )
             .await
             .unwrap();
         let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index fc741826ab..fc69525bf4 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -768,7 +768,7 @@ pub enum GetLogicalSizePriority {
     Background,
 }
 
-#[derive(enumset::EnumSetType)]
+#[derive(Debug, enumset::EnumSetType)]
 pub(crate) enum CompactFlags {
     ForceRepartition,
     ForceImageLayerCreation,
@@ -777,6 +777,16 @@ pub(crate) enum CompactFlags {
     DryRun,
 }
 
+#[serde_with::serde_as]
+#[derive(Debug, Clone, serde::Deserialize)]
+pub(crate) struct CompactRequest {
+    pub compact_range: Option<CompactRange>,
+    pub compact_below_lsn: Option<Lsn>,
+    /// Whether the compaction job should be scheduled.
+    #[serde(default)]
+    pub scheduled: bool,
+}
+
 #[serde_with::serde_as]
 #[derive(Debug, Clone, serde::Deserialize)]
 pub(crate) struct CompactRange {
@@ -786,10 +796,24 @@ pub(crate) struct CompactRange {
     pub end: Key,
 }
 
-#[derive(Clone, Default)]
+impl From<Range<Key>> for CompactRange {
+    fn from(range: Range<Key>) -> Self {
+        CompactRange {
+            start: range.start,
+            end: range.end,
+        }
+    }
+}
+
+#[derive(Debug, Clone, Default)]
 pub(crate) struct CompactOptions {
     pub flags: EnumSet<CompactFlags>,
+    /// If set, the compaction will only compact the key range specified by this option.
+    /// This option is only used by GC compaction.
     pub compact_range: Option<CompactRange>,
+    /// If set, the compaction will only compact the LSN below this value.
+    /// This option is only used by GC compaction.
+    pub compact_below_lsn: Option<Lsn>,
 }
 
 impl std::fmt::Debug for Timeline {
@@ -1604,6 +1628,7 @@ impl Timeline {
             CompactOptions {
                 flags,
                 compact_range: None,
+                compact_below_lsn: None,
             },
             ctx,
         )
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index ecd68ba55e..8ececa2bfb 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -16,7 +16,6 @@ use super::{
 
 use anyhow::{anyhow, bail, Context};
 use bytes::Bytes;
-use enumset::EnumSet;
 use fail::fail_point;
 use itertools::Itertools;
 use pageserver_api::key::KEY_SIZE;
@@ -64,6 +63,12 @@ use super::CompactionError;
 /// Maximum number of deltas before generating an image layer in bottom-most compaction.
 const COMPACTION_DELTA_THRESHOLD: usize = 5;
 
+/// A scheduled compaction task.
+pub struct ScheduledCompactionTask {
+    pub options: CompactOptions,
+    pub result_tx: Option<tokio::sync::oneshot::Sender<()>>,
+}
+
 pub struct GcCompactionJobDescription {
     /// All layers to read in the compaction job
     selected_layers: Vec<Layer>,
@@ -1746,24 +1751,6 @@ impl Timeline {
         Ok(())
     }
 
-    pub(crate) async fn compact_with_gc(
-        self: &Arc<Self>,
-        cancel: &CancellationToken,
-        options: CompactOptions,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        self.partial_compact_with_gc(
-            options
-                .compact_range
-                .map(|range| range.start..range.end)
-                .unwrap_or_else(|| Key::MIN..Key::MAX),
-            cancel,
-            options.flags,
-            ctx,
-        )
-        .await
-    }
-
     /// An experimental compaction building block that combines compaction with garbage collection.
     ///
     /// The current implementation picks all delta + image layers that are below or intersecting with
@@ -1771,17 +1758,19 @@ impl Timeline {
     /// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon,
     /// and create delta layers with all deltas >= gc horizon.
     ///
-    /// If `key_range` is provided, it will only compact the keys within the range, aka partial compaction.
+    /// If `options.compact_range` is provided, it will only compact the keys within the range, aka partial compaction.
     /// Partial compaction will read and process all layers overlapping with the key range, even if it might
     /// contain extra keys. After the gc-compaction phase completes, delta layers that are not fully contained
     /// within the key range will be rewritten to ensure they do not overlap with the delta layers. Providing
     /// Key::MIN..Key..MAX to the function indicates a full compaction, though technically, `Key::MAX` is not
     /// part of the range.
-    pub(crate) async fn partial_compact_with_gc(
+    ///
+    /// If `options.compact_below_lsn` is provided, the compaction will only compact layers below or intersect with
+    /// the LSN. Otherwise, it will use the gc cutoff by default.
+    pub(crate) async fn compact_with_gc(
         self: &Arc<Self>,
-        compaction_key_range: Range<Key>,
         cancel: &CancellationToken,
-        flags: EnumSet<CompactFlags>,
+        options: CompactOptions,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         // Block other compaction/GC tasks from running for now. GC-compaction could run along
@@ -1803,6 +1792,12 @@ impl Timeline {
         )
         .await?;
 
+        let flags = options.flags;
+        let compaction_key_range = options
+            .compact_range
+            .map(|range| range.start..range.end)
+            .unwrap_or_else(|| Key::MIN..Key::MAX);
+
         let dry_run = flags.contains(CompactFlags::DryRun);
 
         if compaction_key_range == (Key::MIN..Key::MAX) {
@@ -1826,7 +1821,18 @@ impl Timeline {
             let layers = guard.layer_map()?;
             let gc_info = self.gc_info.read().unwrap();
             let mut retain_lsns_below_horizon = Vec::new();
-            let gc_cutoff = gc_info.cutoffs.select_min();
+            let gc_cutoff = {
+                let real_gc_cutoff = gc_info.cutoffs.select_min();
+                // The compaction algorithm will keep all keys above the gc_cutoff while keeping only necessary keys below the gc_cutoff for
+                // each of the retain_lsn. Therefore, if the user-provided `compact_below_lsn` is larger than the real gc cutoff, we will use
+                // the real cutoff.
+                let mut gc_cutoff = options.compact_below_lsn.unwrap_or(real_gc_cutoff);
+                if gc_cutoff > real_gc_cutoff {
+                    warn!("provided compact_below_lsn={} is larger than the real_gc_cutoff={}, using the real gc cutoff", gc_cutoff, real_gc_cutoff);
+                    gc_cutoff = real_gc_cutoff;
+                }
+                gc_cutoff
+            };
             for (lsn, _timeline_id, _is_offloaded) in &gc_info.retain_lsns {
                 if lsn < &gc_cutoff {
                     retain_lsns_below_horizon.push(*lsn);
@@ -1846,7 +1852,7 @@ impl Timeline {
                 .map(|desc| desc.get_lsn_range().end)
                 .max()
             else {
-                info!("no layers to compact with gc");
+                info!("no layers to compact with gc: no historic layers below gc_cutoff, gc_cutoff={}", gc_cutoff);
                 return Ok(());
             };
             // Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key
@@ -1869,7 +1875,7 @@ impl Timeline {
                 }
             }
             if selected_layers.is_empty() {
-                info!("no layers to compact with gc");
+                info!("no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}", gc_cutoff, compaction_key_range.start, compaction_key_range.end);
                 return Ok(());
             }
             retain_lsns_below_horizon.sort();
diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index b6741aed68..de6653eb3f 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -15,7 +15,7 @@ from fixtures.pageserver.http import PageserverApiException
 from fixtures.utils import skip_in_debug_build, wait_until
 from fixtures.workload import Workload
 
-AGGRESIVE_COMPACTION_TENANT_CONF = {
+AGGRESSIVE_COMPACTION_TENANT_CONF = {
     # Disable gc and compaction. The test runs compaction manually.
     "gc_period": "0s",
     "compaction_period": "0s",
@@ -24,6 +24,7 @@ AGGRESIVE_COMPACTION_TENANT_CONF = {
     # Compact small layers
     "compaction_target_size": 1024**2,
     "image_creation_threshold": 2,
+    # "lsn_lease_length": "0s", -- TODO: would cause branch creation errors, should fix later
 }
 
 
@@ -51,7 +52,7 @@ def test_pageserver_compaction_smoke(
 page_cache_size=10
 """
 
-    env = neon_env_builder.init_start(initial_tenant_conf=AGGRESIVE_COMPACTION_TENANT_CONF)
+    env = neon_env_builder.init_start(initial_tenant_conf=AGGRESSIVE_COMPACTION_TENANT_CONF)
 
     tenant_id = env.initial_tenant
     timeline_id = env.initial_timeline
@@ -120,14 +121,25 @@ page_cache_size=10
     assert vectored_average < 8
 
 
+@skip_in_debug_build("only run with release build")
 def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder):
-    env = neon_env_builder.init_start(initial_tenant_conf=AGGRESIVE_COMPACTION_TENANT_CONF)
+    SMOKE_CONF = {
+        # Run both gc and gc-compaction.
+        "gc_period": "5s",
+        "compaction_period": "5s",
+        # No PiTR interval and small GC horizon
+        "pitr_interval": "0s",
+        "gc_horizon": f"{1024 ** 2}",
+        "lsn_lease_length": "0s",
+    }
+
+    env = neon_env_builder.init_start(initial_tenant_conf=SMOKE_CONF)
 
     tenant_id = env.initial_tenant
     timeline_id = env.initial_timeline
 
-    row_count = 1000
-    churn_rounds = 10
+    row_count = 10000
+    churn_rounds = 50
 
     ps_http = env.pageserver.http_client()
 
@@ -141,20 +153,27 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder):
         if i % 10 == 0:
             log.info(f"Running churn round {i}/{churn_rounds} ...")
 
-        workload.churn_rows(row_count, env.pageserver.id)
-        # Force L0 compaction to ensure the number of layers is within bounds, so that gc-compaction can run.
-        ps_http.timeline_compact(tenant_id, timeline_id, force_l0_compaction=True)
-        assert ps_http.perf_info(tenant_id, timeline_id)[0]["num_of_l0"] <= 1
         ps_http.timeline_compact(
             tenant_id,
             timeline_id,
             enhanced_gc_bottom_most_compaction=True,
             body={
-                "start": "000000000000000000000000000000000000",
-                "end": "030000000000000000000000000000000000",
+                "scheduled": True,
+                "compact_range": {
+                    "start": "000000000000000000000000000000000000",
+                    # skip the SLRU range for now -- it races with get-lsn-by-timestamp, TODO: fix this
+                    "end": "010000000000000000000000000000000000",
+                },
             },
         )
 
+        workload.churn_rows(row_count, env.pageserver.id)
+
+    # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked)
+    env.pageserver.assert_log_contains(
+        "scheduled_compact_timeline.*picked .* layers for compaction"
+    )
+
     log.info("Validating at workload end ...")
     workload.validate(env.pageserver.id)
 

From 6331cb216195658b7926cadb8045759aa71c4575 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Thu, 5 Dec 2024 13:42:52 -0600
Subject: [PATCH 070/117] Bump anyhow to 1.0.94 (#10028)

We were over a year out of date.

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 Cargo.lock | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 62f06d45bd..f6e0024d87 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -133,9 +133,9 @@ dependencies = [
 
 [[package]]
 name = "anyhow"
-version = "1.0.71"
+version = "1.0.94"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9c7d0618f0e0b7e8ff11427422b64564d5fb0be1940354bfe2e0529b18a9d9b8"
+checksum = "c1fd03a028ef38ba2276dce7e33fcd6369c158a1bca17946c4b1b701891c1ff7"
 dependencies = [
  "backtrace",
 ]

From 6ff4175fd7e62577ad0a7d1bba4fc3b6237ac764 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Thu, 5 Dec 2024 14:30:35 -0600
Subject: [PATCH 071/117] Send Content-Type header on reconfigure request from
 neon_local (#10029)

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 control_plane/src/endpoint.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index 360857f365..35067c95b6 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -53,6 +53,7 @@ use compute_api::spec::Role;
 use nix::sys::signal::kill;
 use nix::sys::signal::Signal;
 use pageserver_api::shard::ShardStripeSize;
+use reqwest::header::CONTENT_TYPE;
 use serde::{Deserialize, Serialize};
 use url::Host;
 use utils::id::{NodeId, TenantId, TimelineId};
@@ -818,6 +819,7 @@ impl Endpoint {
                 self.http_address.ip(),
                 self.http_address.port()
             ))
+            .header(CONTENT_TYPE.as_str(), "application/json")
             .body(format!(
                 "{{\"spec\":{}}}",
                 serde_json::to_string_pretty(&spec)?

From d1ab7471e2d6603a5680ba33f749adb743c2154b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Thu, 5 Dec 2024 21:51:57 +0100
Subject: [PATCH 072/117] Fix desc_str for Azure container (#10021)

Small logs fix I've noticed while working on
https://github.com/neondatabase/cloud/issues/19963 .
---
 storage_scrubber/src/lib.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs
index 1fe4fc58cd..be526daaf0 100644
--- a/storage_scrubber/src/lib.rs
+++ b/storage_scrubber/src/lib.rs
@@ -268,7 +268,7 @@ impl BucketConfig {
                 config.bucket_name, config.bucket_region
             ),
             RemoteStorageKind::AzureContainer(config) => format!(
-                "bucket {}, storage account {:?}, region {}",
+                "container {}, storage account {:?}, region {}",
                 config.container_name, config.storage_account, config.container_region
             ),
         }

From 56f867bde5324b0d3333faaf7360aa07245f68c0 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Fri, 6 Dec 2024 08:22:22 +0100
Subject: [PATCH 073/117] pageserver: only zero truncated FSM page on owning
 shard (#10032)

## Problem

FSM pages are managed like regular relation pages, and owned by a single
shard. However, when truncating the FSM relation the last FSM page was
zeroed out on all shards. This is unnecessary and potentially confusing.

The superfluous keys will be removed during compactions, as they do not
belong on these shards.

Resolves #10027.

## Summary of changes

Only zero out the truncated FSM page on the owning shard.
---
 pageserver/src/walingest.rs | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 93ae88936f..30c8965d51 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -582,18 +582,21 @@ impl WalIngest {
                 forknum: FSM_FORKNUM,
             };
 
+            // Zero out the last remaining FSM page, if this shard owns it. We are not precise here,
+            // and instead of digging in the FSM bitmap format we just clear the whole page.
             let fsm_logical_page_no = blkno / pg_constants::SLOTS_PER_FSM_PAGE;
             let mut fsm_physical_page_no = fsm_logical_to_physical(fsm_logical_page_no);
-            if blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0 {
-                // Tail of last remaining FSM page has to be zeroed.
-                // We are not precise here and instead of digging in FSM bitmap format just clear the whole page.
+            if blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0
+                && self
+                    .shard
+                    .is_key_local(&rel_block_to_key(rel, fsm_physical_page_no))
+            {
                 modification.put_rel_page_image_zero(rel, fsm_physical_page_no)?;
                 fsm_physical_page_no += 1;
             }
-            // TODO: re-examine the None case here wrt. sharding; should we error?
+            // Truncate this shard's view of the FSM relation size, if it even has one.
             let nblocks = get_relsize(modification, rel, ctx).await?.unwrap_or(0);
             if nblocks > fsm_physical_page_no {
-                // check if something to do: FSM is larger than truncate position
                 self.put_rel_truncation(modification, rel, fsm_physical_page_no, ctx)
                     .await?;
             }
@@ -617,7 +620,7 @@ impl WalIngest {
             // tail bits in the last remaining map page, representing truncated heap
             // blocks, need to be cleared. This is not only tidy, but also necessary
             // because we don't get a chance to clear the bits if the heap is extended
-            // again.
+            // again. Only do this on the shard that owns the page.
             if (trunc_byte != 0 || trunc_offs != 0)
                 && self.shard.is_key_local(&rel_block_to_key(rel, vm_page_no))
             {
@@ -631,10 +634,9 @@ impl WalIngest {
                 )?;
                 vm_page_no += 1;
             }
-            // TODO: re-examine the None case here wrt. sharding; should we error?
+            // Truncate this shard's view of the VM relation size, if it even has one.
             let nblocks = get_relsize(modification, rel, ctx).await?.unwrap_or(0);
             if nblocks > vm_page_no {
-                // check if something to do: VM is larger than truncate position
                 self.put_rel_truncation(modification, rel, vm_page_no, ctx)
                     .await?;
             }

From ec4072f84577eeb2a92d97fa77281efe50325730 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Fri, 6 Dec 2024 11:12:39 +0100
Subject: [PATCH 074/117] pageserver: add `wait_until_flushed` parameter for
 timeline checkpoint (#10013)

## Problem

I'm writing an ingest benchmark in #9812. To time S3 uploads, I need to
schedule a flush of the Pageserver's in-memory layer, but don't actually
want to wait around for it to complete (which will take a minute).

## Summary of changes

Add a parameter `wait_until_flush` (default `true`) for
`timeline/checkpoint` to control whether to wait for the flush to
complete.
---
 pageserver/src/http/routes.rs           | 12 ++++++++----
 pageserver/src/tenant/timeline.rs       | 26 ++++++++++++++++---------
 test_runner/fixtures/pageserver/http.py |  5 ++++-
 3 files changed, 29 insertions(+), 14 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index b3981b4a8e..b7fddb065c 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -2148,16 +2148,20 @@ async fn timeline_checkpoint_handler(
     // By default, checkpoints come with a compaction, but this may be optionally disabled by tests that just want to flush + upload.
     let compact = parse_query_param::<_, bool>(&request, "compact")?.unwrap_or(true);
 
+    let wait_until_flushed: bool =
+        parse_query_param(&request, "wait_until_flushed")?.unwrap_or(true);
+
     let wait_until_uploaded =
         parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
 
     async {
         let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
         let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
-        timeline
-            .freeze_and_flush()
-            .await
-            .map_err(|e| {
+        if wait_until_flushed {
+            timeline.freeze_and_flush().await
+        } else {
+            timeline.freeze().await.and(Ok(()))
+        }.map_err(|e| {
                 match e {
                     tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown,
                     other => ApiError::InternalServerError(other.into()),
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index fc69525bf4..aab6703a3c 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1457,23 +1457,31 @@ impl Timeline {
         Ok(lease)
     }
 
-    /// Flush to disk all data that was written with the put_* functions
+    /// Freeze the current open in-memory layer. It will be written to disk on next iteration.
+    /// Returns the flush request ID which can be awaited with wait_flush_completion().
+    #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
+    pub(crate) async fn freeze(&self) -> Result<u64, FlushLayerError> {
+        self.freeze0().await
+    }
+
+    /// Freeze and flush the open in-memory layer, waiting for it to be written to disk.
     #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
     pub(crate) async fn freeze_and_flush(&self) -> Result<(), FlushLayerError> {
         self.freeze_and_flush0().await
     }
 
+    /// Freeze the current open in-memory layer. It will be written to disk on next iteration.
+    /// Returns the flush request ID which can be awaited with wait_flush_completion().
+    pub(crate) async fn freeze0(&self) -> Result<u64, FlushLayerError> {
+        let mut g = self.write_lock.lock().await;
+        let to_lsn = self.get_last_record_lsn();
+        self.freeze_inmem_layer_at(to_lsn, &mut g).await
+    }
+
     // This exists to provide a non-span creating version of `freeze_and_flush` we can call without
     // polluting the span hierarchy.
     pub(crate) async fn freeze_and_flush0(&self) -> Result<(), FlushLayerError> {
-        let token = {
-            // Freeze the current open in-memory layer. It will be written to disk on next
-            // iteration.
-            let mut g = self.write_lock.lock().await;
-
-            let to_lsn = self.get_last_record_lsn();
-            self.freeze_inmem_layer_at(to_lsn, &mut g).await?
-        };
+        let token = self.freeze0().await?;
         self.wait_flush_completion(token).await
     }
 
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 4cf3ece396..0832eac22f 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -850,6 +850,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         force_repartition=False,
         force_image_layer_creation=False,
         force_l0_compaction=False,
+        wait_until_flushed=True,
         wait_until_uploaded=False,
         compact: bool | None = None,
         **kwargs,
@@ -862,6 +863,8 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
             query["force_image_layer_creation"] = "true"
         if force_l0_compaction:
             query["force_l0_compaction"] = "true"
+        if not wait_until_flushed:
+            query["wait_until_flushed"] = "false"
         if wait_until_uploaded:
             query["wait_until_uploaded"] = "true"
 
@@ -869,7 +872,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
             query["compact"] = "true" if compact else "false"
 
         log.info(
-            f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}, wait_until_uploaded={wait_until_uploaded}"
+            f"Requesting checkpoint: tenant={tenant_id} timeline={timeline_id} wait_until_flushed={wait_until_flushed} wait_until_uploaded={wait_until_uploaded} compact={compact}"
         )
         res = self.put(
             f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint",

From 3f1c5429577ca1dee8c5e0955e4072cee2a13eca Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Fri, 6 Dec 2024 10:21:52 +0000
Subject: [PATCH 075/117] pageserver: add disk consistent and remote lsn
 metrics (#10005)

## Problem

There's no metrics for disk consistent LSN and remote LSN. This stuff is
useful when looking at ingest performance.

## Summary of changes

Two per timeline metrics are added: `pageserver_disk_consistent_lsn` and
`pageserver_projected_remote_consistent_lsn`. I went for the projected
remote lsn instead of the visible one
because that more closely matches remote storage write tput. Ideally we
would have both, but these metrics are expensive.
---
 pageserver/src/metrics.rs                     | 46 +++++++++++++++++--
 .../src/tenant/remote_timeline_client.rs      |  3 ++
 pageserver/src/tenant/timeline.rs             |  8 +++-
 test_runner/fixtures/metrics.py               |  2 +
 4 files changed, 54 insertions(+), 5 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 998c15ccaf..e3b6f43bc4 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -464,6 +464,24 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
+static DISK_CONSISTENT_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
+    register_int_gauge_vec!(
+        "pageserver_disk_consistent_lsn",
+        "Disk consistent LSN grouped by timeline",
+        &["tenant_id", "shard_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static PROJECTED_REMOTE_CONSISTENT_LSN: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_projected_remote_consistent_lsn",
+        "Projected remote consistent LSN grouped by timeline",
+        &["tenant_id", "shard_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
 static PITR_HISTORY_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
     register_uint_gauge_vec!(
         "pageserver_pitr_history_size",
@@ -2394,7 +2412,8 @@ pub(crate) struct TimelineMetrics {
     pub load_layer_map_histo: StorageTimeMetrics,
     pub garbage_collect_histo: StorageTimeMetrics,
     pub find_gc_cutoffs_histo: StorageTimeMetrics,
-    pub last_record_gauge: IntGauge,
+    pub last_record_lsn_gauge: IntGauge,
+    pub disk_consistent_lsn_gauge: IntGauge,
     pub pitr_history_size: UIntGauge,
     pub archival_size: UIntGauge,
     pub(crate) layer_size_image: UIntGauge,
@@ -2475,7 +2494,11 @@ impl TimelineMetrics {
             &shard_id,
             &timeline_id,
         );
-        let last_record_gauge = LAST_RECORD_LSN
+        let last_record_lsn_gauge = LAST_RECORD_LSN
+            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
+            .unwrap();
+
+        let disk_consistent_lsn_gauge = DISK_CONSISTENT_LSN
             .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
             .unwrap();
 
@@ -2578,7 +2601,8 @@ impl TimelineMetrics {
             garbage_collect_histo,
             find_gc_cutoffs_histo,
             load_layer_map_histo,
-            last_record_gauge,
+            last_record_lsn_gauge,
+            disk_consistent_lsn_gauge,
             pitr_history_size,
             archival_size,
             layer_size_image,
@@ -2642,6 +2666,7 @@ impl TimelineMetrics {
         let timeline_id = &self.timeline_id;
         let shard_id = &self.shard_id;
         let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
+        let _ = DISK_CONSISTENT_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
         let _ = FLUSH_WAIT_UPLOAD_TIME.remove_label_values(&[tenant_id, shard_id, timeline_id]);
         let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]);
         {
@@ -2805,6 +2830,7 @@ pub(crate) struct RemoteTimelineClientMetrics {
     calls: Mutex<HashMap<(&'static str, &'static str), IntCounterPair>>,
     bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
     bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
+    pub(crate) projected_remote_consistent_lsn_gauge: UIntGauge,
 }
 
 impl RemoteTimelineClientMetrics {
@@ -2819,6 +2845,10 @@ impl RemoteTimelineClientMetrics {
                 .unwrap(),
         );
 
+        let projected_remote_consistent_lsn_gauge = PROJECTED_REMOTE_CONSISTENT_LSN
+            .get_metric_with_label_values(&[&tenant_id_str, &shard_id_str, &timeline_id_str])
+            .unwrap();
+
         RemoteTimelineClientMetrics {
             tenant_id: tenant_id_str,
             shard_id: shard_id_str,
@@ -2827,6 +2857,7 @@ impl RemoteTimelineClientMetrics {
             bytes_started_counter: Mutex::new(HashMap::default()),
             bytes_finished_counter: Mutex::new(HashMap::default()),
             remote_physical_size_gauge,
+            projected_remote_consistent_lsn_gauge,
         }
     }
 
@@ -3040,6 +3071,7 @@ impl Drop for RemoteTimelineClientMetrics {
             calls,
             bytes_started_counter,
             bytes_finished_counter,
+            projected_remote_consistent_lsn_gauge,
         } = self;
         for ((a, b), _) in calls.get_mut().unwrap().drain() {
             let mut res = [Ok(()), Ok(())];
@@ -3069,6 +3101,14 @@ impl Drop for RemoteTimelineClientMetrics {
             let _ = remote_physical_size_gauge; // use to avoid 'unused' warning in desctructuring above
             let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
         }
+        {
+            let _ = projected_remote_consistent_lsn_gauge;
+            let _ = PROJECTED_REMOTE_CONSISTENT_LSN.remove_label_values(&[
+                tenant_id,
+                shard_id,
+                timeline_id,
+            ]);
+        }
     }
 }
 
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 89b935947d..20e0536a00 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -2192,6 +2192,9 @@ impl RemoteTimelineClient {
                     upload_queue.clean.1 = Some(task.task_id);
 
                     let lsn = upload_queue.clean.0.metadata.disk_consistent_lsn();
+                    self.metrics
+                        .projected_remote_consistent_lsn_gauge
+                        .set(lsn.0);
 
                     if self.generation.is_none() {
                         // Legacy mode: skip validating generation
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index aab6703a3c..bf3d7a74a3 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2392,7 +2392,7 @@ impl Timeline {
 
             result
                 .metrics
-                .last_record_gauge
+                .last_record_lsn_gauge
                 .set(disk_consistent_lsn.0 as i64);
             result
         })
@@ -3514,7 +3514,7 @@ impl Timeline {
     pub(crate) fn finish_write(&self, new_lsn: Lsn) {
         assert!(new_lsn.is_aligned());
 
-        self.metrics.last_record_gauge.set(new_lsn.0 as i64);
+        self.metrics.last_record_lsn_gauge.set(new_lsn.0 as i64);
         self.last_record_lsn.advance(new_lsn);
     }
 
@@ -3882,6 +3882,10 @@ impl Timeline {
     fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool {
         let old_value = self.disk_consistent_lsn.fetch_max(new_value);
         assert!(new_value >= old_value, "disk_consistent_lsn must be growing monotonously at runtime; current {old_value}, offered {new_value}");
+
+        self.metrics
+            .disk_consistent_lsn_gauge
+            .set(new_value.0 as i64);
         new_value != old_value
     }
 
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index ffdbd988a5..1278ed1aef 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -152,6 +152,8 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
     "pageserver_resident_physical_size",
     "pageserver_io_operations_bytes_total",
     "pageserver_last_record_lsn",
+    "pageserver_disk_consistent_lsn",
+    "pageserver_projected_remote_consistent_lsn",
     "pageserver_standby_horizon",
     "pageserver_smgr_query_seconds_bucket",
     "pageserver_smgr_query_seconds_count",

From 7838659197e40ecdb0735c01cb21dd2298492d24 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Fri, 6 Dec 2024 11:24:13 +0100
Subject: [PATCH 076/117] pageserver: assert that keys belong to shard (#9943)

We've seen cases where stray keys end up on the wrong shard. This
shouldn't happen. Add debug assertions to prevent this. In release
builds, we should be lenient in order to handle changing key ownership
policies.

Touches #9914.
---
 libs/pageserver_api/src/shard.rs             |  5 +++--
 libs/utils/src/shard.rs                      |  6 ++++++
 pageserver/src/tenant/timeline.rs            | 19 ++++++++++++++++++-
 pageserver/src/tenant/timeline/compaction.rs | 16 +++++++++++-----
 4 files changed, 38 insertions(+), 8 deletions(-)

diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs
index a5c94a82c1..cf0cd3a46b 100644
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -158,7 +158,8 @@ impl ShardIdentity {
         key_to_shard_number(self.count, self.stripe_size, key)
     }
 
-    /// Return true if the key should be ingested by this shard
+    /// Return true if the key is stored only on this shard. This does not include
+    /// global keys, see is_key_global().
     ///
     /// Shards must ingest _at least_ keys which return true from this check.
     pub fn is_key_local(&self, key: &Key) -> bool {
@@ -171,7 +172,7 @@ impl ShardIdentity {
     }
 
     /// Return true if the key should be stored on all shards, not just one.
-    fn is_key_global(&self, key: &Key) -> bool {
+    pub fn is_key_global(&self, key: &Key) -> bool {
         if key.is_slru_block_key() || key.is_slru_segment_size_key() || key.is_aux_file_key() {
             // Special keys that are only stored on shard 0
             false
diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs
index 782cddc599..6352ea9f92 100644
--- a/libs/utils/src/shard.rs
+++ b/libs/utils/src/shard.rs
@@ -164,6 +164,12 @@ impl TenantShardId {
     }
 }
 
+impl std::fmt::Display for ShardNumber {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        self.0.fmt(f)
+    }
+}
+
 impl std::fmt::Display for ShardSlug<'_> {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         write!(
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index bf3d7a74a3..0657d1af3a 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -53,7 +53,7 @@ use utils::{
     postgres_client::PostgresClientProtocol,
     sync::gate::{Gate, GateGuard},
 };
-use wal_decoder::serialized_batch::SerializedValueBatch;
+use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};
 
 use std::sync::atomic::Ordering as AtomicOrdering;
 use std::sync::{Arc, Mutex, RwLock, Weak};
@@ -5924,6 +5924,23 @@ impl<'a> TimelineWriter<'a> {
             return Ok(());
         }
 
+        // In debug builds, assert that we don't write any keys that don't belong to this shard.
+        // We don't assert this in release builds, since key ownership policies may change over
+        // time. Stray keys will be removed during compaction.
+        if cfg!(debug_assertions) {
+            for metadata in &batch.metadata {
+                if let ValueMeta::Serialized(metadata) = metadata {
+                    let key = Key::from_compact(metadata.key);
+                    assert!(
+                        self.shard_identity.is_key_local(&key)
+                            || self.shard_identity.is_key_global(&key),
+                        "key {key} does not belong on shard {}",
+                        self.shard_identity.shard_index()
+                    );
+                }
+            }
+        }
+
         let batch_max_lsn = batch.max_lsn;
         let buf_size: u64 = batch.buffer_size() as u64;
 
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 8ececa2bfb..7f86ede043 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -1179,11 +1179,12 @@ impl Timeline {
                     .await
                     .map_err(CompactionError::Other)?;
             } else {
-                debug!(
-                    "Dropping key {} during compaction (it belongs on shard {:?})",
-                    key,
-                    self.shard_identity.get_shard_number(&key)
-                );
+                let shard = self.shard_identity.shard_index();
+                let owner = self.shard_identity.get_shard_number(&key);
+                if cfg!(debug_assertions) {
+                    panic!("key {key} does not belong on shard {shard}, owned by {owner}");
+                }
+                debug!("dropping key {key} during compaction (it belongs on shard {owner})");
             }
 
             if !new_layers.is_empty() {
@@ -2054,6 +2055,11 @@ impl Timeline {
                 // This is not handled in the filter iterator because shard is determined by hash.
                 // Therefore, it does not give us any performance benefit to do things like skip
                 // a whole layer file as handling key spaces (ranges).
+                if cfg!(debug_assertions) {
+                    let shard = self.shard_identity.shard_index();
+                    let owner = self.shard_identity.get_shard_number(&key);
+                    panic!("key {key} does not belong on shard {shard}, owned by {owner}");
+                }
                 continue;
             }
             if !job_desc.compaction_key_range.contains(&key) {

From fa07097f2ff12b6560f4122e0654b24e5f9561e2 Mon Sep 17 00:00:00 2001
From: Alexey Kondratov <kondratov.aleksey@gmail.com>
Date: Fri, 6 Dec 2024 12:44:50 +0100
Subject: [PATCH 077/117] chore: Reorganize and refresh CODEOWNERS (#10008)

## Problem

We didn't have a codeowner for `/compute`, so nobody was auto-assigned
for PRs like #9973

## Summary of changes

While on it:
1. Group codeowners into sections.
2. Remove control plane from the `/compute_tools` because it's primarily
the internal `compute_ctl` code.
3. Add control plane (and compute) to `/libs/compute_api` because that's
the shared public interface of the compute.
---
 CODEOWNERS | 33 +++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/CODEOWNERS b/CODEOWNERS
index f41462c98b..71b5e65f94 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -1,16 +1,29 @@
-/.github/ @neondatabase/developer-productivity
-/compute_tools/ @neondatabase/control-plane @neondatabase/compute
-/libs/pageserver_api/ @neondatabase/storage
-/libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage
-/libs/proxy/ @neondatabase/proxy
-/libs/remote_storage/ @neondatabase/storage
-/libs/safekeeper_api/ @neondatabase/storage
+# Autoscaling
 /libs/vm_monitor/ @neondatabase/autoscaling
-/pageserver/ @neondatabase/storage
+
+# DevProd
+/.github/ @neondatabase/developer-productivity
+
+# Compute
 /pgxn/ @neondatabase/compute
-/pgxn/neon/ @neondatabase/compute @neondatabase/storage
+/vendor/ @neondatabase/compute
+/compute/ @neondatabase/compute
+/compute_tools/ @neondatabase/compute
+
+# Proxy
+/libs/proxy/ @neondatabase/proxy
 /proxy/ @neondatabase/proxy
+
+# Storage
+/pageserver/ @neondatabase/storage
 /safekeeper/ @neondatabase/storage
 /storage_controller @neondatabase/storage
 /storage_scrubber @neondatabase/storage
-/vendor/ @neondatabase/compute
+/libs/pageserver_api/ @neondatabase/storage
+/libs/remote_storage/ @neondatabase/storage
+/libs/safekeeper_api/ @neondatabase/storage
+
+# Shared
+/pgxn/neon/ @neondatabase/compute @neondatabase/storage
+/libs/compute_api/ @neondatabase/compute @neondatabase/control-plane
+/libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage

From cc70fc802d2107122b330dba6ce8e2d8f8799189 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Fri, 6 Dec 2024 12:51:41 +0000
Subject: [PATCH 078/117] pageserver: add metric for number of wal records
 received by each shard (#10035)

## Problem

With the current metrics we can't identify which shards are ingesting
data at any given time.

## Summary of changes

Add a metric for the number of wal records received for processing by
each shard. This is per (tenant, timeline, shard).
---
 pageserver/src/metrics.rs                     | 20 +++++++++++++++++++
 .../walreceiver/walreceiver_connection.rs     |  8 ++++++++
 test_runner/fixtures/metrics.py               |  1 +
 3 files changed, 29 insertions(+)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index e3b6f43bc4..62bf9acf01 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -2204,6 +2204,15 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMet
     .expect("failed to define a metric"),
 });
 
+pub(crate) static PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_timeline_wal_records_received",
+        "Number of WAL records received per shard",
+        &["tenant_id", "shard_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
     register_histogram!(
         "pageserver_wal_redo_seconds",
@@ -2431,6 +2440,7 @@ pub(crate) struct TimelineMetrics {
     pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
     /// Number of valid LSN leases.
     pub valid_lsn_lease_count_gauge: UIntGauge,
+    pub wal_records_received: IntCounter,
     shutdown: std::sync::atomic::AtomicBool,
 }
 
@@ -2588,6 +2598,10 @@ impl TimelineMetrics {
             .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
             .unwrap();
 
+        let wal_records_received = PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED
+            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
+            .unwrap();
+
         TimelineMetrics {
             tenant_id,
             shard_id,
@@ -2620,6 +2634,7 @@ impl TimelineMetrics {
                 evictions_with_low_residence_duration,
             ),
             valid_lsn_lease_count_gauge,
+            wal_records_received,
             shutdown: std::sync::atomic::AtomicBool::default(),
         }
     }
@@ -2757,6 +2772,11 @@ impl TimelineMetrics {
             shard_id,
             timeline_id,
         ]);
+        let _ = PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED.remove_label_values(&[
+            tenant_id,
+            shard_id,
+            timeline_id,
+        ]);
     }
 }
 
diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
index d90ffbfa2c..3f10eeda60 100644
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -369,6 +369,13 @@ pub(super) async fn handle_walreceiver_connection(
                 // advances it to its end LSN. 0 is just an initialization placeholder.
                 let mut modification = timeline.begin_modification(Lsn(0));
 
+                if !records.is_empty() {
+                    timeline
+                        .metrics
+                        .wal_records_received
+                        .inc_by(records.len() as u64);
+                }
+
                 for interpreted in records {
                     if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes)
                         && uncommitted_records > 0
@@ -510,6 +517,7 @@ pub(super) async fn handle_walreceiver_connection(
                         }
 
                         // Ingest the records without immediately committing them.
+                        timeline.metrics.wal_records_received.inc();
                         let ingested = walingest
                             .ingest_record(interpreted, &mut modification, &ctx)
                             .await
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index 1278ed1aef..52ed7da36b 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -175,6 +175,7 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
     counter("pageserver_tenant_throttling_count_accounted_finish"),
     counter("pageserver_tenant_throttling_wait_usecs_sum"),
     counter("pageserver_tenant_throttling_count"),
+    counter("pageserver_timeline_wal_records_received"),
     *histogram("pageserver_page_service_batch_size"),
     *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
     # "pageserver_directory_entries_count", -- only used if above a certain threshold

From 14c4fae64af5613c682ec7dd7d30e484c476e5af Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Fri, 6 Dec 2024 16:17:15 +0100
Subject: [PATCH 079/117] test_runner/performance: add improved bulk insert
 benchmark (#9812)

Adds an improved bulk insert benchmark, including S3 uploads.

Touches #9789.
---
 test_runner/fixtures/pageserver/utils.py      |  22 +--
 .../performance/test_ingest_insert_bulk.py    | 142 ++++++++++++++++++
 2 files changed, 149 insertions(+), 15 deletions(-)
 create mode 100644 test_runner/performance/test_ingest_insert_bulk.py

diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
index 7c10edc5fc..66f61f9b4c 100644
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -54,23 +54,15 @@ def wait_for_upload(
     tenant: TenantId | TenantShardId,
     timeline: TimelineId,
     lsn: Lsn,
+    timeout=20,
 ):
-    """waits for local timeline upload up to specified lsn"""
+    """Waits for local timeline upload up to specified LSN"""
 
-    current_lsn = Lsn(0)
-    for i in range(20):
-        current_lsn = remote_consistent_lsn(pageserver_http, tenant, timeline)
-        if current_lsn >= lsn:
-            log.info("wait finished")
-            return
-        lr_lsn = last_record_lsn(pageserver_http, tenant, timeline)
-        log.info(
-            f"waiting for remote_consistent_lsn to reach {lsn}, now {current_lsn}, last_record_lsn={lr_lsn}, iteration {i + 1}"
-        )
-        time.sleep(1)
-    raise Exception(
-        f"timed out while waiting for {tenant}/{timeline} remote_consistent_lsn to reach {lsn}, was {current_lsn}"
-    )
+    def is_uploaded():
+        remote_lsn = remote_consistent_lsn(pageserver_http, tenant, timeline)
+        assert remote_lsn >= lsn, f"remote_consistent_lsn at {remote_lsn}"
+
+    wait_until(is_uploaded, name=f"upload to {lsn}", timeout=timeout)
 
 
 def _tenant_in_expected_state(tenant_info: dict[str, Any], expected_state: str):
diff --git a/test_runner/performance/test_ingest_insert_bulk.py b/test_runner/performance/test_ingest_insert_bulk.py
new file mode 100644
index 0000000000..283bcada31
--- /dev/null
+++ b/test_runner/performance/test_ingest_insert_bulk.py
@@ -0,0 +1,142 @@
+from __future__ import annotations
+
+import random
+from concurrent.futures import ThreadPoolExecutor
+
+import pytest
+from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
+from fixtures.common_types import Lsn
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnvBuilder,
+    wait_for_last_flush_lsn,
+)
+from fixtures.pageserver.utils import (
+    wait_for_last_record_lsn,
+    wait_for_upload,
+    wait_for_upload_queue_empty,
+)
+from fixtures.remote_storage import s3_storage
+
+
+@pytest.mark.timeout(900)
+@pytest.mark.parametrize("size", [8, 1024, 8192])
+@pytest.mark.parametrize("s3", [True, False], ids=["s3", "local"])
+@pytest.mark.parametrize("backpressure", [True, False], ids=["backpressure", "nobackpressure"])
+@pytest.mark.parametrize("fsync", [True, False], ids=["fsync", "nofsync"])
+def test_ingest_insert_bulk(
+    request: pytest.FixtureRequest,
+    neon_env_builder: NeonEnvBuilder,
+    zenbenchmark: NeonBenchmarker,
+    fsync: bool,
+    backpressure: bool,
+    s3: bool,
+    size: int,
+):
+    """
+    Benchmarks ingestion of 5 GB of sequential insert WAL. Measures ingestion and S3 upload
+    separately. Also does a Safekeeper→Pageserver re-ingestion to measure Pageserver ingestion in
+    isolation.
+    """
+
+    CONCURRENCY = 1  # 1 is optimal without fsync or backpressure
+    VOLUME = 5 * 1024**3
+    rows = VOLUME // (size + 64)  # +64 roughly accounts for per-row WAL overhead
+
+    neon_env_builder.safekeepers_enable_fsync = fsync
+
+    if s3:
+        neon_env_builder.enable_pageserver_remote_storage(s3_storage())
+        # NB: don't use S3 for Safekeeper. It doesn't affect throughput (no backpressure), but it
+        # would compete with Pageserver for bandwidth.
+        # neon_env_builder.enable_safekeeper_remote_storage(s3_storage())
+
+    neon_env_builder.disable_scrub_on_exit()  # immediate shutdown may leave stray layers
+    env = neon_env_builder.init_start()
+
+    endpoint = env.endpoints.create_start(
+        "main",
+        config_lines=[
+            f"fsync = {fsync}",
+            "max_replication_apply_lag = 0",
+            f"max_replication_flush_lag = {'10GB' if backpressure else '0'}",
+            # NB: neon_local defaults to 15MB, which is too slow -- production uses 500MB.
+            f"max_replication_write_lag = {'500MB' if backpressure else '0'}",
+        ],
+    )
+    endpoint.safe_psql("create extension neon")
+
+    # Wait for the timeline to be propagated to the pageserver.
+    wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
+
+    # Ingest rows.
+    log.info("Ingesting data")
+    start_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0])
+
+    def insert_rows(endpoint, table, count, value):
+        with endpoint.connect().cursor() as cur:
+            cur.execute("set statement_timeout = 0")
+            cur.execute(f"create table {table} (id int, data bytea)")
+            cur.execute(f"insert into {table} values (generate_series(1, {count}), %s)", (value,))
+
+    with zenbenchmark.record_duration("upload"):
+        with zenbenchmark.record_duration("ingest"):
+            with ThreadPoolExecutor(max_workers=CONCURRENCY) as pool:
+                for i in range(CONCURRENCY):
+                    # Write a random value for all rows. This is sufficient to prevent compression,
+                    # e.g. in TOAST. Randomly generating every row is too slow.
+                    value = random.randbytes(size)
+                    worker_rows = rows / CONCURRENCY
+                    pool.submit(insert_rows, endpoint, f"table{i}", worker_rows, value)
+
+        end_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0])
+
+        # Wait for pageserver to ingest the WAL.
+        client = env.pageserver.http_client()
+        wait_for_last_record_lsn(client, env.initial_tenant, env.initial_timeline, end_lsn)
+
+        # Wait for pageserver S3 upload. Checkpoint to flush the last in-memory layer.
+        client.timeline_checkpoint(
+            env.initial_tenant,
+            env.initial_timeline,
+            compact=False,
+            wait_until_flushed=False,
+        )
+        wait_for_upload(client, env.initial_tenant, env.initial_timeline, end_lsn, timeout=600)
+
+    # Empty out upload queue for next benchmark.
+    wait_for_upload_queue_empty(client, env.initial_tenant, env.initial_timeline)
+
+    backpressure_time = endpoint.safe_psql("select backpressure_throttling_time()")[0][0]
+
+    # Now that all data is ingested, delete and recreate the tenant in the pageserver. This will
+    # reingest all the WAL directly from the safekeeper. This gives us a baseline of how fast the
+    # pageserver can ingest this WAL in isolation.
+    status = env.storage_controller.inspect(tenant_shard_id=env.initial_tenant)
+    assert status is not None
+
+    endpoint.stop()  # avoid spurious getpage errors
+    client.tenant_delete(env.initial_tenant)
+    env.pageserver.tenant_create(tenant_id=env.initial_tenant, generation=status[0])
+
+    with zenbenchmark.record_duration("recover"):
+        log.info("Recovering WAL into pageserver")
+        client.timeline_create(env.pg_version, env.initial_tenant, env.initial_timeline)
+        wait_for_last_record_lsn(client, env.initial_tenant, env.initial_timeline, end_lsn)
+
+    # Emit metrics.
+    wal_written_mb = round((end_lsn - start_lsn) / (1024 * 1024))
+    zenbenchmark.record("wal_written", wal_written_mb, "MB", MetricReport.TEST_PARAM)
+    zenbenchmark.record("row_count", rows, "rows", MetricReport.TEST_PARAM)
+    zenbenchmark.record("concurrency", CONCURRENCY, "clients", MetricReport.TEST_PARAM)
+    zenbenchmark.record(
+        "backpressure_time", backpressure_time // 1000, "ms", MetricReport.LOWER_IS_BETTER
+    )
+
+    props = {p["name"]: p["value"] for _, p in request.node.user_properties}
+    for name in ("ingest", "upload", "recover"):
+        throughput = int(wal_written_mb / props[name])
+        zenbenchmark.record(f"{name}_throughput", throughput, "MB/s", MetricReport.HIGHER_IS_BETTER)
+
+    # Pageserver shutdown will likely get stuck on the upload queue, just shut it down immediately.
+    env.stop(immediate=True)

From e4837b0a5a65e8515949fad634d147cb2c2a8caf Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Fri, 6 Dec 2024 11:43:55 -0600
Subject: [PATCH 080/117] Bump sql_exporter to 0.16.0 (#10041)

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 build-tools.Dockerfile                      | 2 +-
 compute/compute-node.Dockerfile             | 2 +-
 test_runner/regress/test_compute_metrics.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile
index 2671702697..fa84e467ad 100644
--- a/build-tools.Dockerfile
+++ b/build-tools.Dockerfile
@@ -115,7 +115,7 @@ RUN set -e \
 
 # Keep the version the same as in compute/compute-node.Dockerfile and
 # test_runner/regress/test_compute_metrics.py.
-ENV SQL_EXPORTER_VERSION=0.13.1
+ENV SQL_EXPORTER_VERSION=0.16.0
 RUN curl -fsSL \
     "https://github.com/burningalchemist/sql_exporter/releases/download/${SQL_EXPORTER_VERSION}/sql_exporter-${SQL_EXPORTER_VERSION}.linux-$(case "$(uname -m)" in x86_64) echo amd64;; aarch64) echo arm64;; esac).tar.gz" \
     --output sql_exporter.tar.gz \
diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile
index bf6311bf2b..33d2a10285 100644
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -1324,7 +1324,7 @@ FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter
 
 # Keep the version the same as in build-tools.Dockerfile and
 # test_runner/regress/test_compute_metrics.py.
-FROM burningalchemist/sql_exporter:0.13.1 AS sql-exporter
+FROM burningalchemist/sql_exporter:0.16.0 AS sql-exporter
 
 #########################################################################################
 #
diff --git a/test_runner/regress/test_compute_metrics.py b/test_runner/regress/test_compute_metrics.py
index 1b15c5f15e..787790103f 100644
--- a/test_runner/regress/test_compute_metrics.py
+++ b/test_runner/regress/test_compute_metrics.py
@@ -215,7 +215,7 @@ if SQL_EXPORTER is None:
             #
             # The "host" network mode allows sql_exporter to talk to the
             # endpoint which is running on the host.
-            super().__init__("docker.io/burningalchemist/sql_exporter:0.13.1", network_mode="host")
+            super().__init__("docker.io/burningalchemist/sql_exporter:0.16.0", network_mode="host")
 
             self.__logs_dir = logs_dir
             self.__port = port

From c42c28b339289a872400a4e9f0d1b4cc02048354 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Fri, 6 Dec 2024 13:44:26 -0500
Subject: [PATCH 081/117] feat(pageserver): gc-compaction split job and partial
 scheduler (#9897)

## Problem

part of https://github.com/neondatabase/neon/issues/9114, stacked PR
over #9809

The compaction scheduler now schedules partial compaction jobs.

## Summary of changes

* Add the compaction job splitter based on size.
* Schedule subcompactions using the compaction scheduler.
* Test subcompaction scheduler in the smoke regress test.
* Temporarily disable layer map checks

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/http/routes.rs                |  10 +-
 pageserver/src/tenant.rs                     |  49 +++++-
 pageserver/src/tenant/timeline.rs            |   7 +
 pageserver/src/tenant/timeline/compaction.rs | 162 +++++++++++++++++--
 test_runner/regress/test_compaction.py       |   1 +
 5 files changed, 209 insertions(+), 20 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index b7fddb065c..0f11bbc507 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -2036,15 +2036,23 @@ async fn timeline_compact_handler(
         parse_query_param::<_, bool>(&request, "wait_until_scheduled_compaction_done")?
             .unwrap_or(false);
 
+    let sub_compaction = compact_request
+        .as_ref()
+        .map(|r| r.sub_compaction)
+        .unwrap_or(false);
     let options = CompactOptions {
         compact_range: compact_request
             .as_ref()
             .and_then(|r| r.compact_range.clone()),
         compact_below_lsn: compact_request.as_ref().and_then(|r| r.compact_below_lsn),
         flags,
+        sub_compaction,
     };
 
-    let scheduled = compact_request.map(|r| r.scheduled).unwrap_or(false);
+    let scheduled = compact_request
+        .as_ref()
+        .map(|r| r.scheduled)
+        .unwrap_or(false);
 
     async {
         let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 306ec9f548..4a9c44aefd 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -49,6 +49,7 @@ use timeline::import_pgdata;
 use timeline::offload::offload_timeline;
 use timeline::CompactFlags;
 use timeline::CompactOptions;
+use timeline::CompactionError;
 use timeline::ShutdownMode;
 use tokio::io::BufReader;
 use tokio::sync::watch;
@@ -2987,10 +2988,16 @@ impl Tenant {
                 if has_pending_l0_compaction_task {
                     Some(true)
                 } else {
-                    let has_pending_scheduled_compaction_task;
+                    let mut has_pending_scheduled_compaction_task;
                     let next_scheduled_compaction_task = {
                         let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
                         if let Some(tline_pending_tasks) = guard.get_mut(timeline_id) {
+                            if !tline_pending_tasks.is_empty() {
+                                info!(
+                                    "{} tasks left in the compaction schedule queue",
+                                    tline_pending_tasks.len()
+                                );
+                            }
                             let next_task = tline_pending_tasks.pop_front();
                             has_pending_scheduled_compaction_task = !tline_pending_tasks.is_empty();
                             next_task
@@ -3007,6 +3014,32 @@ impl Tenant {
                             .contains(CompactFlags::EnhancedGcBottomMostCompaction)
                         {
                             warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", next_scheduled_compaction_task.options);
+                        } else if next_scheduled_compaction_task.options.sub_compaction {
+                            info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs");
+                            let jobs = timeline
+                                .gc_compaction_split_jobs(next_scheduled_compaction_task.options)
+                                .await
+                                .map_err(CompactionError::Other)?;
+                            if jobs.is_empty() {
+                                info!("no jobs to run, skipping scheduled compaction task");
+                            } else {
+                                has_pending_scheduled_compaction_task = true;
+                                let jobs_len = jobs.len();
+                                let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
+                                let tline_pending_tasks = guard.entry(*timeline_id).or_default();
+                                for (idx, job) in jobs.into_iter().enumerate() {
+                                    tline_pending_tasks.push_back(ScheduledCompactionTask {
+                                        options: job,
+                                        result_tx: if idx == jobs_len - 1 {
+                                            // The last compaction job sends the completion signal
+                                            next_scheduled_compaction_task.result_tx.take()
+                                        } else {
+                                            None
+                                        },
+                                    });
+                                }
+                                info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len);
+                            }
                         } else {
                             let _ = timeline
                                 .compact_with_options(
@@ -9244,7 +9277,7 @@ mod tests {
                 CompactOptions {
                     flags: dryrun_flags,
                     compact_range: None,
-                    compact_below_lsn: None,
+                    ..Default::default()
                 },
                 &ctx,
             )
@@ -9481,7 +9514,7 @@ mod tests {
                 CompactOptions {
                     flags: dryrun_flags,
                     compact_range: None,
-                    compact_below_lsn: None,
+                    ..Default::default()
                 },
                 &ctx,
             )
@@ -9973,7 +10006,7 @@ mod tests {
                 CompactOptions {
                     flags: EnumSet::new(),
                     compact_range: Some((get_key(0)..get_key(2)).into()),
-                    compact_below_lsn: None,
+                    ..Default::default()
                 },
                 &ctx,
             )
@@ -10020,7 +10053,7 @@ mod tests {
                 CompactOptions {
                     flags: EnumSet::new(),
                     compact_range: Some((get_key(2)..get_key(4)).into()),
-                    compact_below_lsn: None,
+                    ..Default::default()
                 },
                 &ctx,
             )
@@ -10072,7 +10105,7 @@ mod tests {
                 CompactOptions {
                     flags: EnumSet::new(),
                     compact_range: Some((get_key(4)..get_key(9)).into()),
-                    compact_below_lsn: None,
+                    ..Default::default()
                 },
                 &ctx,
             )
@@ -10123,7 +10156,7 @@ mod tests {
                 CompactOptions {
                     flags: EnumSet::new(),
                     compact_range: Some((get_key(9)..get_key(10)).into()),
-                    compact_below_lsn: None,
+                    ..Default::default()
                 },
                 &ctx,
             )
@@ -10179,7 +10212,7 @@ mod tests {
                 CompactOptions {
                     flags: EnumSet::new(),
                     compact_range: Some((get_key(0)..get_key(10)).into()),
-                    compact_below_lsn: None,
+                    ..Default::default()
                 },
                 &ctx,
             )
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 0657d1af3a..8f1d5f6577 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -785,6 +785,9 @@ pub(crate) struct CompactRequest {
     /// Whether the compaction job should be scheduled.
     #[serde(default)]
     pub scheduled: bool,
+    /// Whether the compaction job should be split across key ranges.
+    #[serde(default)]
+    pub sub_compaction: bool,
 }
 
 #[serde_with::serde_as]
@@ -814,6 +817,9 @@ pub(crate) struct CompactOptions {
     /// If set, the compaction will only compact the LSN below this value.
     /// This option is only used by GC compaction.
     pub compact_below_lsn: Option<Lsn>,
+    /// Enable sub-compaction (split compaction job across key ranges).
+    /// This option is only used by GC compaction.
+    pub sub_compaction: bool,
 }
 
 impl std::fmt::Debug for Timeline {
@@ -1637,6 +1643,7 @@ impl Timeline {
                 flags,
                 compact_range: None,
                 compact_below_lsn: None,
+                sub_compaction: false,
             },
             ctx,
         )
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 7f86ede043..a18e157d37 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -10,8 +10,8 @@ use std::sync::Arc;
 
 use super::layer_manager::LayerManager;
 use super::{
-    CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode,
-    RecordedDuration, Timeline,
+    CompactFlags, CompactOptions, CompactRange, CreateImageLayersError, DurationRecorder,
+    ImageLayerCreationMode, RecordedDuration, Timeline,
 };
 
 use anyhow::{anyhow, bail, Context};
@@ -29,7 +29,6 @@ use utils::id::TimelineId;
 use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
 use crate::page_cache;
 use crate::statvfs::Statvfs;
-use crate::tenant::checks::check_valid_layermap;
 use crate::tenant::remote_timeline_client::WaitCompletionError;
 use crate::tenant::storage_layer::batch_split_writer::{
     BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter,
@@ -1752,6 +1751,116 @@ impl Timeline {
         Ok(())
     }
 
+    /// Split a gc-compaction job into multiple compaction jobs. Optimally, this function should return a vector of
+    /// `GcCompactionJobDesc`. But we want to keep it simple on the tenant scheduling side without exposing too much
+    /// ad-hoc information about gc compaction itself.
+    pub(crate) async fn gc_compaction_split_jobs(
+        self: &Arc<Self>,
+        options: CompactOptions,
+    ) -> anyhow::Result<Vec<CompactOptions>> {
+        if !options.sub_compaction {
+            return Ok(vec![options]);
+        }
+        let compact_range = options.compact_range.clone().unwrap_or(CompactRange {
+            start: Key::MIN,
+            end: Key::MAX,
+        });
+        let compact_below_lsn = if let Some(compact_below_lsn) = options.compact_below_lsn {
+            compact_below_lsn
+        } else {
+            let gc_info = self.gc_info.read().unwrap();
+            gc_info.cutoffs.select_min() // use the real gc cutoff
+        };
+        let mut compact_jobs = Vec::new();
+        // For now, we simply use the key partitioning information; we should do a more fine-grained partitioning
+        // by estimating the amount of files read for a compaction job. We should also partition on LSN.
+        let Ok(partition) = self.partitioning.try_lock() else {
+            bail!("failed to acquire partition lock");
+        };
+        let ((dense_ks, sparse_ks), _) = &*partition;
+        // Truncate the key range to be within user specified compaction range.
+        fn truncate_to(
+            source_start: &Key,
+            source_end: &Key,
+            target_start: &Key,
+            target_end: &Key,
+        ) -> Option<(Key, Key)> {
+            let start = source_start.max(target_start);
+            let end = source_end.min(target_end);
+            if start < end {
+                Some((*start, *end))
+            } else {
+                None
+            }
+        }
+        let mut split_key_ranges = Vec::new();
+        let ranges = dense_ks
+            .parts
+            .iter()
+            .map(|partition| partition.ranges.iter())
+            .chain(sparse_ks.parts.iter().map(|x| x.0.ranges.iter()))
+            .flatten()
+            .cloned()
+            .collect_vec();
+        for range in ranges.iter() {
+            let Some((start, end)) = truncate_to(
+                &range.start,
+                &range.end,
+                &compact_range.start,
+                &compact_range.end,
+            ) else {
+                continue;
+            };
+            split_key_ranges.push((start, end));
+        }
+        split_key_ranges.sort();
+        let guard = self.layers.read().await;
+        let layer_map = guard.layer_map()?;
+        let mut current_start = None;
+        // Split compaction job to about 2GB each
+        const GC_COMPACT_MAX_SIZE_MB: u64 = 4 * 1024; // 4GB, TODO: should be configuration in the future
+        let ranges_num = split_key_ranges.len();
+        for (idx, (start, end)) in split_key_ranges.into_iter().enumerate() {
+            if current_start.is_none() {
+                current_start = Some(start);
+            }
+            let start = current_start.unwrap();
+            if start >= end {
+                // We have already processed this partition.
+                continue;
+            }
+            let res = layer_map.range_search(start..end, compact_below_lsn);
+            let total_size = res.found.keys().map(|x| x.layer.file_size()).sum::<u64>();
+            if total_size > GC_COMPACT_MAX_SIZE_MB * 1024 * 1024 || ranges_num == idx + 1 {
+                let mut compact_options = options.clone();
+                // Try to extend the compaction range so that we include at least one full layer file.
+                let extended_end = res
+                    .found
+                    .keys()
+                    .map(|layer| layer.layer.key_range.end)
+                    .min();
+                // It is possible that the search range does not contain any layer files when we reach the end of the loop.
+                // In this case, we simply use the specified key range end.
+                let end = if let Some(extended_end) = extended_end {
+                    extended_end.max(end)
+                } else {
+                    end
+                };
+                info!(
+                    "splitting compaction job: {}..{}, estimated_size={}",
+                    start, end, total_size
+                );
+                compact_options.compact_range = Some(CompactRange { start, end });
+                compact_options.compact_below_lsn = Some(compact_below_lsn);
+                compact_options.sub_compaction = false;
+                compact_jobs.push(compact_options);
+                current_start = Some(end);
+            }
+        }
+        drop(guard);
+        Ok(compact_jobs)
+    }
+
     /// An experimental compaction building block that combines compaction with garbage collection.
     ///
     /// The current implementation picks all delta + image layers that are below or intersecting with
@@ -1774,6 +1883,36 @@ impl Timeline {
         options: CompactOptions,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
+        if options.sub_compaction {
+            info!("running enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs");
+            let jobs = self.gc_compaction_split_jobs(options).await?;
+            let jobs_len = jobs.len();
+            for (idx, job) in jobs.into_iter().enumerate() {
+                info!(
+                    "running enhanced gc bottom-most compaction, sub-compaction {}/{}",
+                    idx + 1,
+                    jobs_len
+                );
+                self.compact_with_gc_inner(cancel, job, ctx).await?;
+            }
+            if jobs_len == 0 {
+                info!("no jobs to run, skipping gc bottom-most compaction");
+            }
+            return Ok(());
+        }
+        self.compact_with_gc_inner(cancel, options, ctx).await
+    }
+
+    async fn compact_with_gc_inner(
+        self: &Arc<Self>,
+        cancel: &CancellationToken,
+        options: CompactOptions,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        assert!(
+            !options.sub_compaction,
+            "sub-compaction should be handled by the outer function"
+        );
         // Block other compaction/GC tasks from running for now. GC-compaction could run along
         // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc.
         // Note that we already acquired the compaction lock when the outer `compact` function gets called.
@@ -1943,14 +2082,15 @@ impl Timeline {
 
         // Step 1: construct a k-merge iterator over all layers.
         // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point.
-        let layer_names = job_desc
-            .selected_layers
-            .iter()
-            .map(|layer| layer.layer_desc().layer_name())
-            .collect_vec();
-        if let Some(err) = check_valid_layermap(&layer_names) {
-            warn!("gc-compaction layer map check failed because {}, this is normal if partial compaction is not finished yet", err);
-        }
+        // disable the check for now because we need to adjust the check for partial compactions, will enable later.
+        // let layer_names = job_desc
+        //     .selected_layers
+        //     .iter()
+        //     .map(|layer| layer.layer_desc().layer_name())
+        //     .collect_vec();
+        // if let Some(err) = check_valid_layermap(&layer_names) {
+        //     warn!("gc-compaction layer map check failed because {}, this is normal if partial compaction is not finished yet", err);
+        // }
         // The maximum LSN we are processing in this compaction loop
         let end_lsn = job_desc
             .selected_layers
diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index de6653eb3f..e92dc47f39 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -159,6 +159,7 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder):
             enhanced_gc_bottom_most_compaction=True,
             body={
                 "scheduled": True,
+                "sub_compaction": True,
                 "compact_range": {
                     "start": "000000000000000000000000000000000000",
                     # skip the SLRU range for now -- it races with get-lsn-by-timestamp, TODO: fix this

From b6eea655976ad7ebffd9b7edbf193850d2b2b05b Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Fri, 6 Dec 2024 22:56:57 +0200
Subject: [PATCH 082/117] Fix error message if PS connection is lost while
 receiving prefetch (#9923)

If the pageserver connection is lost while receiving the prefetch
request, the prefetch queue is cleared. The error message prints the
values from the prefetch slot, but because the slot was already cleared,
they're all zeros:

LOG: [NEON_SMGR] [shard 0] No response from reading prefetch entry 0:
0/0/0.0 block 0. This can be caused by a concurrent disconnect

To fix, make local copies of the values.

In the passing, also add a sanity check that if the receive() call
succeeds, the prefetch slot is still intact.
---
 pgxn/neon/pagestore_smgr.c | 35 ++++++++++++++++++++++++++++++-----
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index a5e0c402fb..880c0de64e 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -610,6 +610,9 @@ prefetch_read(PrefetchRequest *slot)
 {
 	NeonResponse *response;
 	MemoryContext old;
+	BufferTag	buftag;
+	shardno_t	shard_no;
+	uint64		my_ring_index;
 
 	Assert(slot->status == PRFS_REQUESTED);
 	Assert(slot->response == NULL);
@@ -623,11 +626,29 @@ prefetch_read(PrefetchRequest *slot)
 					   slot->status, slot->response,
 					   (long)slot->my_ring_index, (long)MyPState->ring_receive);
 
+	/*
+	 * Copy the request info so that if an error happens and the prefetch
+	 * queue is flushed during the receive call, we can print the original
+	 * values in the error message
+	 */
+	buftag = slot->buftag;
+	shard_no = slot->shard_no;
+	my_ring_index = slot->my_ring_index;
+
 	old = MemoryContextSwitchTo(MyPState->errctx);
-	response = (NeonResponse *) page_server->receive(slot->shard_no);
+	response = (NeonResponse *) page_server->receive(shard_no);
 	MemoryContextSwitchTo(old);
 	if (response)
 	{
+		/* The slot should still be valid */
+		if (slot->status != PRFS_REQUESTED ||
+			slot->response != NULL ||
+			slot->my_ring_index != MyPState->ring_receive)
+			neon_shard_log(shard_no, ERROR,
+						   "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu",
+						   slot->status, slot->response,
+						   (long) slot->my_ring_index, (long) MyPState->ring_receive);
+
 		/* update prefetch state */
 		MyPState->n_responses_buffered += 1;
 		MyPState->n_requests_inflight -= 1;
@@ -642,11 +663,15 @@ prefetch_read(PrefetchRequest *slot)
 	}
 	else
 	{
-		neon_shard_log(slot->shard_no, LOG,
+		/*
+		 * Note: The slot might no longer be valid, if the connection was lost
+		 * and the prefetch queue was flushed during the receive call
+		 */
+		neon_shard_log(shard_no, LOG,
 					   "No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect",
-					   (long)slot->my_ring_index,
-					   RelFileInfoFmt(BufTagGetNRelFileInfo(slot->buftag)),
-					   slot->buftag.forkNum, slot->buftag.blockNum);
+					   (long) my_ring_index,
+					   RelFileInfoFmt(BufTagGetNRelFileInfo(buftag)),
+					   buftag.forkNum, buftag.blockNum);
 		return false;
 	}
 }

From b1fd086c0c974447376d23cd6e3baf4f8248a1ce Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Fri, 6 Dec 2024 17:30:04 -0500
Subject: [PATCH 083/117] test(pageserver): disable gc_compaction smoke test
 for now (#10045)

## Problem

The test is flaky.

## Summary of changes

Disable the test.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 test_runner/regress/test_compaction.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index e92dc47f39..881503046c 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -121,6 +121,9 @@ page_cache_size=10
     assert vectored_average < 8
 
 
+@pytest.mark.skip(
+    "This is being fixed and tracked in https://github.com/neondatabase/neon/issues/9114"
+)
 @skip_in_debug_build("only run with release build")
 def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder):
     SMOKE_CONF = {

From 4d7111f240062e161af1c298ffc5c28b5ed695fe Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Sat, 7 Dec 2024 09:57:55 +0100
Subject: [PATCH 084/117] page_service: don't count time spent flushing towards
 smgr latency metrics (#10042)

## Problem

In #9962 I changed the smgr metrics to include time spent on flush.

It isn't under our (=storage team's) control how long that flush takes
because the client can stop reading requests.

## Summary of changes

Stop the timer as soon as we've buffered up the response in the
`pgb_writer`.

Track flush time in a separate metric.

---------

Co-authored-by: Yuchen Liang <70461588+yliang412@users.noreply.github.com>
---
 pageserver/src/metrics.rs       | 138 +++++++++++++++++++++++++++++---
 pageserver/src/page_service.rs  |  76 ++++++++++++------
 test_runner/fixtures/metrics.py |   1 +
 3 files changed, 179 insertions(+), 36 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 62bf9acf01..96ee157856 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1223,31 +1223,60 @@ pub(crate) mod virtual_file_io_engine {
     });
 }
 
-pub(crate) struct SmgrOpTimer {
+pub(crate) struct SmgrOpTimer(Option<SmgrOpTimerInner>);
+pub(crate) struct SmgrOpTimerInner {
     global_latency_histo: Histogram,
 
     // Optional because not all op types are tracked per-timeline
     per_timeline_latency_histo: Option<Histogram>,
 
+    global_flush_in_progress_micros: IntCounter,
+    per_timeline_flush_in_progress_micros: IntCounter,
+
     start: Instant,
     throttled: Duration,
     op: SmgrQueryType,
 }
 
+pub(crate) struct SmgrOpFlushInProgress {
+    base: Instant,
+    global_micros: IntCounter,
+    per_timeline_micros: IntCounter,
+}
+
 impl SmgrOpTimer {
     pub(crate) fn deduct_throttle(&mut self, throttle: &Option<Duration>) {
         let Some(throttle) = throttle else {
             return;
         };
-        self.throttled += *throttle;
+        let inner = self.0.as_mut().expect("other public methods consume self");
+        inner.throttled += *throttle;
     }
-}
 
-impl Drop for SmgrOpTimer {
-    fn drop(&mut self) {
-        let elapsed = self.start.elapsed();
+    pub(crate) fn observe_smgr_op_completion_and_start_flushing(mut self) -> SmgrOpFlushInProgress {
+        let (flush_start, inner) = self
+            .smgr_op_end()
+            .expect("this method consume self, and the only other caller is drop handler");
+        let SmgrOpTimerInner {
+            global_flush_in_progress_micros,
+            per_timeline_flush_in_progress_micros,
+            ..
+        } = inner;
+        SmgrOpFlushInProgress {
+            base: flush_start,
+            global_micros: global_flush_in_progress_micros,
+            per_timeline_micros: per_timeline_flush_in_progress_micros,
+        }
+    }
 
-        let elapsed = match elapsed.checked_sub(self.throttled) {
+    /// Returns `None`` if this method has already been called, `Some` otherwise.
+    fn smgr_op_end(&mut self) -> Option<(Instant, SmgrOpTimerInner)> {
+        let inner = self.0.take()?;
+
+        let now = Instant::now();
+        let elapsed = now - inner.start;
+
+        let elapsed = match elapsed.checked_sub(inner.throttled) {
             Some(elapsed) => elapsed,
             None => {
                 use utils::rate_limit::RateLimit;
@@ -1258,9 +1287,9 @@ impl Drop for SmgrOpTimer {
                         })))
                     });
                 let mut guard = LOGGED.lock().unwrap();
-                let rate_limit = &mut guard[self.op];
+                let rate_limit = &mut guard[inner.op];
                 rate_limit.call(|| {
-                    warn!(op=?self.op, ?elapsed, ?self.throttled, "implementation error: time spent throttled exceeds total request wall clock time");
+                    warn!(op=?inner.op, ?elapsed, ?inner.throttled, "implementation error: time spent throttled exceeds total request wall clock time");
                 });
                 elapsed // un-throttled time, more info than just saturating to 0
             }
@@ -1268,10 +1297,54 @@ impl Drop for SmgrOpTimer {
 
         let elapsed = elapsed.as_secs_f64();
 
-        self.global_latency_histo.observe(elapsed);
-        if let Some(per_timeline_getpage_histo) = &self.per_timeline_latency_histo {
+        inner.global_latency_histo.observe(elapsed);
+        if let Some(per_timeline_getpage_histo) = &inner.per_timeline_latency_histo {
             per_timeline_getpage_histo.observe(elapsed);
         }
+
+        Some((now, inner))
+    }
+}
+
+impl Drop for SmgrOpTimer {
+    fn drop(&mut self) {
+        self.smgr_op_end();
+    }
+}
+
+impl SmgrOpFlushInProgress {
+    pub(crate) async fn measure<Fut, O>(mut self, mut fut: Fut) -> O
+    where
+        Fut: std::future::Future<Output = O>,
+    {
+        let mut fut = std::pin::pin!(fut);
+
+        let now = Instant::now();
+        // Whenever observe_guard gets called, or dropped,
+        // it adds the time elapsed since its last call to metrics.
+        // Last call is tracked in `now`.
+        let mut observe_guard = scopeguard::guard(
+            || {
+                let elapsed = now - self.base;
+                self.global_micros
+                    .inc_by(u64::try_from(elapsed.as_micros()).unwrap());
+                self.per_timeline_micros
+                    .inc_by(u64::try_from(elapsed.as_micros()).unwrap());
+                self.base = now;
+            },
+            |mut observe| {
+                observe();
+            },
+        );
+
+        loop {
+            match tokio::time::timeout(Duration::from_secs(10), &mut fut).await {
+                Ok(v) => return v,
+                Err(_timeout) => {
+                    (*observe_guard)();
+                }
+            }
+        }
     }
 }
 
@@ -1302,6 +1375,8 @@ pub(crate) struct SmgrQueryTimePerTimeline {
     per_timeline_getpage_latency: Histogram,
     global_batch_size: Histogram,
     per_timeline_batch_size: Histogram,
+    global_flush_in_progress_micros: IntCounter,
+    per_timeline_flush_in_progress_micros: IntCounter,
 }
 
 static SMGR_QUERY_STARTED_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
@@ -1464,6 +1539,26 @@ fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) {
         .set(value.try_into().unwrap());
 }
 
+static PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_page_service_pagestream_flush_in_progress_micros",
+        "Counter that sums up the microseconds that a pagestream response was being flushed into the TCP connection. \
+         If the flush is particularly slow, this counter will be updated periodically to make slow flushes \
+         easily discoverable in monitoring. \
+         Hence, this is NOT a completion latency historgram.",
+        &["tenant_id", "shard_id", "timeline_id"],
+    )
+    .expect("failed to define a metric")
+});
+
+static PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_page_service_pagestream_flush_in_progress_micros_global",
+        "Like pageserver_page_service_pagestream_flush_in_progress_seconds, but instance-wide.",
+    )
+    .expect("failed to define a metric")
+});
+
 impl SmgrQueryTimePerTimeline {
     pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
         let tenant_id = tenant_shard_id.tenant_id.to_string();
@@ -1504,6 +1599,12 @@ impl SmgrQueryTimePerTimeline {
             .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
             .unwrap();
 
+        let global_flush_in_progress_micros =
+            PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL.clone();
+        let per_timeline_flush_in_progress_micros = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS
+            .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
+            .unwrap();
+
         Self {
             global_started,
             global_latency,
@@ -1511,6 +1612,8 @@ impl SmgrQueryTimePerTimeline {
             per_timeline_getpage_started,
             global_batch_size,
             per_timeline_batch_size,
+            global_flush_in_progress_micros,
+            per_timeline_flush_in_progress_micros,
         }
     }
     pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, started_at: Instant) -> SmgrOpTimer {
@@ -1523,13 +1626,17 @@ impl SmgrQueryTimePerTimeline {
             None
         };
 
-        SmgrOpTimer {
+        SmgrOpTimer(Some(SmgrOpTimerInner {
             global_latency_histo: self.global_latency[op as usize].clone(),
             per_timeline_latency_histo,
             start: started_at,
             op,
             throttled: Duration::ZERO,
-        }
+            global_flush_in_progress_micros: self.global_flush_in_progress_micros.clone(),
+            per_timeline_flush_in_progress_micros: self
+                .per_timeline_flush_in_progress_micros
+                .clone(),
+        }))
     }
 
     pub(crate) fn observe_getpage_batch_start(&self, batch_size: usize) {
@@ -2777,6 +2884,11 @@ impl TimelineMetrics {
             shard_id,
             timeline_id,
         ]);
+        let _ = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS.remove_label_values(&[
+            tenant_id,
+            shard_id,
+            timeline_id,
+        ]);
     }
 }
 
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 7026df9527..97d94bbe7f 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -1017,10 +1017,8 @@ impl PageServerHandler {
         // Map handler result to protocol behavior.
         // Some handler errors cause exit from pagestream protocol.
         // Other handler errors are sent back as an error message and we stay in pagestream protocol.
-        let mut timers: smallvec::SmallVec<[_; 1]> =
-            smallvec::SmallVec::with_capacity(handler_results.len());
         for handler_result in handler_results {
-            let response_msg = match handler_result {
+            let (response_msg, timer) = match handler_result {
                 Err(e) => match &e {
                     PageStreamError::Shutdown => {
                         // If we fail to fulfil a request during shutdown, which may be _because_ of
@@ -1044,34 +1042,66 @@ impl PageServerHandler {
                         span.in_scope(|| {
                             error!("error reading relation or page version: {full:#}")
                         });
-                        PagestreamBeMessage::Error(PagestreamErrorResponse {
-                            message: e.to_string(),
-                        })
+                        (
+                            PagestreamBeMessage::Error(PagestreamErrorResponse {
+                                message: e.to_string(),
+                            }),
+                            None, // TODO: measure errors
+                        )
                     }
                 },
-                Ok((response_msg, timer)) => {
-                    // Extending the lifetime of the timers so observations on drop
-                    // include the flush time.
-                    timers.push(timer);
-                    response_msg
-                }
+                Ok((response_msg, timer)) => (response_msg, Some(timer)),
             };
 
+            //
             // marshal & transmit response message
+            //
+
             pgb_writer.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?;
-        }
-        tokio::select! {
-            biased;
-            _ = cancel.cancelled() => {
-                // We were requested to shut down.
-                info!("shutdown request received in page handler");
-                return Err(QueryError::Shutdown)
-            }
-            res = pgb_writer.flush() => {
-                res?;
+
+            // We purposefully don't count flush time into the timer.
+            //
+            // The reason is that current compute client will not perform protocol processing
+            // if the postgres backend process is doing things other than `->smgr_read()`.
+            // This is especially the case for prefetch.
+            //
+            // If the compute doesn't read from the connection, eventually TCP will backpressure
+            // all the way into our flush call below.
+            //
+            // The timer's underlying metric is used for a storage-internal latency SLO and
+            // we don't want to include latency in it that we can't control.
+            // And as pointed out above, in this case, we don't control the time that flush will take.
+            let flushing_timer =
+                timer.map(|timer| timer.observe_smgr_op_completion_and_start_flushing());
+
+            // what we want to do
+            let flush_fut = pgb_writer.flush();
+            // metric for how long flushing takes
+            let flush_fut = match flushing_timer {
+                Some(flushing_timer) => {
+                    futures::future::Either::Left(flushing_timer.measure(flush_fut))
+                }
+                None => futures::future::Either::Right(flush_fut),
+            };
+            // do it while respecting cancellation
+            let _: () = async move {
+                tokio::select! {
+                    biased;
+                    _ = cancel.cancelled() => {
+                        // We were requested to shut down.
+                        info!("shutdown request received in page handler");
+                        return Err(QueryError::Shutdown)
+                    }
+                    res = flush_fut => {
+                        res?;
+                    }
+                }
+                Ok(())
             }
+            // and log the info! line inside the request span
+            .instrument(span.clone())
+            .await?;
         }
-        drop(timers);
         Ok(())
     }
 
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index 52ed7da36b..a591e088ef 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -176,6 +176,7 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
     counter("pageserver_tenant_throttling_wait_usecs_sum"),
     counter("pageserver_tenant_throttling_count"),
     counter("pageserver_timeline_wal_records_received"),
+    counter("pageserver_page_service_pagestream_flush_in_progress_micros"),
     *histogram("pageserver_page_service_batch_size"),
     *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
     # "pageserver_directory_entries_count", -- only used if above a certain threshold

From ec790870d54aadd1ecc6e431c9049b489ba33cd1 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Sat, 7 Dec 2024 13:05:09 +0000
Subject: [PATCH 085/117] storcon: automatically clear Pause/Stop scheduling
 policies to enable detaches (#10011)

## Problem

We saw a tenant get stuck when it had been put into Pause scheduling
mode to pin it to a pageserver, then it was left idle for a while and
the control plane tried to detach it.

Close: https://github.com/neondatabase/neon/issues/9957

## Summary of changes

- When changing policy to Detached or Secondary, set the scheduling
policy to Active.
- Add a test that exercises this
- When persisting tenant shards, set their `generation_pageserver` to
null if the placement policy is not Attached (this enables consistency
checks to work, and avoids leaving state in the DB that could be
confusing/misleading in future)
---
 libs/pageserver_api/src/controller_api.rs     | 11 ++++
 storage_controller/src/persistence.rs         |  9 ++++
 storage_controller/src/service.rs             | 39 +++++++++++++-
 .../regress/test_storage_controller.py        | 52 +++++++++++++++++++
 4 files changed, 109 insertions(+), 2 deletions(-)

diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs
index 9a5ebc95bd..6839ef69f5 100644
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -245,6 +245,17 @@ impl From<NodeAvailability> for NodeAvailabilityWrapper {
     }
 }
 
+/// Scheduling policy enables us to selectively disable some automatic actions that the
+/// controller performs on a tenant shard. This is only set to a non-default value by
+/// human intervention, and it is reset to the default value (Active) when the tenant's
+/// placement policy is modified away from Attached.
+///
+/// The typical use of a non-Active scheduling policy is one of:
+/// - Pinnning a shard to a node (i.e. migrating it there & setting a non-Active scheduling policy)
+/// - Working around a bug (e.g. if something is flapping and we need to stop it until the bug is fixed)
+///
+/// If you're not sure which policy to use to pin a shard to its current location, you probably
+/// want Pause.
 #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
 pub enum ShardSchedulingPolicy {
     // Normal mode: the tenant's scheduled locations may be updated at will, including
diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs
index 14cc51240d..7ca80c7dfe 100644
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -636,6 +636,13 @@ impl Persistence {
                     .into_boxed(),
             };
 
+            // Clear generation_pageserver if we are moving into a state where we won't have
+            // any attached pageservers.
+            let input_generation_pageserver = match input_placement_policy {
+                None | Some(PlacementPolicy::Attached(_)) => None,
+                Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) => Some(None),
+            };
+
             #[derive(AsChangeset)]
             #[diesel(table_name = crate::schema::tenant_shards)]
             struct ShardUpdate {
@@ -643,6 +650,7 @@ impl Persistence {
                 placement_policy: Option<String>,
                 config: Option<String>,
                 scheduling_policy: Option<String>,
+                generation_pageserver: Option<Option<i64>>,
             }
 
             let update = ShardUpdate {
@@ -655,6 +663,7 @@ impl Persistence {
                     .map(|c| serde_json::to_string(&c).unwrap()),
                 scheduling_policy: input_scheduling_policy
                     .map(|p| serde_json::to_string(&p).unwrap()),
+                generation_pageserver: input_generation_pageserver,
             };
 
             query.set(update).execute(conn)?;
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 083c78233a..7e4ee53b4c 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -513,6 +513,9 @@ struct ShardUpdate {
 
     /// If this is None, generation is not updated.
     generation: Option<Generation>,
+
+    /// If this is None, scheduling policy is not updated.
+    scheduling_policy: Option<ShardSchedulingPolicy>,
 }
 
 enum StopReconciliationsReason {
@@ -2376,6 +2379,23 @@ impl Service {
             }
         };
 
+        // Ordinarily we do not update scheduling policy, but when making major changes
+        // like detaching or demoting to secondary-only, we need to force the scheduling
+        // mode to Active, or the caller's expected outcome (detach it) will not happen.
+        let scheduling_policy = match req.config.mode {
+            LocationConfigMode::Detached | LocationConfigMode::Secondary => {
+                // Special case: when making major changes like detaching or demoting to secondary-only,
+                // we need to force the scheduling mode to Active, or nothing will happen.
+                Some(ShardSchedulingPolicy::Active)
+            }
+            LocationConfigMode::AttachedMulti
+            | LocationConfigMode::AttachedSingle
+            | LocationConfigMode::AttachedStale => {
+                // While attached, continue to respect whatever the existing scheduling mode is.
+                None
+            }
+        };
+
         let mut create = true;
         for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
             // Saw an existing shard: this is not a creation
@@ -2401,6 +2421,7 @@ impl Service {
                 placement_policy: placement_policy.clone(),
                 tenant_config: req.config.tenant_conf.clone(),
                 generation: set_generation,
+                scheduling_policy,
             });
         }
 
@@ -2497,6 +2518,7 @@ impl Service {
                     placement_policy,
                     tenant_config,
                     generation,
+                    scheduling_policy,
                 } in &updates
                 {
                     self.persistence
@@ -2505,7 +2527,7 @@ impl Service {
                             Some(placement_policy.clone()),
                             Some(tenant_config.clone()),
                             *generation,
-                            None,
+                            *scheduling_policy,
                         )
                         .await?;
                 }
@@ -2521,6 +2543,7 @@ impl Service {
                         placement_policy,
                         tenant_config,
                         generation: update_generation,
+                        scheduling_policy,
                     } in updates
                     {
                         let Some(shard) = tenants.get_mut(&tenant_shard_id) else {
@@ -2539,6 +2562,10 @@ impl Service {
                             shard.generation = Some(generation);
                         }
 
+                        if let Some(scheduling_policy) = scheduling_policy {
+                            shard.set_scheduling_policy(scheduling_policy);
+                        }
+
                         shard.schedule(scheduler, &mut schedule_context)?;
 
                         let maybe_waiter = self.maybe_reconcile_shard(shard, nodes);
@@ -2992,9 +3019,17 @@ impl Service {
 
         let TenantPolicyRequest {
             placement,
-            scheduling,
+            mut scheduling,
         } = req;
 
+        if let Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) = placement {
+            // When someone configures a tenant to detach, we force the scheduling policy to enable
+            // this to take effect.
+            if scheduling.is_none() {
+                scheduling = Some(ShardSchedulingPolicy::Active);
+            }
+        }
+
         self.persistence
             .update_tenant_shard(
                 TenantFilter::Tenant(tenant_id),
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index f878116d53..9f74dcccb9 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -3230,3 +3230,55 @@ def test_multi_attached_timeline_creation(neon_env_builder: NeonEnvBuilder, migr
         # Always disable 'pause' failpoints, even on failure, to avoid hanging in shutdown
         env.storage_controller.configure_failpoints((migration_failpoint.value, "off"))
         raise
+
+
+@run_only_on_default_postgres("Postgres version makes no difference here")
+def test_storage_controller_detached_stopped(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    Test that detaching a tenant while it has scheduling policy set to Paused or Stop works
+    """
+
+    remote_storage_kind = s3_storage()
+    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
+
+    neon_env_builder.num_pageservers = 1
+
+    env = neon_env_builder.init_configs()
+    env.start()
+    virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
+
+    tenant_id = TenantId.generate()
+    env.storage_controller.tenant_create(
+        tenant_id,
+        shard_count=1,
+    )
+
+    assert len(env.pageserver.http_client().tenant_list_locations()["tenant_shards"]) == 1
+
+    # Disable scheduling: ordinarily this would prevent the tenant's configuration being
+    # reconciled to pageservers, but this should be overridden when detaching.
+    env.storage_controller.allowed_errors.append(".*Scheduling is disabled by policy.*")
+    env.storage_controller.tenant_policy_update(
+        tenant_id,
+        {"scheduling": "Stop"},
+    )
+
+    env.storage_controller.consistency_check()
+
+    # Detach the tenant
+    virtual_ps_http.tenant_location_conf(
+        tenant_id,
+        {
+            "mode": "Detached",
+            "secondary_conf": None,
+            "tenant_conf": {},
+            "generation": None,
+        },
+    )
+
+    env.storage_controller.consistency_check()
+
+    # Confirm the detach happened
+    assert env.pageserver.http_client().tenant_list_locations()["tenant_shards"] == []

From 9d425b54f72f7baa23f63c09c464d18c10304fa8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Mon, 9 Dec 2024 13:46:59 +0100
Subject: [PATCH 086/117] Update AWS SDK crates (#10056)

Result of running:

cargo update -p aws-types -p aws-sigv4 -p aws-credential-types -p
aws-smithy-types -p aws-smithy-async -p aws-sdk-kms -p aws-sdk-iam -p
aws-sdk-s3 -p aws-config

We want to keep the AWS SDK up to date as that way we benefit from new
developments and improvements.
---
 Cargo.lock | 77 +++++++++++++++++++++++++++++-------------------------
 1 file changed, 42 insertions(+), 35 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index f6e0024d87..a25c7585bb 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -284,9 +284,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
 
 [[package]]
 name = "aws-config"
-version = "1.5.1"
+version = "1.5.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2ac9889352d632214df943e26740c46a0f3da6e329fbd28164fe7ae1b061da7b"
+checksum = "9b49afaa341e8dd8577e1a2200468f98956d6eda50bcf4a53246cc00174ba924"
 dependencies = [
  "aws-credential-types",
  "aws-runtime",
@@ -295,7 +295,7 @@ dependencies = [
  "aws-sdk-sts",
  "aws-smithy-async",
  "aws-smithy-http",
- "aws-smithy-json",
+ "aws-smithy-json 0.60.7",
  "aws-smithy-runtime",
  "aws-smithy-runtime-api",
  "aws-smithy-types",
@@ -304,7 +304,6 @@ dependencies = [
  "fastrand 2.2.0",
  "hex",
  "http 0.2.9",
- "hyper 0.14.30",
  "ring",
  "time",
  "tokio",
@@ -327,9 +326,9 @@ dependencies = [
 
 [[package]]
 name = "aws-runtime"
-version = "1.4.3"
+version = "1.4.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a10d5c055aa540164d9561a0e2e74ad30f0dcf7393c3a92f6733ddf9c5762468"
+checksum = "b5ac934720fbb46206292d2c75b57e67acfc56fe7dfd34fb9a02334af08409ea"
 dependencies = [
  "aws-credential-types",
  "aws-sigv4",
@@ -353,15 +352,15 @@ dependencies = [
 
 [[package]]
 name = "aws-sdk-iam"
-version = "1.46.0"
+version = "1.53.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "053df3024ea2ed0431359b3cddecc92dcfadeaedf71dd497292b39e37e597b46"
+checksum = "fb8a6fea8d335cde419176b1f2c6d2d6e97997719e7df4b51e59064310f48e4a"
 dependencies = [
  "aws-credential-types",
  "aws-runtime",
  "aws-smithy-async",
  "aws-smithy-http",
- "aws-smithy-json",
+ "aws-smithy-json 0.61.1",
  "aws-smithy-query",
  "aws-smithy-runtime",
  "aws-smithy-runtime-api",
@@ -376,15 +375,15 @@ dependencies = [
 
 [[package]]
 name = "aws-sdk-kms"
-version = "1.47.0"
+version = "1.51.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "564a597a3c71a957d60a2e4c62c93d78ee5a0d636531e15b760acad983a5c18e"
+checksum = "3c30f6fd5646b99d9b45ec3a0c22e67112c175b2383100c960d7ee39d96c8d96"
 dependencies = [
  "aws-credential-types",
  "aws-runtime",
  "aws-smithy-async",
  "aws-smithy-http",
- "aws-smithy-json",
+ "aws-smithy-json 0.61.1",
  "aws-smithy-runtime",
  "aws-smithy-runtime-api",
  "aws-smithy-types",
@@ -398,11 +397,10 @@ dependencies = [
 
 [[package]]
 name = "aws-sdk-s3"
-version = "1.52.0"
+version = "1.65.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f571deb0a80c20d21d9f3e8418c1712af9ff4bf399d057e5549a934eca4844e2"
+checksum = "d3ba2c5c0f2618937ce3d4a5ad574b86775576fa24006bcb3128c6e2cbf3c34e"
 dependencies = [
- "ahash",
  "aws-credential-types",
  "aws-runtime",
  "aws-sigv4",
@@ -410,7 +408,7 @@ dependencies = [
  "aws-smithy-checksums",
  "aws-smithy-eventstream",
  "aws-smithy-http",
- "aws-smithy-json",
+ "aws-smithy-json 0.61.1",
  "aws-smithy-runtime",
  "aws-smithy-runtime-api",
  "aws-smithy-types",
@@ -433,15 +431,15 @@ dependencies = [
 
 [[package]]
 name = "aws-sdk-sso"
-version = "1.30.0"
+version = "1.50.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ebb97e44983752cf7e12968c5f569a5d7562dbbc67006755c331d9d9c99580ae"
+checksum = "05ca43a4ef210894f93096039ef1d6fa4ad3edfabb3be92b80908b9f2e4b4eab"
 dependencies = [
  "aws-credential-types",
  "aws-runtime",
  "aws-smithy-async",
  "aws-smithy-http",
- "aws-smithy-json",
+ "aws-smithy-json 0.61.1",
  "aws-smithy-runtime",
  "aws-smithy-runtime-api",
  "aws-smithy-types",
@@ -455,15 +453,15 @@ dependencies = [
 
 [[package]]
 name = "aws-sdk-ssooidc"
-version = "1.31.0"
+version = "1.51.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ad061d977235898e4a97ecbd5d882786cca41b4828943584dc792dcc35eb3d3c"
+checksum = "abaf490c2e48eed0bb8e2da2fb08405647bd7f253996e0f93b981958ea0f73b0"
 dependencies = [
  "aws-credential-types",
  "aws-runtime",
  "aws-smithy-async",
  "aws-smithy-http",
- "aws-smithy-json",
+ "aws-smithy-json 0.61.1",
  "aws-smithy-runtime",
  "aws-smithy-runtime-api",
  "aws-smithy-types",
@@ -477,15 +475,15 @@ dependencies = [
 
 [[package]]
 name = "aws-sdk-sts"
-version = "1.30.0"
+version = "1.51.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "300ce43d1f7f4eb023e57d38b0921d964e8e62bed7f82f6b7849e7eab7a14575"
+checksum = "b68fde0d69c8bfdc1060ea7da21df3e39f6014da316783336deff0a9ec28f4bf"
 dependencies = [
  "aws-credential-types",
  "aws-runtime",
  "aws-smithy-async",
  "aws-smithy-http",
- "aws-smithy-json",
+ "aws-smithy-json 0.61.1",
  "aws-smithy-query",
  "aws-smithy-runtime",
  "aws-smithy-runtime-api",
@@ -500,9 +498,9 @@ dependencies = [
 
 [[package]]
 name = "aws-sigv4"
-version = "1.2.4"
+version = "1.2.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cc8db6904450bafe7473c6ca9123f88cc11089e41a025408f992db4e22d3be68"
+checksum = "7d3820e0c08d0737872ff3c7c1f21ebbb6693d832312d6152bf18ef50a5471c2"
 dependencies = [
  "aws-credential-types",
  "aws-smithy-eventstream",
@@ -540,9 +538,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-checksums"
-version = "0.60.12"
+version = "0.60.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "598b1689d001c4d4dc3cb386adb07d37786783aee3ac4b324bcadac116bf3d23"
+checksum = "ba1a71073fca26775c8b5189175ea8863afb1c9ea2cceb02a5de5ad9dfbaa795"
 dependencies = [
  "aws-smithy-http",
  "aws-smithy-types",
@@ -600,6 +598,15 @@ dependencies = [
  "aws-smithy-types",
 ]
 
+[[package]]
+name = "aws-smithy-json"
+version = "0.61.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ee4e69cc50921eb913c6b662f8d909131bb3e6ad6cb6090d3a39b66fc5c52095"
+dependencies = [
+ "aws-smithy-types",
+]
+
 [[package]]
 name = "aws-smithy-query"
 version = "0.60.7"
@@ -612,9 +619,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-runtime"
-version = "1.7.2"
+version = "1.7.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a065c0fe6fdbdf9f11817eb68582b2ab4aff9e9c39e986ae48f7ec576c6322db"
+checksum = "9f20685047ca9d6f17b994a07f629c813f08b5bce65523e47124879e60103d45"
 dependencies = [
  "aws-smithy-async",
  "aws-smithy-http",
@@ -639,9 +646,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-runtime-api"
-version = "1.7.2"
+version = "1.7.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e086682a53d3aa241192aa110fa8dfce98f2f5ac2ead0de84d41582c7e8fdb96"
+checksum = "92165296a47a812b267b4f41032ff8069ab7ff783696d217f0994a0d7ab585cd"
 dependencies = [
  "aws-smithy-async",
  "aws-smithy-types",
@@ -656,9 +663,9 @@ dependencies = [
 
 [[package]]
 name = "aws-smithy-types"
-version = "1.2.7"
+version = "1.2.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "147100a7bea70fa20ef224a6bad700358305f5dc0f84649c53769761395b355b"
+checksum = "4fbd94a32b3a7d55d3806fe27d98d3ad393050439dd05eb53ece36ec5e3d3510"
 dependencies = [
  "base64-simd",
  "bytes",

From 4cca5cdb12a6c3c957163931c89a376b5f47f17b Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Mon, 9 Dec 2024 14:57:42 +0000
Subject: [PATCH 087/117] deps: update url to 2.5.4 for RUSTSEC-2024-0421
 (#10059)

## Problem

See https://rustsec.org/advisories/RUSTSEC-2024-0421

## Summary of changes

Update url crate to 2.5.4.
---
 Cargo.lock                | 287 ++++++++++++++++++++++++++++++++++----
 deny.toml                 |   1 +
 workspace_hack/Cargo.toml |   3 +
 3 files changed, 267 insertions(+), 24 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index a25c7585bb..e9004748ae 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2131,9 +2131,9 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
 
 [[package]]
 name = "form_urlencoded"
-version = "1.1.0"
+version = "1.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a9c384f161156f5260c24a097c56119f9be8c798586aecc13afbcbe7b7e26bf8"
+checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456"
 dependencies = [
  "percent-encoding",
 ]
@@ -2754,6 +2754,124 @@ dependencies = [
  "cc",
 ]
 
+[[package]]
+name = "icu_collections"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526"
+dependencies = [
+ "displaydoc",
+ "yoke",
+ "zerofrom",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_locid"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637"
+dependencies = [
+ "displaydoc",
+ "litemap",
+ "tinystr",
+ "writeable",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_locid_transform"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e"
+dependencies = [
+ "displaydoc",
+ "icu_locid",
+ "icu_locid_transform_data",
+ "icu_provider",
+ "tinystr",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_locid_transform_data"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e"
+
+[[package]]
+name = "icu_normalizer"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f"
+dependencies = [
+ "displaydoc",
+ "icu_collections",
+ "icu_normalizer_data",
+ "icu_properties",
+ "icu_provider",
+ "smallvec",
+ "utf16_iter",
+ "utf8_iter",
+ "write16",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_normalizer_data"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516"
+
+[[package]]
+name = "icu_properties"
+version = "1.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "93d6020766cfc6302c15dbbc9c8778c37e62c14427cb7f6e601d849e092aeef5"
+dependencies = [
+ "displaydoc",
+ "icu_collections",
+ "icu_locid_transform",
+ "icu_properties_data",
+ "icu_provider",
+ "tinystr",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_properties_data"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569"
+
+[[package]]
+name = "icu_provider"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9"
+dependencies = [
+ "displaydoc",
+ "icu_locid",
+ "icu_provider_macros",
+ "stable_deref_trait",
+ "tinystr",
+ "writeable",
+ "yoke",
+ "zerofrom",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_provider_macros"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.90",
+]
+
 [[package]]
 name = "ident_case"
 version = "1.0.1"
@@ -2762,12 +2880,23 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
 
 [[package]]
 name = "idna"
-version = "0.3.0"
+version = "1.0.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e14ddfc70884202db2244c223200c204c2bda1bc6e0998d11b5e024d657209e6"
+checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e"
 dependencies = [
- "unicode-bidi",
- "unicode-normalization",
+ "idna_adapter",
+ "smallvec",
+ "utf8_iter",
+]
+
+[[package]]
+name = "idna_adapter"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "daca1df1c957320b2cf139ac61e7bd64fed304c5040df000a745aa1de3b4ef71"
+dependencies = [
+ "icu_normalizer",
+ "icu_properties",
 ]
 
 [[package]]
@@ -3076,6 +3205,12 @@ version = "0.6.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f0b5399f6804fbab912acbd8878ed3532d506b7c951b8f9f164ef90fef39e3f4"
 
+[[package]]
+name = "litemap"
+version = "0.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104"
+
 [[package]]
 name = "lock_api"
 version = "0.4.10"
@@ -4044,9 +4179,9 @@ dependencies = [
 
 [[package]]
 name = "percent-encoding"
-version = "2.2.0"
+version = "2.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e"
+checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
 
 [[package]]
 name = "petgraph"
@@ -4656,9 +4791,9 @@ dependencies = [
 
 [[package]]
 name = "quote"
-version = "1.0.35"
+version = "1.0.37"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef"
+checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
 dependencies = [
  "proc-macro2",
 ]
@@ -5706,9 +5841,9 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
 
 [[package]]
 name = "serde"
-version = "1.0.203"
+version = "1.0.215"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094"
+checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f"
 dependencies = [
  "serde_derive",
 ]
@@ -5725,9 +5860,9 @@ dependencies = [
 
 [[package]]
 name = "serde_derive"
-version = "1.0.203"
+version = "1.0.215"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba"
+checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -6469,6 +6604,16 @@ dependencies = [
  "crunchy",
 ]
 
+[[package]]
+name = "tinystr"
+version = "0.7.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f"
+dependencies = [
+ "displaydoc",
+ "zerovec",
+]
+
 [[package]]
 name = "tinytemplate"
 version = "1.2.1"
@@ -6481,9 +6626,9 @@ dependencies = [
 
 [[package]]
 name = "tinyvec"
-version = "1.6.0"
+version = "1.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50"
+checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938"
 dependencies = [
  "tinyvec_macros",
 ]
@@ -6997,21 +7142,21 @@ dependencies = [
 
 [[package]]
 name = "unicode-bidi"
-version = "0.3.13"
+version = "0.3.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460"
+checksum = "5ab17db44d7388991a428b2ee655ce0c212e862eff1768a455c58f9aad6e7893"
 
 [[package]]
 name = "unicode-ident"
-version = "1.0.9"
+version = "1.0.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0"
+checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"
 
 [[package]]
 name = "unicode-normalization"
-version = "0.1.22"
+version = "0.1.24"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921"
+checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956"
 dependencies = [
  "tinyvec",
 ]
@@ -7062,9 +7207,9 @@ dependencies = [
 
 [[package]]
 name = "url"
-version = "2.3.1"
+version = "2.5.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0d68c799ae75762b8c3fe375feb6600ef5602c883c5d21eb51c09f22b83c4643"
+checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60"
 dependencies = [
  "form_urlencoded",
  "idna",
@@ -7084,6 +7229,18 @@ version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
 
+[[package]]
+name = "utf16_iter"
+version = "1.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246"
+
+[[package]]
+name = "utf8_iter"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
+
 [[package]]
 name = "utf8parse"
 version = "0.2.1"
@@ -7676,8 +7833,10 @@ dependencies = [
  "der 0.7.8",
  "deranged",
  "digest",
+ "displaydoc",
  "either",
  "fail",
+ "form_urlencoded",
  "futures-channel",
  "futures-executor",
  "futures-io",
@@ -7726,6 +7885,7 @@ dependencies = [
  "signature 2.2.0",
  "smallvec",
  "spki 0.7.3",
+ "stable_deref_trait",
  "subtle",
  "syn 2.0.90",
  "sync_wrapper 0.1.2",
@@ -7750,6 +7910,18 @@ dependencies = [
  "zstd-sys",
 ]
 
+[[package]]
+name = "write16"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936"
+
+[[package]]
+name = "writeable"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51"
+
 [[package]]
 name = "x509-certificate"
 version = "0.23.1"
@@ -7810,6 +7982,30 @@ dependencies = [
  "time",
 ]
 
+[[package]]
+name = "yoke"
+version = "0.7.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40"
+dependencies = [
+ "serde",
+ "stable_deref_trait",
+ "yoke-derive",
+ "zerofrom",
+]
+
+[[package]]
+name = "yoke-derive"
+version = "0.7.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.90",
+ "synstructure",
+]
+
 [[package]]
 name = "zerocopy"
 version = "0.7.31"
@@ -7831,6 +8027,27 @@ dependencies = [
  "syn 2.0.90",
 ]
 
+[[package]]
+name = "zerofrom"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cff3ee08c995dee1859d998dea82f7374f2826091dd9cd47def953cae446cd2e"
+dependencies = [
+ "zerofrom-derive",
+]
+
+[[package]]
+name = "zerofrom-derive"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.90",
+ "synstructure",
+]
+
 [[package]]
 name = "zeroize"
 version = "1.7.0"
@@ -7852,6 +8069,28 @@ dependencies = [
  "syn 2.0.90",
 ]
 
+[[package]]
+name = "zerovec"
+version = "0.10.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079"
+dependencies = [
+ "yoke",
+ "zerofrom",
+ "zerovec-derive",
+]
+
+[[package]]
+name = "zerovec-derive"
+version = "0.10.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.90",
+]
+
 [[package]]
 name = "zstd"
 version = "0.13.0"
diff --git a/deny.toml b/deny.toml
index 7a1eecac99..ff8d71cda5 100644
--- a/deny.toml
+++ b/deny.toml
@@ -42,6 +42,7 @@ allow = [
     "MPL-2.0",
     "OpenSSL",
     "Unicode-DFS-2016",
+    "Unicode-3.0",
 ]
 confidence-threshold = 0.8
 exceptions = [
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index d19379aefd..33bdc25785 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -33,6 +33,7 @@ deranged = { version = "0.3", default-features = false, features = ["powerfmt",
 digest = { version = "0.10", features = ["mac", "oid", "std"] }
 either = { version = "1" }
 fail = { version = "0.5", default-features = false, features = ["failpoints"] }
+form_urlencoded = { version = "1" }
 futures-channel = { version = "0.3", features = ["sink"] }
 futures-executor = { version = "0.3" }
 futures-io = { version = "0.3" }
@@ -78,6 +79,7 @@ sha2 = { version = "0.10", features = ["asm", "oid"] }
 signature = { version = "2", default-features = false, features = ["digest", "rand_core", "std"] }
 smallvec = { version = "1", default-features = false, features = ["const_new", "write"] }
 spki = { version = "0.7", default-features = false, features = ["pem", "std"] }
+stable_deref_trait = { version = "1" }
 subtle = { version = "2" }
 sync_wrapper = { version = "0.1", default-features = false, features = ["futures"] }
 tikv-jemalloc-ctl = { version = "0.6", features = ["stats", "use_std"] }
@@ -105,6 +107,7 @@ anyhow = { version = "1", features = ["backtrace"] }
 bytes = { version = "1", features = ["serde"] }
 cc = { version = "1", default-features = false, features = ["parallel"] }
 chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] }
+displaydoc = { version = "0.2" }
 either = { version = "1" }
 getrandom = { version = "0.2", default-features = false, features = ["std"] }
 half = { version = "2", default-features = false, features = ["num-traits"] }

From e74e7aac936543e5fcb9d122bedc2d146e41ddb8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Mon, 9 Dec 2024 16:50:06 +0100
Subject: [PATCH 088/117] Use updated patched azure SDK crates (#10036)

For a while already, we've been unable to update the Azure SDK crates
due to Azure adopting use of a non-tokio async runtime, see #7545.

The effort to upstream the fix got stalled, and I think it's better to
switch to a patched version of the SDK that is up to date.

Now we have a fork of the SDK under the neondatabase github org, to
which I have applied Conrad's rebased patches to:
https://github.com/neondatabase/azure-sdk-for-rust/tree/neon .

The existence of a fork will also help with shipping bulk delete support
before it's upstreamed (#7931).

Also, in related news, the Azure SDK has gotten a rift in development,
where the main branch pertains to a future, to-be-officially-blessed
release of the SDK, and the older versions, which we are currently
using, are on the `legacy` branch. Upstream doesn't really want patches
for the `legacy` branch any more, they want to focus on the `main`
efforts. However, even then, the `legacy` branch is still newer than
what we are having right now, so let's switch to `legacy` for now.

Depending on how long it takes, we can switch to the official version of
the SDK once it's released or switch to the upstream `main` branch if
there is changes we want before that.

As a nice side effect of this PR, we now use reqwest 0.12 everywhere,
dropping the dependency on version 0.11.

Fixes #7545
---
 Cargo.lock                            | 151 +++++++-------------------
 Cargo.toml                            |  10 +-
 libs/remote_storage/src/azure_blob.rs |   8 +-
 3 files changed, 47 insertions(+), 122 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index e9004748ae..e2d5e03613 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -770,77 +770,74 @@ dependencies = [
 
 [[package]]
 name = "azure_core"
-version = "0.19.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "70fd680c0d0424a518229b1150922f92653ba2ac933aa000abc8bf1ca08105f7"
+version = "0.21.0"
+source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d"
 dependencies = [
  "async-trait",
- "base64 0.21.1",
+ "base64 0.22.1",
  "bytes",
  "dyn-clone",
  "futures",
  "getrandom 0.2.11",
  "hmac",
  "http-types",
- "log",
  "once_cell",
  "paste",
  "pin-project",
  "quick-xml 0.31.0",
  "rand 0.8.5",
- "reqwest 0.11.19",
+ "reqwest",
  "rustc_version",
  "serde",
  "serde_json",
  "sha2",
  "time",
+ "tracing",
  "url",
  "uuid",
 ]
 
 [[package]]
 name = "azure_identity"
-version = "0.19.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a6d2060f5b2e1c664026ca4edd561306c473be887c1f7a81f10bf06f9b71c63f"
+version = "0.21.0"
+source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d"
 dependencies = [
  "async-lock",
  "async-trait",
  "azure_core",
  "futures",
- "log",
  "oauth2",
  "pin-project",
  "serde",
  "time",
+ "tokio",
+ "tracing",
  "url",
  "uuid",
 ]
 
 [[package]]
 name = "azure_storage"
-version = "0.19.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "15d3da73bfa09350e1bd6ae2a260806fcf90048c7e78cd2d8f88be60b19a7266"
+version = "0.21.0"
+source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d"
 dependencies = [
  "RustyXML",
  "async-lock",
  "async-trait",
  "azure_core",
  "bytes",
- "log",
  "serde",
  "serde_derive",
  "time",
+ "tracing",
  "url",
  "uuid",
 ]
 
 [[package]]
 name = "azure_storage_blobs"
-version = "0.19.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "149c21834a4105d761e3dd33d91c2a3064acc05a3c978848ea8089102ae45c94"
+version = "0.21.0"
+source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d"
 dependencies = [
  "RustyXML",
  "azure_core",
@@ -848,20 +845,19 @@ dependencies = [
  "azure_svc_blobstorage",
  "bytes",
  "futures",
- "log",
  "serde",
  "serde_derive",
  "serde_json",
  "time",
+ "tracing",
  "url",
  "uuid",
 ]
 
 [[package]]
 name = "azure_svc_blobstorage"
-version = "0.19.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "88c888b7bf522d5405218b8613bf0fae7ddaae6ef3bf4ad42ae005993c96ab8b"
+version = "0.21.0"
+source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#66e77bdd87bf87e773acf3b0c84b532c1124367d"
 dependencies = [
  "azure_core",
  "bytes",
@@ -1287,7 +1283,7 @@ dependencies = [
  "prometheus",
  "regex",
  "remote_storage",
- "reqwest 0.12.4",
+ "reqwest",
  "rlimit",
  "rust-ini",
  "serde",
@@ -1395,7 +1391,7 @@ dependencies = [
  "postgres_backend",
  "postgres_connection",
  "regex",
- "reqwest 0.12.4",
+ "reqwest",
  "safekeeper_api",
  "scopeguard",
  "serde",
@@ -1904,15 +1900,6 @@ dependencies = [
  "zeroize",
 ]
 
-[[package]]
-name = "encoding_rs"
-version = "0.8.32"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "071a31f4ee85403370b58aca746f01041ede6f0da2730960ad001edc2b71b394"
-dependencies = [
- "cfg-if",
-]
-
 [[package]]
 name = "enum-map"
 version = "2.5.0"
@@ -3709,7 +3696,7 @@ dependencies = [
  "bytes",
  "http 1.1.0",
  "opentelemetry",
- "reqwest 0.12.4",
+ "reqwest",
 ]
 
 [[package]]
@@ -3726,7 +3713,7 @@ dependencies = [
  "opentelemetry-proto",
  "opentelemetry_sdk",
  "prost",
- "reqwest 0.12.4",
+ "reqwest",
  "thiserror",
 ]
 
@@ -3935,7 +3922,7 @@ dependencies = [
  "range-set-blaze",
  "regex",
  "remote_storage",
- "reqwest 0.12.4",
+ "reqwest",
  "rpds",
  "scopeguard",
  "send-future",
@@ -3988,7 +3975,7 @@ dependencies = [
  "postgres_ffi",
  "rand 0.8.5",
  "remote_storage",
- "reqwest 0.12.4",
+ "reqwest",
  "serde",
  "serde_json",
  "serde_with",
@@ -4008,7 +3995,7 @@ dependencies = [
  "futures",
  "pageserver_api",
  "postgres",
- "reqwest 0.12.4",
+ "reqwest",
  "serde",
  "thiserror",
  "tokio",
@@ -4725,7 +4712,7 @@ dependencies = [
  "redis",
  "regex",
  "remote_storage",
- "reqwest 0.12.4",
+ "reqwest",
  "reqwest-middleware",
  "reqwest-retry",
  "reqwest-tracing",
@@ -5088,47 +5075,6 @@ dependencies = [
  "utils",
 ]
 
-[[package]]
-name = "reqwest"
-version = "0.11.19"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "20b9b67e2ca7dd9e9f9285b759de30ff538aab981abaaf7bc9bd90b84a0126c3"
-dependencies = [
- "base64 0.21.1",
- "bytes",
- "encoding_rs",
- "futures-core",
- "futures-util",
- "h2 0.3.26",
- "http 0.2.9",
- "http-body 0.4.5",
- "hyper 0.14.30",
- "hyper-rustls 0.24.0",
- "ipnet",
- "js-sys",
- "log",
- "mime",
- "once_cell",
- "percent-encoding",
- "pin-project-lite",
- "rustls 0.21.12",
- "rustls-pemfile 1.0.2",
- "serde",
- "serde_json",
- "serde_urlencoded",
- "tokio",
- "tokio-rustls 0.24.0",
- "tokio-util",
- "tower-service",
- "url",
- "wasm-bindgen",
- "wasm-bindgen-futures",
- "wasm-streams 0.3.0",
- "web-sys",
- "webpki-roots 0.25.2",
- "winreg 0.50.0",
-]
-
 [[package]]
 name = "reqwest"
 version = "0.12.4"
@@ -5168,10 +5114,10 @@ dependencies = [
  "url",
  "wasm-bindgen",
  "wasm-bindgen-futures",
- "wasm-streams 0.4.0",
+ "wasm-streams",
  "web-sys",
  "webpki-roots 0.26.1",
- "winreg 0.52.0",
+ "winreg",
 ]
 
 [[package]]
@@ -5183,7 +5129,7 @@ dependencies = [
  "anyhow",
  "async-trait",
  "http 1.1.0",
- "reqwest 0.12.4",
+ "reqwest",
  "serde",
  "thiserror",
  "tower-service",
@@ -5202,7 +5148,7 @@ dependencies = [
  "http 1.1.0",
  "hyper 1.4.1",
  "parking_lot 0.11.2",
- "reqwest 0.12.4",
+ "reqwest",
  "reqwest-middleware",
  "retry-policies",
  "thiserror",
@@ -5223,7 +5169,7 @@ dependencies = [
  "http 1.1.0",
  "matchit 0.8.2",
  "opentelemetry",
- "reqwest 0.12.4",
+ "reqwest",
  "reqwest-middleware",
  "tracing",
  "tracing-opentelemetry",
@@ -5587,7 +5533,7 @@ dependencies = [
  "rand 0.8.5",
  "regex",
  "remote_storage",
- "reqwest 0.12.4",
+ "reqwest",
  "safekeeper_api",
  "scopeguard",
  "sd-notify",
@@ -5743,7 +5689,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "00421ed8fa0c995f07cde48ba6c89e80f2b312f74ff637326f392fbfd23abe02"
 dependencies = [
  "httpdate",
- "reqwest 0.12.4",
+ "reqwest",
  "rustls 0.21.12",
  "sentry-backtrace",
  "sentry-contexts",
@@ -6198,7 +6144,7 @@ dependencies = [
  "postgres_connection",
  "r2d2",
  "rand 0.8.5",
- "reqwest 0.12.4",
+ "reqwest",
  "routerify",
  "scopeguard",
  "serde",
@@ -6218,7 +6164,7 @@ name = "storage_controller_client"
 version = "0.1.0"
 dependencies = [
  "pageserver_client",
- "reqwest 0.12.4",
+ "reqwest",
  "serde",
  "workspace_hack",
 ]
@@ -6245,7 +6191,7 @@ dependencies = [
  "pageserver_api",
  "postgres_ffi",
  "remote_storage",
- "reqwest 0.12.4",
+ "reqwest",
  "rustls 0.23.18",
  "rustls-native-certs 0.8.0",
  "serde",
@@ -6274,7 +6220,7 @@ dependencies = [
  "humantime",
  "pageserver_api",
  "pageserver_client",
- "reqwest 0.12.4",
+ "reqwest",
  "serde_json",
  "storage_controller_client",
  "tokio",
@@ -7514,19 +7460,6 @@ version = "0.2.92"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96"
 
-[[package]]
-name = "wasm-streams"
-version = "0.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b4609d447824375f43e1ffbc051b50ad8f4b3ae8219680c94452ea05eb240ac7"
-dependencies = [
- "futures-util",
- "js-sys",
- "wasm-bindgen",
- "wasm-bindgen-futures",
- "web-sys",
-]
-
 [[package]]
 name = "wasm-streams"
 version = "0.4.0"
@@ -7792,16 +7725,6 @@ dependencies = [
  "memchr",
 ]
 
-[[package]]
-name = "winreg"
-version = "0.50.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "524e57b2c537c0f9b1e69f1965311ec12182b4122e45035b1508cd24d2adadb1"
-dependencies = [
- "cfg-if",
- "windows-sys 0.48.0",
-]
-
 [[package]]
 name = "winreg"
 version = "0.52.0"
@@ -7876,7 +7799,7 @@ dependencies = [
  "regex",
  "regex-automata 0.4.3",
  "regex-syntax 0.8.2",
- "reqwest 0.12.4",
+ "reqwest",
  "rustls 0.23.18",
  "scopeguard",
  "serde",
diff --git a/Cargo.toml b/Cargo.toml
index a35823e0c2..0654c25a3d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -51,10 +51,6 @@ anyhow = { version = "1.0", features = ["backtrace"] }
 arc-swap = "1.6"
 async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
 atomic-take = "1.1.0"
-azure_core = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls", "hmac_rust"] }
-azure_identity = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] }
-azure_storage = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] }
-azure_storage_blobs = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] }
 flate2 = "1.0.26"
 async-stream = "0.3"
 async-trait = "0.1"
@@ -216,6 +212,12 @@ postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git",
 postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
 tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
 
+## Azure SDK crates
+azure_core = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls", "hmac_rust"] }
+azure_identity = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls"] }
+azure_storage = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls"] }
+azure_storage_blobs = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls"] }
+
 ## Local libraries
 compute_api = { version = "0.1", path = "./libs/compute_api/" }
 consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs
index 8d1962fa29..a1d7569140 100644
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -8,15 +8,14 @@ use std::io;
 use std::num::NonZeroU32;
 use std::pin::Pin;
 use std::str::FromStr;
-use std::sync::Arc;
 use std::time::Duration;
 use std::time::SystemTime;
 
 use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
+use anyhow::Context;
 use anyhow::Result;
 use azure_core::request_options::{IfMatchCondition, MaxResults, Metadata, Range};
 use azure_core::{Continuable, RetryOptions};
-use azure_identity::DefaultAzureCredential;
 use azure_storage::StorageCredentials;
 use azure_storage_blobs::blob::CopyStatus;
 use azure_storage_blobs::prelude::ClientBuilder;
@@ -76,8 +75,9 @@ impl AzureBlobStorage {
         let credentials = if let Ok(access_key) = env::var("AZURE_STORAGE_ACCESS_KEY") {
             StorageCredentials::access_key(account.clone(), access_key)
         } else {
-            let token_credential = DefaultAzureCredential::default();
-            StorageCredentials::token_credential(Arc::new(token_credential))
+            let token_credential = azure_identity::create_default_credential()
+                .context("trying to obtain Azure default credentials")?;
+            StorageCredentials::token_credential(token_credential)
         };
 
         // we have an outer retry

From 92273b6d5e4133e637d7b5341659007acd0933fb Mon Sep 17 00:00:00 2001
From: a-masterov <72613290+a-masterov@users.noreply.github.com>
Date: Mon, 9 Dec 2024 20:30:39 +0100
Subject: [PATCH 089/117] Enable the pg_regress tests on staging for PG17
 (#9978)

## Problem
Currently, we run the `pg_regress` tests only for PG16
However, PG17 is a part of Neon and should be tested as well
## Summary of changes
Modified the workflow and added a patch for PG17 enabling the
`pg_regress` tests.
The problem with leftovers was solved by using branches.
---
 .github/actionlint.yml                        |    2 +
 .github/workflows/cloud-regress.yml           |   32 +-
 compute/patches/cloud_regress_pg17.patch      | 4047 +++++++++++++++++
 test_runner/cloud_regress/README.md           |   21 +
 .../cloud_regress/test_cloud_regress.py       |   53 -
 5 files changed, 4096 insertions(+), 59 deletions(-)
 create mode 100644 compute/patches/cloud_regress_pg17.patch
 create mode 100644 test_runner/cloud_regress/README.md

diff --git a/.github/actionlint.yml b/.github/actionlint.yml
index 29c4d18f4a..27c8fb3c23 100644
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -21,3 +21,5 @@ config-variables:
   - SLACK_UPCOMING_RELEASE_CHANNEL_ID
   - DEV_AWS_OIDC_ROLE_ARN
   - BENCHMARK_INGEST_TARGET_PROJECTID
+  - PGREGRESS_PG16_PROJECT_ID
+  - PGREGRESS_PG17_PROJECT_ID
diff --git a/.github/workflows/cloud-regress.yml b/.github/workflows/cloud-regress.yml
index 19ebf457b8..57194090cf 100644
--- a/.github/workflows/cloud-regress.yml
+++ b/.github/workflows/cloud-regress.yml
@@ -23,11 +23,14 @@ jobs:
   regress:
     env:
       POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
-      DEFAULT_PG_VERSION: 16
       TEST_OUTPUT: /tmp/test_output
       BUILD_TYPE: remote
       AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
       AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
+    strategy:
+      fail-fast: false
+      matrix:
+        pg-version: [16, 17]
 
     runs-on: us-east-2
     container:
@@ -40,9 +43,11 @@ jobs:
           submodules: true
 
       - name: Patch the test
+        env:
+          PG_VERSION: ${{matrix.pg-version}}
         run: |
-          cd "vendor/postgres-v${DEFAULT_PG_VERSION}"
-          patch -p1 < "../../compute/patches/cloud_regress_pg${DEFAULT_PG_VERSION}.patch"
+          cd "vendor/postgres-v${PG_VERSION}"
+          patch -p1 < "../../compute/patches/cloud_regress_pg${PG_VERSION}.patch"
 
       - name: Generate a random password
         id: pwgen
@@ -55,8 +60,9 @@ jobs:
       - name: Change tests according to the generated password
         env:
           DBPASS: ${{ steps.pwgen.outputs.DBPASS }}
+          PG_VERSION: ${{matrix.pg-version}}
         run: |
-          cd vendor/postgres-v"${DEFAULT_PG_VERSION}"/src/test/regress
+          cd vendor/postgres-v"${PG_VERSION}"/src/test/regress
           for fname in sql/*.sql expected/*.out; do
             sed -i.bak s/NEON_PASSWORD_PLACEHOLDER/"'${DBPASS}'"/ "${fname}"
           done
@@ -73,15 +79,29 @@ jobs:
           path: /tmp/neon/
           prefix: latest
 
+      - name: Create a new branch
+        id: create-branch
+        uses: ./.github/actions/neon-branch-create
+        with:
+          api_key: ${{ secrets.NEON_STAGING_API_KEY }}
+          project_id: ${{ vars[format('PGREGRESS_PG{0}_PROJECT_ID', matrix.pg-version)] }}
+
       - name: Run the regression tests
         uses: ./.github/actions/run-python-test-set
         with:
           build_type: ${{ env.BUILD_TYPE }}
           test_selection: cloud_regress
-          pg_version: ${{ env.DEFAULT_PG_VERSION }}
+          pg_version: ${{matrix.pg-version}}
           extra_params: -m remote_cluster
         env:
-          BENCHMARK_CONNSTR: ${{ secrets.PG_REGRESS_CONNSTR }}
+          BENCHMARK_CONNSTR: ${{steps.create-branch.outputs.dsn}}
+
+      - name: Delete branch
+        uses: ./.github/actions/neon-branch-delete
+        with:
+          api_key: ${{ secrets.NEON_STAGING_API_KEY }}
+          project_id: ${{ vars[format('PGREGRESS_PG{0}_PROJECT_ID', matrix.pg-version)] }}
+          branch_id: ${{steps.create-branch.outputs.branch_id}}
 
       - name: Create Allure report
         id: create-allure-report
diff --git a/compute/patches/cloud_regress_pg17.patch b/compute/patches/cloud_regress_pg17.patch
new file mode 100644
index 0000000000..cbe84ef54b
--- /dev/null
+++ b/compute/patches/cloud_regress_pg17.patch
@@ -0,0 +1,4047 @@
+diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
+index 1c1ca7573a..6dfe537647 100644
+--- a/src/test/regress/expected/aggregates.out
++++ b/src/test/regress/expected/aggregates.out
+@@ -11,7 +11,8 @@ CREATE TABLE aggtest (
+ 	b			float4
+ );
+ \set filename :abs_srcdir '/data/agg.data'
+-COPY aggtest FROM :'filename';
++\set command '\\copy aggtest FROM ' :'filename';
++:command
+ ANALYZE aggtest;
+ SELECT avg(four) AS avg_1 FROM onek;
+        avg_1        
+diff --git a/src/test/regress/expected/alter_generic.out b/src/test/regress/expected/alter_generic.out
+index ae54cb254f..888e2ee8bc 100644
+--- a/src/test/regress/expected/alter_generic.out
++++ b/src/test/regress/expected/alter_generic.out
+@@ -15,9 +15,9 @@ DROP ROLE IF EXISTS regress_alter_generic_user1;
+ DROP ROLE IF EXISTS regress_alter_generic_user2;
+ DROP ROLE IF EXISTS regress_alter_generic_user3;
+ RESET client_min_messages;
+-CREATE USER regress_alter_generic_user3;
+-CREATE USER regress_alter_generic_user2;
+-CREATE USER regress_alter_generic_user1 IN ROLE regress_alter_generic_user3;
++CREATE USER regress_alter_generic_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_alter_generic_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_alter_generic_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE regress_alter_generic_user3;
+ CREATE SCHEMA alt_nsp1;
+ CREATE SCHEMA alt_nsp2;
+ GRANT ALL ON SCHEMA alt_nsp1, alt_nsp2 TO public;
+@@ -370,7 +370,7 @@ ERROR:  STORAGE cannot be specified in ALTER OPERATOR FAMILY
+ DROP OPERATOR FAMILY alt_opf4 USING btree;
+ -- Should fail. Need to be SUPERUSER to do ALTER OPERATOR FAMILY .. ADD / DROP
+ BEGIN TRANSACTION;
+-CREATE ROLE regress_alter_generic_user5 NOSUPERUSER;
++CREATE ROLE regress_alter_generic_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER NOSUPERUSER;
+ CREATE OPERATOR FAMILY alt_opf5 USING btree;
+ SET ROLE regress_alter_generic_user5;
+ ALTER OPERATOR FAMILY alt_opf5 USING btree ADD OPERATOR 1 < (int4, int2), FUNCTION 1 btint42cmp(int4, int2);
+@@ -382,7 +382,7 @@ ERROR:  current transaction is aborted, commands ignored until end of transactio
+ ROLLBACK;
+ -- Should fail. Need rights to namespace for ALTER OPERATOR FAMILY .. ADD / DROP
+ BEGIN TRANSACTION;
+-CREATE ROLE regress_alter_generic_user6;
++CREATE ROLE regress_alter_generic_user6 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE SCHEMA alt_nsp6;
+ REVOKE ALL ON SCHEMA alt_nsp6 FROM regress_alter_generic_user6;
+ CREATE OPERATOR FAMILY alt_nsp6.alt_opf6 USING btree;
+diff --git a/src/test/regress/expected/alter_operator.out b/src/test/regress/expected/alter_operator.out
+index 4217ba15de..d28e3ff86e 100644
+--- a/src/test/regress/expected/alter_operator.out
++++ b/src/test/regress/expected/alter_operator.out
+@@ -119,7 +119,7 @@ ERROR:  operator attribute "Restrict" not recognized
+ --
+ -- Test permission check. Must be owner to ALTER OPERATOR.
+ --
+-CREATE USER regress_alter_op_user;
++CREATE USER regress_alter_op_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION regress_alter_op_user;
+ ALTER OPERATOR === (boolean, boolean) SET (RESTRICT = NONE);
+ ERROR:  must be owner of operator ===
+diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out
+index 6de74a26a9..cd59809194 100644
+--- a/src/test/regress/expected/alter_table.out
++++ b/src/test/regress/expected/alter_table.out
+@@ -5,7 +5,7 @@
+ SET client_min_messages TO 'warning';
+ DROP ROLE IF EXISTS regress_alter_table_user1;
+ RESET client_min_messages;
+-CREATE USER regress_alter_table_user1;
++CREATE USER regress_alter_table_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ --
+ -- add attribute
+ --
+@@ -3928,8 +3928,8 @@ DROP TABLE fail_part;
+ ALTER TABLE list_parted ATTACH PARTITION nonexistent FOR VALUES IN (1);
+ ERROR:  relation "nonexistent" does not exist
+ -- check ownership of the source table
+-CREATE ROLE regress_test_me;
+-CREATE ROLE regress_test_not_me;
++CREATE ROLE regress_test_me PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_test_not_me PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE not_owned_by_me (LIKE list_parted);
+ ALTER TABLE not_owned_by_me OWNER TO regress_test_not_me;
+ SET SESSION AUTHORIZATION regress_test_me;
+diff --git a/src/test/regress/expected/arrays.out b/src/test/regress/expected/arrays.out
+index a6d81fd5f9..afefd761cb 100644
+--- a/src/test/regress/expected/arrays.out
++++ b/src/test/regress/expected/arrays.out
+@@ -18,7 +18,8 @@ CREATE TABLE array_op_test (
+ 	t			text[]
+ );
+ \set filename :abs_srcdir '/data/array.data'
+-COPY array_op_test FROM :'filename';
++\set command '\\copy array_op_test FROM ' :'filename';
++:command
+ ANALYZE array_op_test;
+ --
+ -- only the 'e' array is 0-based, the others are 1-based.
+diff --git a/src/test/regress/expected/btree_index.out b/src/test/regress/expected/btree_index.out
+index 510646cbce..0b3ca1f720 100644
+--- a/src/test/regress/expected/btree_index.out
++++ b/src/test/regress/expected/btree_index.out
+@@ -20,13 +20,17 @@ CREATE TABLE bt_f8_heap (
+ 	random 		int4
+ );
+ \set filename :abs_srcdir '/data/desc.data'
+-COPY bt_i4_heap FROM :'filename';
++\set command '\\copy bt_i4_heap FROM ' :'filename';
++:command
+ \set filename :abs_srcdir '/data/hash.data'
+-COPY bt_name_heap FROM :'filename';
++\set command '\\copy bt_name_heap FROM ' :'filename';
++:command
+ \set filename :abs_srcdir '/data/desc.data'
+-COPY bt_txt_heap FROM :'filename';
++\set command '\\copy bt_txt_heap FROM ' :'filename';
++:command
+ \set filename :abs_srcdir '/data/hash.data'
+-COPY bt_f8_heap FROM :'filename';
++\set command '\\copy bt_f8_heap FROM ' :'filename';
++:command
+ ANALYZE bt_i4_heap;
+ ANALYZE bt_name_heap;
+ ANALYZE bt_txt_heap;
+diff --git a/src/test/regress/expected/cluster.out b/src/test/regress/expected/cluster.out
+index a13aafff0b..f0289b5c06 100644
+--- a/src/test/regress/expected/cluster.out
++++ b/src/test/regress/expected/cluster.out
+@@ -308,7 +308,7 @@ WHERE pg_class.oid=indexrelid
+ -- Verify that toast tables are clusterable
+ CLUSTER pg_toast.pg_toast_826 USING pg_toast_826_index;
+ -- Verify that clustering all tables does in fact cluster the right ones
+-CREATE USER regress_clstr_user;
++CREATE USER regress_clstr_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE clstr_1 (a INT PRIMARY KEY);
+ CREATE TABLE clstr_2 (a INT PRIMARY KEY);
+ CREATE TABLE clstr_3 (a INT PRIMARY KEY);
+@@ -499,7 +499,7 @@ DROP TABLE clstrpart;
+ CREATE TABLE ptnowner(i int unique) PARTITION BY LIST (i);
+ CREATE INDEX ptnowner_i_idx ON ptnowner(i);
+ CREATE TABLE ptnowner1 PARTITION OF ptnowner FOR VALUES IN (1);
+-CREATE ROLE regress_ptnowner;
++CREATE ROLE regress_ptnowner PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE ptnowner2 PARTITION OF ptnowner FOR VALUES IN (2);
+ ALTER TABLE ptnowner1 OWNER TO regress_ptnowner;
+ SET SESSION AUTHORIZATION regress_ptnowner;
+diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out
+index 7a425afe1f..2756fb2d55 100644
+--- a/src/test/regress/expected/collate.icu.utf8.out
++++ b/src/test/regress/expected/collate.icu.utf8.out
+@@ -1016,7 +1016,7 @@ select * from collate_test1 where b ilike 'ABC';
+ 
+ reset enable_seqscan;
+ -- schema manipulation commands
+-CREATE ROLE regress_test_role;
++CREATE ROLE regress_test_role PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE SCHEMA test_schema;
+ -- We need to do this this way to cope with varying names for encodings:
+ SET client_min_messages TO WARNING;
+diff --git a/src/test/regress/expected/constraints.out b/src/test/regress/expected/constraints.out
+index cf0b80d616..e8e2a14a4a 100644
+--- a/src/test/regress/expected/constraints.out
++++ b/src/test/regress/expected/constraints.out
+@@ -349,7 +349,8 @@ CREATE TABLE COPY_TBL (x INT, y TEXT, z INT,
+ 	CONSTRAINT COPY_CON
+ 	CHECK (x > 3 AND y <> 'check failed' AND x < 7 ));
+ \set filename :abs_srcdir '/data/constro.data'
+-COPY COPY_TBL FROM :'filename';
++\set command '\\copy COPY_TBL FROM ' :'filename';
++:command
+ SELECT * FROM COPY_TBL;
+  x |       y       | z 
+ ---+---------------+---
+@@ -358,7 +359,8 @@ SELECT * FROM COPY_TBL;
+ (2 rows)
+ 
+ \set filename :abs_srcdir '/data/constrf.data'
+-COPY COPY_TBL FROM :'filename';
++\set command '\\copy COPY_TBL FROM ' :'filename';
++:command
+ ERROR:  new row for relation "copy_tbl" violates check constraint "copy_con"
+ DETAIL:  Failing row contains (7, check failed, 6).
+ CONTEXT:  COPY copy_tbl, line 2: "7	check failed	6"
+@@ -799,7 +801,7 @@ DETAIL:  Key (f1)=(3) conflicts with key (f1)=(3).
+ DROP TABLE deferred_excl;
+ -- Comments
+ -- Setup a low-level role to enforce non-superuser checks.
+-CREATE ROLE regress_constraint_comments;
++CREATE ROLE regress_constraint_comments PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION regress_constraint_comments;
+ CREATE TABLE constraint_comments_tbl (a int CONSTRAINT the_constraint CHECK (a > 0));
+ CREATE DOMAIN constraint_comments_dom AS int CONSTRAINT the_constraint CHECK (value > 0);
+@@ -819,7 +821,7 @@ COMMENT ON CONSTRAINT the_constraint ON constraint_comments_tbl IS NULL;
+ COMMENT ON CONSTRAINT the_constraint ON DOMAIN constraint_comments_dom IS NULL;
+ -- unauthorized user
+ RESET SESSION AUTHORIZATION;
+-CREATE ROLE regress_constraint_comments_noaccess;
++CREATE ROLE regress_constraint_comments_noaccess PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION regress_constraint_comments_noaccess;
+ COMMENT ON CONSTRAINT the_constraint ON constraint_comments_tbl IS 'no, the comment';
+ ERROR:  must be owner of relation constraint_comments_tbl
+diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out
+index 442e7aff2b..525f732b03 100644
+--- a/src/test/regress/expected/conversion.out
++++ b/src/test/regress/expected/conversion.out
+@@ -8,7 +8,7 @@
+ CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, result OUT bytea)
+     AS :'regresslib', 'test_enc_conversion'
+     LANGUAGE C STRICT;
+-CREATE USER regress_conversion_user WITH NOCREATEDB NOCREATEROLE;
++CREATE USER regress_conversion_user WITH NOCREATEDB NOCREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION regress_conversion_user;
+ CREATE CONVERSION myconv FOR 'LATIN1' TO 'UTF8' FROM iso8859_1_to_utf8;
+ --
+diff --git a/src/test/regress/expected/copy.out b/src/test/regress/expected/copy.out
+index 44114089a6..fc1894a0f2 100644
+--- a/src/test/regress/expected/copy.out
++++ b/src/test/regress/expected/copy.out
+@@ -15,9 +15,11 @@ insert into copytest values('Unix',E'abc\ndef',2);
+ insert into copytest values('Mac',E'abc\rdef',3);
+ insert into copytest values(E'esc\\ape',E'a\\r\\\r\\\n\\nb',4);
+ \set filename :abs_builddir '/results/copytest.csv'
+-copy copytest to :'filename' csv;
++\set command '\\copy copytest to ' :'filename' csv;
++:command
+ create temp table copytest2 (like copytest);
+-copy copytest2 from :'filename' csv;
++\set command '\\copy copytest2 from ' :'filename' csv;
++:command
+ select * from copytest except select * from copytest2;
+  style | test | filler 
+ -------+------+--------
+@@ -25,8 +27,10 @@ select * from copytest except select * from copytest2;
+ 
+ truncate copytest2;
+ --- same test but with an escape char different from quote char
+-copy copytest to :'filename' csv quote '''' escape E'\\';
+-copy copytest2 from :'filename' csv quote '''' escape E'\\';
++\set command '\\copy copytest to ' :'filename' ' csv quote ' '\'\'\'\'' ' escape ' 'E\'' '\\\\\'';
++:command
++\set command '\\copy copytest2 from ' :'filename' ' csv quote ' '\'\'\'\'' ' escape ' 'E\'' '\\\\\'';
++:command
+ select * from copytest except select * from copytest2;
+  style | test | filler 
+ -------+------+--------
+@@ -66,13 +70,16 @@ insert into parted_copytest select x,1,'One' from generate_series(1,1000) x;
+ insert into parted_copytest select x,2,'Two' from generate_series(1001,1010) x;
+ insert into parted_copytest select x,1,'One' from generate_series(1011,1020) x;
+ \set filename :abs_builddir '/results/parted_copytest.csv'
+-copy (select * from parted_copytest order by a) to :'filename';
++\set command '\\copy (select * from parted_copytest order by a) to ' :'filename';
++:command
+ truncate parted_copytest;
+-copy parted_copytest from :'filename';
++\set command '\\copy parted_copytest from ' :'filename';
++:command
+ -- Ensure COPY FREEZE errors for partitioned tables.
+ begin;
+ truncate parted_copytest;
+-copy parted_copytest from :'filename' (freeze);
++\set command '\\copy parted_copytest from ' :'filename' (freeze);
++:command
+ ERROR:  cannot perform COPY FREEZE on a partitioned table
+ rollback;
+ select tableoid::regclass,count(*),sum(a) from parted_copytest
+@@ -94,7 +101,8 @@ create trigger part_ins_trig
+ 	before insert on parted_copytest_a2
+ 	for each row
+ 	execute procedure part_ins_func();
+-copy parted_copytest from :'filename';
++\set command '\\copy parted_copytest from ' :'filename';
++:command
+ select tableoid::regclass,count(*),sum(a) from parted_copytest
+ group by tableoid order by tableoid::regclass::name;
+       tableoid      | count |  sum   
+@@ -106,7 +114,8 @@ group by tableoid order by tableoid::regclass::name;
+ truncate table parted_copytest;
+ create index on parted_copytest (b);
+ drop trigger part_ins_trig on parted_copytest_a2;
+-copy parted_copytest from stdin;
++\set command '\\copy parted_copytest from ' stdin;
++:command
+ -- Ensure index entries were properly added during the copy.
+ select * from parted_copytest where b = 1;
+  a | b |  c   
+@@ -170,9 +179,9 @@ INFO:  progress: {"type": "PIPE", "command": "COPY FROM", "relname": "tab_progre
+ -- Generate COPY FROM report with FILE, with some excluded tuples.
+ truncate tab_progress_reporting;
+ \set filename :abs_srcdir '/data/emp.data'
+-copy tab_progress_reporting from :'filename'
+-	where (salary < 2000);
+-INFO:  progress: {"type": "FILE", "command": "COPY FROM", "relname": "tab_progress_reporting", "has_bytes_total": true, "tuples_excluded": 1, "tuples_processed": 2, "has_bytes_processed": true}
++\set command '\\copy tab_progress_reporting from ' :'filename' 'where (salary < 2000)';
++:command
++INFO:  progress: {"type": "PIPE", "command": "COPY FROM", "relname": "tab_progress_reporting", "has_bytes_total": false, "tuples_excluded": 1, "tuples_processed": 2, "has_bytes_processed": true}
+ drop trigger check_after_tab_progress_reporting on tab_progress_reporting;
+ drop function notice_after_tab_progress_reporting();
+ drop table tab_progress_reporting;
+@@ -281,7 +290,8 @@ CREATE TABLE parted_si_p_odd PARTITION OF parted_si FOR VALUES IN (1);
+ -- https://postgr.es/m/18130-7a86a7356a75209d%40postgresql.org
+ -- https://postgr.es/m/257696.1695670946%40sss.pgh.pa.us
+ \set filename :abs_srcdir '/data/desc.data'
+-COPY parted_si(id, data) FROM :'filename';
++\set command '\\COPY parted_si(id, data) FROM ' :'filename';
++:command
+ -- An earlier bug (see commit b1ecb9b3fcf) could end up using a buffer from
+ -- the wrong partition. This test is *not* guaranteed to trigger that bug, but
+ -- does so when shared_buffers is small enough.  To test if we encountered the
+diff --git a/src/test/regress/expected/copy2.out b/src/test/regress/expected/copy2.out
+index 695b1b2d63..9c9addead6 100644
+--- a/src/test/regress/expected/copy2.out
++++ b/src/test/regress/expected/copy2.out
+@@ -631,8 +631,8 @@ select * from check_con_tbl;
+ (2 rows)
+ 
+ -- test with RLS enabled.
+-CREATE ROLE regress_rls_copy_user;
+-CREATE ROLE regress_rls_copy_user_colperms;
++CREATE ROLE regress_rls_copy_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_rls_copy_user_colperms PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE rls_t1 (a int, b int, c int);
+ COPY rls_t1 (a, b, c) from stdin;
+ CREATE POLICY p1 ON rls_t1 FOR SELECT USING (a % 2 = 0);
+diff --git a/src/test/regress/expected/create_function_sql.out b/src/test/regress/expected/create_function_sql.out
+index 50aca5940f..42527142f6 100644
+--- a/src/test/regress/expected/create_function_sql.out
++++ b/src/test/regress/expected/create_function_sql.out
+@@ -4,7 +4,7 @@
+ -- Assorted tests using SQL-language functions
+ --
+ -- All objects made in this test are in temp_func_test schema
+-CREATE USER regress_unpriv_user;
++CREATE USER regress_unpriv_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE SCHEMA temp_func_test;
+ GRANT ALL ON SCHEMA temp_func_test TO public;
+ SET search_path TO temp_func_test, public;
+diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out
+index cf6eac5734..3e56ea09d7 100644
+--- a/src/test/regress/expected/create_index.out
++++ b/src/test/regress/expected/create_index.out
+@@ -51,7 +51,8 @@ CREATE TABLE fast_emp4000 (
+ 	home_base	 box
+ );
+ \set filename :abs_srcdir '/data/rect.data'
+-COPY slow_emp4000 FROM :'filename';
++\set command '\\copy slow_emp4000 FROM ' :'filename';
++:command
+ INSERT INTO fast_emp4000 SELECT * FROM slow_emp4000;
+ ANALYZE slow_emp4000;
+ ANALYZE fast_emp4000;
+@@ -655,7 +656,8 @@ CREATE TABLE array_index_op_test (
+ 	t			text[]
+ );
+ \set filename :abs_srcdir '/data/array.data'
+-COPY array_index_op_test FROM :'filename';
++\set command '\\copy array_index_op_test FROM ' :'filename';
++:command
+ ANALYZE array_index_op_test;
+ SELECT * FROM array_index_op_test WHERE i = '{NULL}' ORDER BY seqno;
+  seqno |   i    |   t    
+@@ -2966,7 +2968,7 @@ END;
+ -- concurrently
+ REINDEX SCHEMA CONCURRENTLY schema_to_reindex;
+ -- Failure for unauthorized user
+-CREATE ROLE regress_reindexuser NOLOGIN;
++CREATE ROLE regress_reindexuser NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION ROLE regress_reindexuser;
+ REINDEX SCHEMA schema_to_reindex;
+ ERROR:  must be owner of schema schema_to_reindex
+diff --git a/src/test/regress/expected/create_procedure.out b/src/test/regress/expected/create_procedure.out
+index 2177ba3509..ae3ca94d00 100644
+--- a/src/test/regress/expected/create_procedure.out
++++ b/src/test/regress/expected/create_procedure.out
+@@ -421,7 +421,7 @@ ERROR:  cp_testfunc1(integer) is not a procedure
+ DROP PROCEDURE nonexistent();
+ ERROR:  procedure nonexistent() does not exist
+ -- privileges
+-CREATE USER regress_cp_user1;
++CREATE USER regress_cp_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT INSERT ON cp_test TO regress_cp_user1;
+ REVOKE EXECUTE ON PROCEDURE ptest1(text) FROM PUBLIC;
+ SET ROLE regress_cp_user1;
+diff --git a/src/test/regress/expected/create_role.out b/src/test/regress/expected/create_role.out
+index 46d4f9efe9..fc2a28a2f6 100644
+--- a/src/test/regress/expected/create_role.out
++++ b/src/test/regress/expected/create_role.out
+@@ -1,28 +1,28 @@
+ -- ok, superuser can create users with any set of privileges
+-CREATE ROLE regress_role_super SUPERUSER;
+-CREATE ROLE regress_role_admin CREATEDB CREATEROLE REPLICATION BYPASSRLS;
++CREATE ROLE regress_role_super SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_role_admin CREATEDB CREATEROLE REPLICATION BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT CREATE ON DATABASE regression TO regress_role_admin WITH GRANT OPTION;
+-CREATE ROLE regress_role_limited_admin CREATEROLE;
+-CREATE ROLE regress_role_normal;
++CREATE ROLE regress_role_limited_admin CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_role_normal PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- fail, CREATEROLE user can't give away role attributes without having them
+ SET SESSION AUTHORIZATION regress_role_limited_admin;
+-CREATE ROLE regress_nosuch_superuser SUPERUSER;
++CREATE ROLE regress_nosuch_superuser SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ ERROR:  permission denied to create role
+ DETAIL:  Only roles with the SUPERUSER attribute may create roles with the SUPERUSER attribute.
+-CREATE ROLE regress_nosuch_replication_bypassrls REPLICATION BYPASSRLS;
++CREATE ROLE regress_nosuch_replication_bypassrls REPLICATION BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ ERROR:  permission denied to create role
+ DETAIL:  Only roles with the REPLICATION attribute may create roles with the REPLICATION attribute.
+-CREATE ROLE regress_nosuch_replication REPLICATION;
++CREATE ROLE regress_nosuch_replication REPLICATION PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ ERROR:  permission denied to create role
+ DETAIL:  Only roles with the REPLICATION attribute may create roles with the REPLICATION attribute.
+-CREATE ROLE regress_nosuch_bypassrls BYPASSRLS;
++CREATE ROLE regress_nosuch_bypassrls BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ ERROR:  permission denied to create role
+ DETAIL:  Only roles with the BYPASSRLS attribute may create roles with the BYPASSRLS attribute.
+-CREATE ROLE regress_nosuch_createdb CREATEDB;
++CREATE ROLE regress_nosuch_createdb CREATEDB PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ ERROR:  permission denied to create role
+ DETAIL:  Only roles with the CREATEDB attribute may create roles with the CREATEDB attribute.
+ -- ok, can create a role without any special attributes
+-CREATE ROLE regress_role_limited;
++CREATE ROLE regress_role_limited PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- fail, can't give it in any of the restricted attributes
+ ALTER ROLE regress_role_limited SUPERUSER;
+ ERROR:  permission denied to alter role
+@@ -39,10 +39,10 @@ DETAIL:  Only roles with the BYPASSRLS attribute may change the BYPASSRLS attrib
+ DROP ROLE regress_role_limited;
+ -- ok, can give away these role attributes if you have them
+ SET SESSION AUTHORIZATION regress_role_admin;
+-CREATE ROLE regress_replication_bypassrls REPLICATION BYPASSRLS;
+-CREATE ROLE regress_replication REPLICATION;
+-CREATE ROLE regress_bypassrls BYPASSRLS;
+-CREATE ROLE regress_createdb CREATEDB;
++CREATE ROLE regress_replication_bypassrls REPLICATION BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_replication REPLICATION PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_bypassrls BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_createdb CREATEDB PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- ok, can toggle these role attributes off and on if you have them
+ ALTER ROLE regress_replication NOREPLICATION;
+ ALTER ROLE regress_replication REPLICATION;
+@@ -58,48 +58,48 @@ ALTER ROLE regress_createdb NOSUPERUSER;
+ ERROR:  permission denied to alter role
+ DETAIL:  Only roles with the SUPERUSER attribute may change the SUPERUSER attribute.
+ -- ok, having CREATEROLE is enough to create users with these privileges
+-CREATE ROLE regress_createrole CREATEROLE NOINHERIT;
++CREATE ROLE regress_createrole CREATEROLE NOINHERIT PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT CREATE ON DATABASE regression TO regress_createrole WITH GRANT OPTION;
+-CREATE ROLE regress_login LOGIN;
+-CREATE ROLE regress_inherit INHERIT;
+-CREATE ROLE regress_connection_limit CONNECTION LIMIT 5;
+-CREATE ROLE regress_encrypted_password ENCRYPTED PASSWORD 'foo';
+-CREATE ROLE regress_password_null PASSWORD NULL;
++CREATE ROLE regress_login LOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_inherit INHERIT PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_connection_limit CONNECTION LIMIT 5 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_encrypted_password ENCRYPTED PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_password_null PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- ok, backwards compatible noise words should be ignored
+-CREATE ROLE regress_noiseword SYSID 12345;
++CREATE ROLE regress_noiseword SYSID 12345 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ NOTICE:  SYSID can no longer be specified
+ -- fail, cannot grant membership in superuser role
+-CREATE ROLE regress_nosuch_super IN ROLE regress_role_super;
++CREATE ROLE regress_nosuch_super IN ROLE regress_role_super PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ ERROR:  permission denied to grant role "regress_role_super"
+ DETAIL:  Only roles with the SUPERUSER attribute may grant roles with the SUPERUSER attribute.
+ -- fail, database owner cannot have members
+-CREATE ROLE regress_nosuch_dbowner IN ROLE pg_database_owner;
++CREATE ROLE regress_nosuch_dbowner IN ROLE pg_database_owner PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ ERROR:  role "pg_database_owner" cannot have explicit members
+ -- ok, can grant other users into a role
+ CREATE ROLE regress_inroles ROLE
+ 	regress_role_super, regress_createdb, regress_createrole, regress_login,
+-	regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null;
++	regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- fail, cannot grant a role into itself
+-CREATE ROLE regress_nosuch_recursive ROLE regress_nosuch_recursive;
++CREATE ROLE regress_nosuch_recursive ROLE regress_nosuch_recursive PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ ERROR:  role "regress_nosuch_recursive" is a member of role "regress_nosuch_recursive"
+ -- ok, can grant other users into a role with admin option
+ CREATE ROLE regress_adminroles ADMIN
+ 	regress_role_super, regress_createdb, regress_createrole, regress_login,
+-	regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null;
++	regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- fail, cannot grant a role into itself with admin option
+-CREATE ROLE regress_nosuch_admin_recursive ADMIN regress_nosuch_admin_recursive;
++CREATE ROLE regress_nosuch_admin_recursive ADMIN regress_nosuch_admin_recursive PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ ERROR:  role "regress_nosuch_admin_recursive" is a member of role "regress_nosuch_admin_recursive"
+ -- fail, regress_createrole does not have CREATEDB privilege
+ SET SESSION AUTHORIZATION regress_createrole;
+ CREATE DATABASE regress_nosuch_db;
+ ERROR:  permission denied to create database
+ -- ok, regress_createrole can create new roles
+-CREATE ROLE regress_plainrole;
++CREATE ROLE regress_plainrole PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- ok, roles with CREATEROLE can create new roles with it
+-CREATE ROLE regress_rolecreator CREATEROLE;
++CREATE ROLE regress_rolecreator CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- ok, roles with CREATEROLE can create new roles with different role
+ -- attributes, including CREATEROLE
+-CREATE ROLE regress_hasprivs CREATEROLE LOGIN INHERIT CONNECTION LIMIT 5;
++CREATE ROLE regress_hasprivs CREATEROLE LOGIN INHERIT CONNECTION LIMIT 5 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- ok, we should be able to modify a role we created
+ COMMENT ON ROLE regress_hasprivs IS 'some comment';
+ ALTER ROLE regress_hasprivs RENAME TO regress_tenant;
+@@ -141,7 +141,7 @@ ERROR:  permission denied to reassign objects
+ DETAIL:  Only roles with privileges of role "regress_tenant" may reassign objects owned by it.
+ -- ok, create a role with a value for createrole_self_grant
+ SET createrole_self_grant = 'set, inherit';
+-CREATE ROLE regress_tenant2;
++CREATE ROLE regress_tenant2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT CREATE ON DATABASE regression TO regress_tenant2;
+ -- ok, regress_tenant2 can create objects within the database
+ SET SESSION AUTHORIZATION regress_tenant2;
+@@ -165,34 +165,34 @@ ALTER TABLE tenant2_table OWNER TO regress_tenant2;
+ ERROR:  must be able to SET ROLE "regress_tenant2"
+ DROP TABLE tenant2_table;
+ -- fail, CREATEROLE is not enough to create roles in privileged roles
+-CREATE ROLE regress_read_all_data IN ROLE pg_read_all_data;
++CREATE ROLE regress_read_all_data PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_all_data;
+ ERROR:  permission denied to grant role "pg_read_all_data"
+ DETAIL:  Only roles with the ADMIN option on role "pg_read_all_data" may grant this role.
+-CREATE ROLE regress_write_all_data IN ROLE pg_write_all_data;
++CREATE ROLE regress_write_all_data PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_write_all_data;
+ ERROR:  permission denied to grant role "pg_write_all_data"
+ DETAIL:  Only roles with the ADMIN option on role "pg_write_all_data" may grant this role.
+-CREATE ROLE regress_monitor IN ROLE pg_monitor;
++CREATE ROLE regress_monitor PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_monitor;
+ ERROR:  permission denied to grant role "pg_monitor"
+ DETAIL:  Only roles with the ADMIN option on role "pg_monitor" may grant this role.
+-CREATE ROLE regress_read_all_settings IN ROLE pg_read_all_settings;
++CREATE ROLE regress_read_all_settings PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_all_settings;
+ ERROR:  permission denied to grant role "pg_read_all_settings"
+ DETAIL:  Only roles with the ADMIN option on role "pg_read_all_settings" may grant this role.
+-CREATE ROLE regress_read_all_stats IN ROLE pg_read_all_stats;
++CREATE ROLE regress_read_all_stats PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_all_stats;
+ ERROR:  permission denied to grant role "pg_read_all_stats"
+ DETAIL:  Only roles with the ADMIN option on role "pg_read_all_stats" may grant this role.
+-CREATE ROLE regress_stat_scan_tables IN ROLE pg_stat_scan_tables;
++CREATE ROLE regress_stat_scan_tables PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_stat_scan_tables;
+ ERROR:  permission denied to grant role "pg_stat_scan_tables"
+ DETAIL:  Only roles with the ADMIN option on role "pg_stat_scan_tables" may grant this role.
+-CREATE ROLE regress_read_server_files IN ROLE pg_read_server_files;
++CREATE ROLE regress_read_server_files PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_server_files;
+ ERROR:  permission denied to grant role "pg_read_server_files"
+ DETAIL:  Only roles with the ADMIN option on role "pg_read_server_files" may grant this role.
+-CREATE ROLE regress_write_server_files IN ROLE pg_write_server_files;
++CREATE ROLE regress_write_server_files PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_write_server_files;
+ ERROR:  permission denied to grant role "pg_write_server_files"
+ DETAIL:  Only roles with the ADMIN option on role "pg_write_server_files" may grant this role.
+-CREATE ROLE regress_execute_server_program IN ROLE pg_execute_server_program;
++CREATE ROLE regress_execute_server_program PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_execute_server_program;
+ ERROR:  permission denied to grant role "pg_execute_server_program"
+ DETAIL:  Only roles with the ADMIN option on role "pg_execute_server_program" may grant this role.
+-CREATE ROLE regress_signal_backend IN ROLE pg_signal_backend;
++CREATE ROLE regress_signal_backend PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_signal_backend;
+ ERROR:  permission denied to grant role "pg_signal_backend"
+ DETAIL:  Only roles with the ADMIN option on role "pg_signal_backend" may grant this role.
+ -- fail, role still owns database objects
+diff --git a/src/test/regress/expected/create_schema.out b/src/test/regress/expected/create_schema.out
+index 93302a07ef..1a73f083ac 100644
+--- a/src/test/regress/expected/create_schema.out
++++ b/src/test/regress/expected/create_schema.out
+@@ -2,7 +2,7 @@
+ -- CREATE_SCHEMA
+ --
+ -- Schema creation with elements.
+-CREATE ROLE regress_create_schema_role SUPERUSER;
++CREATE ROLE regress_create_schema_role SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- Cases where schema creation fails as objects are qualified with a schema
+ -- that does not match with what's expected.
+ -- This checks all the object types that include schema qualifications.
+diff --git a/src/test/regress/expected/create_view.out b/src/test/regress/expected/create_view.out
+index f551624afb..57f1e432d4 100644
+--- a/src/test/regress/expected/create_view.out
++++ b/src/test/regress/expected/create_view.out
+@@ -18,7 +18,8 @@ CREATE TABLE real_city (
+ 	outline 	path
+ );
+ \set filename :abs_srcdir '/data/real_city.data'
+-COPY real_city FROM :'filename';
++\set command '\\copy real_city FROM ' :'filename';
++:command
+ ANALYZE real_city;
+ SELECT *
+    INTO TABLE ramp
+diff --git a/src/test/regress/expected/database.out b/src/test/regress/expected/database.out
+index 454db91ec0..01378d7081 100644
+--- a/src/test/regress/expected/database.out
++++ b/src/test/regress/expected/database.out
+@@ -1,8 +1,7 @@
+ CREATE DATABASE regression_tbd
+ 	ENCODING utf8 LC_COLLATE "C" LC_CTYPE "C" TEMPLATE template0;
+ ALTER DATABASE regression_tbd RENAME TO regression_utf8;
+-ALTER DATABASE regression_utf8 SET TABLESPACE regress_tblspace;
+-ALTER DATABASE regression_utf8 RESET TABLESPACE;
++WARNING:  you need to manually restart any running background workers after this command
+ ALTER DATABASE regression_utf8 CONNECTION_LIMIT 123;
+ -- Test PgDatabaseToastTable.  Doing this with GRANT would be slow.
+ BEGIN;
+diff --git a/src/test/regress/expected/dependency.out b/src/test/regress/expected/dependency.out
+index 74d9ff2998..fad0151614 100644
+--- a/src/test/regress/expected/dependency.out
++++ b/src/test/regress/expected/dependency.out
+@@ -1,10 +1,10 @@
+ --
+ -- DEPENDENCIES
+ --
+-CREATE USER regress_dep_user;
+-CREATE USER regress_dep_user2;
+-CREATE USER regress_dep_user3;
+-CREATE GROUP regress_dep_group;
++CREATE USER regress_dep_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_dep_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_dep_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE GROUP regress_dep_group PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE deptest (f1 serial primary key, f2 text);
+ GRANT SELECT ON TABLE deptest TO GROUP regress_dep_group;
+ GRANT ALL ON TABLE deptest TO regress_dep_user, regress_dep_user2;
+@@ -41,9 +41,9 @@ ERROR:  role "regress_dep_user3" cannot be dropped because some objects depend o
+ DROP TABLE deptest;
+ DROP USER regress_dep_user3;
+ -- Test DROP OWNED
+-CREATE USER regress_dep_user0;
+-CREATE USER regress_dep_user1;
+-CREATE USER regress_dep_user2;
++CREATE USER regress_dep_user0 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_dep_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_dep_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION regress_dep_user0;
+ -- permission denied
+ DROP OWNED BY regress_dep_user1;
+diff --git a/src/test/regress/expected/drop_if_exists.out b/src/test/regress/expected/drop_if_exists.out
+index 5e44c2c3ce..eb3bb329fb 100644
+--- a/src/test/regress/expected/drop_if_exists.out
++++ b/src/test/regress/expected/drop_if_exists.out
+@@ -64,9 +64,9 @@ ERROR:  type "test_domain_exists" does not exist
+ ---
+ --- role/user/group
+ ---
+-CREATE USER regress_test_u1;
+-CREATE ROLE regress_test_r1;
+-CREATE GROUP regress_test_g1;
++CREATE USER regress_test_u1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_test_r1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE GROUP regress_test_g1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ DROP USER regress_test_u2;
+ ERROR:  role "regress_test_u2" does not exist
+ DROP USER IF EXISTS regress_test_u1, regress_test_u2;
+diff --git a/src/test/regress/expected/equivclass.out b/src/test/regress/expected/equivclass.out
+index 126f7047fe..0e2cc73426 100644
+--- a/src/test/regress/expected/equivclass.out
++++ b/src/test/regress/expected/equivclass.out
+@@ -384,7 +384,7 @@ set enable_nestloop = on;
+ set enable_mergejoin = off;
+ alter table ec1 enable row level security;
+ create policy p1 on ec1 using (f1 < '5'::int8alias1);
+-create user regress_user_ectest;
++create user regress_user_ectest PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ grant select on ec0 to regress_user_ectest;
+ grant select on ec1 to regress_user_ectest;
+ -- without any RLS, we'll treat {a.ff, b.ff, 43} as an EquivalenceClass
+diff --git a/src/test/regress/expected/event_trigger.out b/src/test/regress/expected/event_trigger.out
+index 7b2198eac6..39919697ad 100644
+--- a/src/test/regress/expected/event_trigger.out
++++ b/src/test/regress/expected/event_trigger.out
+@@ -85,7 +85,7 @@ create event trigger regress_event_trigger2 on ddl_command_start
+ -- OK
+ comment on event trigger regress_event_trigger is 'test comment';
+ -- drop as non-superuser should fail
+-create role regress_evt_user;
++create role regress_evt_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ set role regress_evt_user;
+ create event trigger regress_event_trigger_noperms on ddl_command_start
+    execute procedure test_event_trigger();
+diff --git a/src/test/regress/expected/foreign_data.out b/src/test/regress/expected/foreign_data.out
+index 6ed50fdcfa..caa00a345d 100644
+--- a/src/test/regress/expected/foreign_data.out
++++ b/src/test/regress/expected/foreign_data.out
+@@ -14,13 +14,13 @@ CREATE FUNCTION test_fdw_handler()
+ SET client_min_messages TO 'warning';
+ DROP ROLE IF EXISTS regress_foreign_data_user, regress_test_role, regress_test_role2, regress_test_role_super, regress_test_indirect, regress_unprivileged_role;
+ RESET client_min_messages;
+-CREATE ROLE regress_foreign_data_user LOGIN SUPERUSER;
++CREATE ROLE regress_foreign_data_user LOGIN SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION 'regress_foreign_data_user';
+-CREATE ROLE regress_test_role;
+-CREATE ROLE regress_test_role2;
+-CREATE ROLE regress_test_role_super SUPERUSER;
+-CREATE ROLE regress_test_indirect;
+-CREATE ROLE regress_unprivileged_role;
++CREATE ROLE regress_test_role PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_test_role2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_test_role_super SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_test_indirect PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_unprivileged_role PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE FOREIGN DATA WRAPPER dummy;
+ COMMENT ON FOREIGN DATA WRAPPER dummy IS 'useless';
+ CREATE FOREIGN DATA WRAPPER postgresql VALIDATOR postgresql_fdw_validator;
+diff --git a/src/test/regress/expected/foreign_key.out b/src/test/regress/expected/foreign_key.out
+index 69994c98e3..129abcfbe8 100644
+--- a/src/test/regress/expected/foreign_key.out
++++ b/src/test/regress/expected/foreign_key.out
+@@ -1985,7 +1985,7 @@ ALTER TABLE fk_partitioned_fk_6 ATTACH PARTITION fk_partitioned_pk_6 FOR VALUES
+ ERROR:  cannot ALTER TABLE "fk_partitioned_pk_61" because it is being used by active queries in this session
+ DROP TABLE fk_partitioned_pk_6, fk_partitioned_fk_6;
+ -- test the case when the referenced table is owned by a different user
+-create role regress_other_partitioned_fk_owner;
++create role regress_other_partitioned_fk_owner PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ grant references on fk_notpartitioned_pk to regress_other_partitioned_fk_owner;
+ set role regress_other_partitioned_fk_owner;
+ create table other_partitioned_fk(a int, b int) partition by list (a);
+diff --git a/src/test/regress/expected/generated.out b/src/test/regress/expected/generated.out
+index 499072e14c..bd7a8b3f18 100644
+--- a/src/test/regress/expected/generated.out
++++ b/src/test/regress/expected/generated.out
+@@ -534,7 +534,7 @@ CREATE TABLE gtest10a (a int PRIMARY KEY, b int GENERATED ALWAYS AS (a * 2) STOR
+ ALTER TABLE gtest10a DROP COLUMN b;
+ INSERT INTO gtest10a (a) VALUES (1);
+ -- privileges
+-CREATE USER regress_user11;
++CREATE USER regress_user11 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE gtest11s (a int PRIMARY KEY, b int, c int GENERATED ALWAYS AS (b * 2) STORED);
+ INSERT INTO gtest11s VALUES (1, 10), (2, 20);
+ GRANT SELECT (a, c) ON gtest11s TO regress_user11;
+diff --git a/src/test/regress/expected/guc.out b/src/test/regress/expected/guc.out
+index 455b6d6c0c..12fa350c6d 100644
+--- a/src/test/regress/expected/guc.out
++++ b/src/test/regress/expected/guc.out
+@@ -584,7 +584,7 @@ PREPARE foo AS SELECT 1;
+ LISTEN foo_event;
+ SET vacuum_cost_delay = 13;
+ CREATE TEMP TABLE tmp_foo (data text) ON COMMIT DELETE ROWS;
+-CREATE ROLE regress_guc_user;
++CREATE ROLE regress_guc_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION regress_guc_user;
+ -- look changes
+ SELECT pg_listening_channels();
+diff --git a/src/test/regress/expected/hash_index.out b/src/test/regress/expected/hash_index.out
+index 0d4bdb2ade..9a5a9b5407 100644
+--- a/src/test/regress/expected/hash_index.out
++++ b/src/test/regress/expected/hash_index.out
+@@ -20,10 +20,14 @@ CREATE TABLE hash_f8_heap (
+ 	random 		float8
+ );
+ \set filename :abs_srcdir '/data/hash.data'
+-COPY hash_i4_heap FROM :'filename';
+-COPY hash_name_heap FROM :'filename';
+-COPY hash_txt_heap FROM :'filename';
+-COPY hash_f8_heap FROM :'filename';
++\set command '\\copy hash_i4_heap FROM ' :'filename';
++:command
++\set command '\\copy hash_name_heap FROM ' :'filename';
++:command
++\set command '\\copy hash_txt_heap FROM ' :'filename';
++:command
++\set command '\\copy hash_f8_heap FROM ' :'filename';
++:command
+ -- the data in this file has a lot of duplicates in the index key
+ -- fields, leading to long bucket chains and lots of table expansion.
+ -- this is therefore a stress test of the bucket overflow code (unlike
+diff --git a/src/test/regress/expected/identity.out b/src/test/regress/expected/identity.out
+index f14bfccfb1..bbb2092df9 100644
+--- a/src/test/regress/expected/identity.out
++++ b/src/test/regress/expected/identity.out
+@@ -520,7 +520,7 @@ ALTER TABLE itest7 ALTER COLUMN a SET GENERATED BY DEFAULT;
+ ALTER TABLE itest7 ALTER COLUMN a RESTART;
+ ALTER TABLE itest7 ALTER COLUMN a DROP IDENTITY;
+ -- privileges
+-CREATE USER regress_identity_user1;
++CREATE USER regress_identity_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE itest8 (a int GENERATED ALWAYS AS IDENTITY, b text);
+ GRANT SELECT, INSERT ON itest8 TO regress_identity_user1;
+ SET ROLE regress_identity_user1;
+diff --git a/src/test/regress/expected/inherit.out b/src/test/regress/expected/inherit.out
+index 85240a9b0b..5294f7557d 100644
+--- a/src/test/regress/expected/inherit.out
++++ b/src/test/regress/expected/inherit.out
+@@ -2055,8 +2055,8 @@ NOTICE:  drop cascades to table cnullchild
+ --
+ -- Mixed ownership inheritance tree
+ --
+-create role regress_alice;
+-create role regress_bob;
++create role regress_alice password NEON_PASSWORD_PLACEHOLDER;
++create role regress_bob password NEON_PASSWORD_PLACEHOLDER;
+ grant all on schema public to regress_alice, regress_bob;
+ grant regress_alice to regress_bob;
+ set session authorization regress_alice;
+@@ -2789,7 +2789,7 @@ create index on permtest_parent (left(c, 3));
+ insert into permtest_parent
+   select 1, 'a', left(fipshash(i::text), 5) from generate_series(0, 100) i;
+ analyze permtest_parent;
+-create role regress_no_child_access;
++create role regress_no_child_access PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ revoke all on permtest_grandchild from regress_no_child_access;
+ grant select on permtest_parent to regress_no_child_access;
+ set session authorization regress_no_child_access;
+diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out
+index cf4b5221a8..fa6ccb639c 100644
+--- a/src/test/regress/expected/insert.out
++++ b/src/test/regress/expected/insert.out
+@@ -802,7 +802,7 @@ drop table mlparted5;
+ -- appropriate key description (or none) in various situations
+ create table key_desc (a int, b int) partition by list ((a+0));
+ create table key_desc_1 partition of key_desc for values in (1) partition by range (b);
+-create user regress_insert_other_user;
++create user regress_insert_other_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ grant select (a) on key_desc_1 to regress_insert_other_user;
+ grant insert on key_desc to regress_insert_other_user;
+ set role regress_insert_other_user;
+@@ -914,7 +914,7 @@ DETAIL:  Failing row contains (2, hi there).
+ -- check that the message shows the appropriate column description in a
+ -- situation where the partitioned table is not the primary ModifyTable node
+ create table inserttest3 (f1 text default 'foo', f2 text default 'bar', f3 int);
+-create role regress_coldesc_role;
++create role regress_coldesc_role PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ grant insert on inserttest3 to regress_coldesc_role;
+ grant insert on brtrigpartcon to regress_coldesc_role;
+ revoke select on brtrigpartcon from regress_coldesc_role;
+diff --git a/src/test/regress/expected/jsonb.out b/src/test/regress/expected/jsonb.out
+index e66d760189..86348fd416 100644
+--- a/src/test/regress/expected/jsonb.out
++++ b/src/test/regress/expected/jsonb.out
+@@ -4,7 +4,8 @@ CREATE TABLE testjsonb (
+        j jsonb
+ );
+ \set filename :abs_srcdir '/data/jsonb.data'
+-COPY testjsonb FROM :'filename';
++\set command '\\copy testjsonb FROM ' :'filename';
++:command
+ -- Strings.
+ SELECT '""'::jsonb;				-- OK.
+  jsonb 
+diff --git a/src/test/regress/expected/largeobject.out b/src/test/regress/expected/largeobject.out
+index 4921dd79ae..d18a3cdd66 100644
+--- a/src/test/regress/expected/largeobject.out
++++ b/src/test/regress/expected/largeobject.out
+@@ -7,7 +7,7 @@
+ -- ensure consistent test output regardless of the default bytea format
+ SET bytea_output TO escape;
+ -- Test ALTER LARGE OBJECT OWNER
+-CREATE ROLE regress_lo_user;
++CREATE ROLE regress_lo_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SELECT lo_create(42);
+  lo_create 
+ -----------
+@@ -346,7 +346,8 @@ SELECT lo_unlink(loid) from lotest_stash_values;
+ 
+ TRUNCATE lotest_stash_values;
+ \set filename :abs_srcdir '/data/tenk.data'
+-INSERT INTO lotest_stash_values (loid) SELECT lo_import(:'filename');
++\lo_import :filename
++INSERT INTO lotest_stash_values (loid) VALUES (:LASTOID);
+ BEGIN;
+ UPDATE lotest_stash_values SET fd=lo_open(loid, CAST(x'20000' | x'40000' AS integer));
+ -- verify length of large object
+@@ -410,12 +411,8 @@ SELECT lo_close(fd) FROM lotest_stash_values;
+ 
+ END;
+ \set filename :abs_builddir '/results/lotest.txt'
+-SELECT lo_export(loid, :'filename') FROM lotest_stash_values;
+- lo_export 
+------------
+-         1
+-(1 row)
+-
++SELECT loid FROM lotest_stash_values \gset
++\lo_export :loid, :filename
+ \lo_import :filename
+ \set newloid :LASTOID
+ -- just make sure \lo_export does not barf
+diff --git a/src/test/regress/expected/lock.out b/src/test/regress/expected/lock.out
+index ad137d3645..8dac447436 100644
+--- a/src/test/regress/expected/lock.out
++++ b/src/test/regress/expected/lock.out
+@@ -16,7 +16,7 @@ CREATE VIEW lock_view3 AS SELECT * from lock_view2;
+ CREATE VIEW lock_view4 AS SELECT (select a from lock_tbl1a limit 1) from lock_tbl1;
+ CREATE VIEW lock_view5 AS SELECT * from lock_tbl1 where a in (select * from lock_tbl1a);
+ CREATE VIEW lock_view6 AS SELECT * from (select * from lock_tbl1) sub;
+-CREATE ROLE regress_rol_lock1;
++CREATE ROLE regress_rol_lock1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ ALTER ROLE regress_rol_lock1 SET search_path = lock_schema1;
+ GRANT USAGE ON SCHEMA lock_schema1 TO regress_rol_lock1;
+ -- Try all valid lock options; also try omitting the optional TABLE keyword.
+diff --git a/src/test/regress/expected/matview.out b/src/test/regress/expected/matview.out
+index 038ab73517..bd471f9fac 100644
+--- a/src/test/regress/expected/matview.out
++++ b/src/test/regress/expected/matview.out
+@@ -549,7 +549,7 @@ SELECT * FROM mvtest_mv_v;
+ DROP TABLE mvtest_v CASCADE;
+ NOTICE:  drop cascades to materialized view mvtest_mv_v
+ -- make sure running as superuser works when MV owned by another role (bug #11208)
+-CREATE ROLE regress_user_mvtest;
++CREATE ROLE regress_user_mvtest PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET ROLE regress_user_mvtest;
+ -- this test case also checks for ambiguity in the queries issued by
+ -- refresh_by_match_merge(), by choosing column names that intentionally
+@@ -617,7 +617,7 @@ HINT:  Use the REFRESH MATERIALIZED VIEW command.
+ ROLLBACK;
+ -- INSERT privileges if relation owner is not allowed to insert.
+ CREATE SCHEMA matview_schema;
+-CREATE USER regress_matview_user;
++CREATE USER regress_matview_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ ALTER DEFAULT PRIVILEGES FOR ROLE regress_matview_user
+   REVOKE INSERT ON TABLES FROM regress_matview_user;
+ GRANT ALL ON SCHEMA matview_schema TO public;
+diff --git a/src/test/regress/expected/merge.out b/src/test/regress/expected/merge.out
+index 521d70a891..7fd218f3d8 100644
+--- a/src/test/regress/expected/merge.out
++++ b/src/test/regress/expected/merge.out
+@@ -1,9 +1,9 @@
+ --
+ -- MERGE
+ --
+-CREATE USER regress_merge_privs;
+-CREATE USER regress_merge_no_privs;
+-CREATE USER regress_merge_none;
++CREATE USER regress_merge_privs PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_merge_no_privs PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_merge_none PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ DROP TABLE IF EXISTS target;
+ NOTICE:  table "target" does not exist, skipping
+ DROP TABLE IF EXISTS source;
+diff --git a/src/test/regress/expected/misc.out b/src/test/regress/expected/misc.out
+index 6e816c57f1..6ef45b468e 100644
+--- a/src/test/regress/expected/misc.out
++++ b/src/test/regress/expected/misc.out
+@@ -59,9 +59,11 @@ DROP TABLE tmp;
+ -- copy
+ --
+ \set filename :abs_builddir '/results/onek.data'
+-COPY onek TO :'filename';
++\set command '\\copy onek TO ' :'filename';
++:command
+ CREATE TEMP TABLE onek_copy (LIKE onek);
+-COPY onek_copy FROM :'filename';
++\set command '\\copy onek_copy FROM ' :'filename';
++:command
+ SELECT * FROM onek EXCEPT ALL SELECT * FROM onek_copy;
+  unique1 | unique2 | two | four | ten | twenty | hundred | thousand | twothousand | fivethous | tenthous | odd | even | stringu1 | stringu2 | string4 
+ ---------+---------+-----+------+-----+--------+---------+----------+-------------+-----------+----------+-----+------+----------+----------+---------
+@@ -73,9 +75,11 @@ SELECT * FROM onek_copy EXCEPT ALL SELECT * FROM onek;
+ (0 rows)
+ 
+ \set filename :abs_builddir '/results/stud_emp.data'
+-COPY BINARY stud_emp TO :'filename';
++\set command '\\COPY BINARY stud_emp TO ' :'filename';
++:command
+ CREATE TEMP TABLE stud_emp_copy (LIKE stud_emp);
+-COPY BINARY stud_emp_copy FROM :'filename';
++\set command '\\COPY BINARY stud_emp_copy FROM ' :'filename';
++:command
+ SELECT * FROM stud_emp_copy;
+  name  | age |  location  | salary | manager | gpa | percent 
+ -------+-----+------------+--------+---------+-----+---------
+diff --git a/src/test/regress/expected/misc_functions.out b/src/test/regress/expected/misc_functions.out
+index d94056862a..f8270d8343 100644
+--- a/src/test/regress/expected/misc_functions.out
++++ b/src/test/regress/expected/misc_functions.out
+@@ -297,7 +297,7 @@ SELECT pg_log_backend_memory_contexts(pid) FROM pg_stat_activity
+  t
+ (1 row)
+ 
+-CREATE ROLE regress_log_memory;
++CREATE ROLE regress_log_memory PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SELECT has_function_privilege('regress_log_memory',
+   'pg_log_backend_memory_contexts(integer)', 'EXECUTE'); -- no
+  has_function_privilege 
+@@ -483,7 +483,7 @@ select count(*) > 0 from
+ --
+ -- Test replication slot directory functions
+ --
+-CREATE ROLE regress_slot_dir_funcs;
++CREATE ROLE regress_slot_dir_funcs PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- Not available by default.
+ SELECT has_function_privilege('regress_slot_dir_funcs',
+   'pg_ls_logicalsnapdir()', 'EXECUTE');
+@@ -671,7 +671,7 @@ FROM pg_walfile_name_offset('0/0'::pg_lsn + :segment_size - 1),
+ (1 row)
+ 
+ -- pg_current_logfile
+-CREATE ROLE regress_current_logfile;
++CREATE ROLE regress_current_logfile PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- not available by default
+ SELECT has_function_privilege('regress_current_logfile',
+   'pg_current_logfile()', 'EXECUTE');
+diff --git a/src/test/regress/expected/multirangetypes.out b/src/test/regress/expected/multirangetypes.out
+index c6363ebeb2..8f43732404 100644
+--- a/src/test/regress/expected/multirangetypes.out
++++ b/src/test/regress/expected/multirangetypes.out
+@@ -3118,7 +3118,7 @@ drop type textrange2;
+ -- Multiranges don't have their own ownership or permissions.
+ --
+ create type textrange1 as range(subtype=text, multirange_type_name=multitextrange1, collation="C");
+-create role regress_multirange_owner;
++create role regress_multirange_owner password NEON_PASSWORD_PLACEHOLDER;
+ alter type multitextrange1 owner to regress_multirange_owner;  -- fail
+ ERROR:  cannot alter multirange type multitextrange1
+ HINT:  You can alter type textrange1, which will alter the multirange type as well.
+diff --git a/src/test/regress/expected/object_address.out b/src/test/regress/expected/object_address.out
+index fc42d418bf..e38f517574 100644
+--- a/src/test/regress/expected/object_address.out
++++ b/src/test/regress/expected/object_address.out
+@@ -5,7 +5,7 @@
+ SET client_min_messages TO 'warning';
+ DROP ROLE IF EXISTS regress_addr_user;
+ RESET client_min_messages;
+-CREATE USER regress_addr_user;
++CREATE USER regress_addr_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- Test generic object addressing/identification functions
+ CREATE SCHEMA addr_nsp;
+ SET search_path TO 'addr_nsp';
+diff --git a/src/test/regress/expected/password.out b/src/test/regress/expected/password.out
+index 924d6e001d..5966531db6 100644
+--- a/src/test/regress/expected/password.out
++++ b/src/test/regress/expected/password.out
+@@ -12,13 +12,13 @@ SET password_encryption = 'md5'; -- ok
+ SET password_encryption = 'scram-sha-256'; -- ok
+ -- consistency of password entries
+ SET password_encryption = 'md5';
+-CREATE ROLE regress_passwd1;
+-ALTER ROLE regress_passwd1 PASSWORD 'role_pwd1';
+-CREATE ROLE regress_passwd2;
+-ALTER ROLE regress_passwd2 PASSWORD 'role_pwd2';
++CREATE ROLE regress_passwd1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++ALTER ROLE regress_passwd1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET password_encryption = 'scram-sha-256';
+-CREATE ROLE regress_passwd3 PASSWORD 'role_pwd3';
+-CREATE ROLE regress_passwd4 PASSWORD NULL;
++CREATE ROLE regress_passwd3 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- check list of created entries
+ --
+ -- The scram secret will look something like:
+@@ -32,10 +32,10 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+
+     ORDER BY rolname, rolpassword;
+      rolname     |                rolpassword_masked                 
+ -----------------+---------------------------------------------------
+- regress_passwd1 | md5783277baca28003b33453252be4dbb34
+- regress_passwd2 | md54044304ba511dd062133eb5b4b84a2a3
++ regress_passwd1 | NEON_MD5_PLACEHOLDER_regress_passwd1
++ regress_passwd2 | NEON_MD5_PLACEHOLDER_regress_passwd2
+  regress_passwd3 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey>
+- regress_passwd4 | 
++ regress_passwd4 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey>
+ (4 rows)
+ 
+ -- Rename a role
+@@ -56,24 +56,30 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2;
+ -- passwords.
+ SET password_encryption = 'md5';
+ -- encrypt with MD5
+-ALTER ROLE regress_passwd2 PASSWORD 'foo';
++ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- already encrypted, use as they are
+ ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70';
++ERROR:  Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"}
+ ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo=';
++ERROR:  Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"}
+ SET password_encryption = 'scram-sha-256';
+ -- create SCRAM secret
+-ALTER ROLE  regress_passwd4 PASSWORD 'foo';
++ALTER ROLE  regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- already encrypted with MD5, use as it is
+ CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023';
++ERROR:  Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"}
+ -- This looks like a valid SCRAM-SHA-256 secret, but it is not
+ -- so it should be hashed with SCRAM-SHA-256.
+ CREATE ROLE regress_passwd6 PASSWORD 'SCRAM-SHA-256$1234';
++ERROR:  Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"}
+ -- These may look like valid MD5 secrets, but they are not, so they
+ -- should be hashed with SCRAM-SHA-256.
+ -- trailing garbage at the end
+ CREATE ROLE regress_passwd7 PASSWORD 'md5012345678901234567890123456789zz';
++ERROR:  Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"}
+ -- invalid length
+ CREATE ROLE regress_passwd8 PASSWORD 'md501234567890123456789012345678901zz';
++ERROR:  Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"}
+ -- Changing the SCRAM iteration count
+ SET scram_iterations = 1024;
+ CREATE ROLE regress_passwd9 PASSWORD 'alterediterationcount';
+@@ -83,63 +89,67 @@ SELECT rolname, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+
+     ORDER BY rolname, rolpassword;
+      rolname     |                rolpassword_masked                 
+ -----------------+---------------------------------------------------
+- regress_passwd1 | md5cd3578025fe2c3d7ed1b9a9b26238b70
+- regress_passwd2 | md5dfa155cadd5f4ad57860162f3fab9cdb
++ regress_passwd1 | NEON_MD5_PLACEHOLDER_regress_passwd1
++ regress_passwd2 | NEON_MD5_PLACEHOLDER_regress_passwd2
+  regress_passwd3 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey>
+  regress_passwd4 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey>
+- regress_passwd5 | md5e73a4b11df52a6068f8b39f90be36023
+- regress_passwd6 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey>
+- regress_passwd7 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey>
+- regress_passwd8 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey>
+  regress_passwd9 | SCRAM-SHA-256$1024:<salt>$<storedkey>:<serverkey>
+-(9 rows)
++(5 rows)
+ 
+ -- An empty password is not allowed, in any form
+ CREATE ROLE regress_passwd_empty PASSWORD '';
+ NOTICE:  empty string is not a valid password, clearing password
++ERROR:  Failed to get encrypted password: User "regress_passwd_empty" has no password assigned.
+ ALTER ROLE regress_passwd_empty PASSWORD 'md585939a5ce845f1a1b620742e3c659e0a';
+-NOTICE:  empty string is not a valid password, clearing password
++ERROR:  role "regress_passwd_empty" does not exist
+ ALTER ROLE regress_passwd_empty PASSWORD 'SCRAM-SHA-256$4096:hpFyHTUsSWcR7O9P$LgZFIt6Oqdo27ZFKbZ2nV+vtnYM995pDh9ca6WSi120=:qVV5NeluNfUPkwm7Vqat25RjSPLkGeoZBQs6wVv+um4=';
+-NOTICE:  empty string is not a valid password, clearing password
++ERROR:  role "regress_passwd_empty" does not exist
+ SELECT rolpassword FROM pg_authid WHERE rolname='regress_passwd_empty';
+  rolpassword 
+ -------------
+- 
+-(1 row)
++(0 rows)
+ 
+ -- Test with invalid stored and server keys.
+ --
+ -- The first is valid, to act as a control. The others have too long
+ -- stored/server keys. They will be re-hashed.
+ CREATE ROLE regress_passwd_sha_len0 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI=';
++ERROR:  Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"}
+ CREATE ROLE regress_passwd_sha_len1 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96RqwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZI=';
++ERROR:  Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"}
+ CREATE ROLE regress_passwd_sha_len2 PASSWORD 'SCRAM-SHA-256$4096:A6xHKoH/494E941doaPOYg==$Ky+A30sewHIH3VHQLRN9vYsuzlgNyGNKCh37dy96Rqw=:COPdlNiIkrsacU5QoxydEuOH6e/KfiipeETb/bPw8ZIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=';
++ERROR:  Received HTTP code 400 from control plane: {"error":"Neon only supports being given plaintext passwords"}
+ -- Check that the invalid secrets were re-hashed. A re-hashed secret
+ -- should not contain the original salt.
+ SELECT rolname, rolpassword not like '%A6xHKoH/494E941doaPOYg==%' as is_rolpassword_rehashed
+     FROM pg_authid
+     WHERE rolname LIKE 'regress_passwd_sha_len%'
+     ORDER BY rolname;
+-         rolname         | is_rolpassword_rehashed 
+--------------------------+-------------------------
+- regress_passwd_sha_len0 | f
+- regress_passwd_sha_len1 | t
+- regress_passwd_sha_len2 | t
+-(3 rows)
++ rolname | is_rolpassword_rehashed 
++---------+-------------------------
++(0 rows)
+ 
+ DROP ROLE regress_passwd1;
+ DROP ROLE regress_passwd2;
+ DROP ROLE regress_passwd3;
+ DROP ROLE regress_passwd4;
+ DROP ROLE regress_passwd5;
++ERROR:  role "regress_passwd5" does not exist
+ DROP ROLE regress_passwd6;
++ERROR:  role "regress_passwd6" does not exist
+ DROP ROLE regress_passwd7;
++ERROR:  role "regress_passwd7" does not exist
+ DROP ROLE regress_passwd8;
++ERROR:  role "regress_passwd8" does not exist
+ DROP ROLE regress_passwd9;
+ DROP ROLE regress_passwd_empty;
++ERROR:  role "regress_passwd_empty" does not exist
+ DROP ROLE regress_passwd_sha_len0;
++ERROR:  role "regress_passwd_sha_len0" does not exist
+ DROP ROLE regress_passwd_sha_len1;
++ERROR:  role "regress_passwd_sha_len1" does not exist
+ DROP ROLE regress_passwd_sha_len2;
++ERROR:  role "regress_passwd_sha_len2" does not exist
+ -- all entries should have been removed
+ SELECT rolname, rolpassword
+     FROM pg_authid
+diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out
+index 1296da0d57..f43fffa44c 100644
+--- a/src/test/regress/expected/privileges.out
++++ b/src/test/regress/expected/privileges.out
+@@ -20,19 +20,19 @@ SELECT lo_unlink(oid) FROM pg_largeobject_metadata WHERE oid >= 1000 AND oid < 3
+ 
+ RESET client_min_messages;
+ -- test proper begins here
+-CREATE USER regress_priv_user1;
+-CREATE USER regress_priv_user2;
+-CREATE USER regress_priv_user3;
+-CREATE USER regress_priv_user4;
+-CREATE USER regress_priv_user5;
+-CREATE USER regress_priv_user5;	-- duplicate
++CREATE USER regress_priv_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user4 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER;	-- duplicate
+ ERROR:  role "regress_priv_user5" already exists
+-CREATE USER regress_priv_user6;
+-CREATE USER regress_priv_user7;
+-CREATE USER regress_priv_user8;
+-CREATE USER regress_priv_user9;
+-CREATE USER regress_priv_user10;
+-CREATE ROLE regress_priv_role;
++CREATE USER regress_priv_user6 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user7 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user8 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user9 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user10 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_priv_role PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- circular ADMIN OPTION grants should be disallowed
+ GRANT regress_priv_user1 TO regress_priv_user2 WITH ADMIN OPTION;
+ GRANT regress_priv_user1 TO regress_priv_user3 WITH ADMIN OPTION GRANTED BY regress_priv_user2;
+@@ -108,11 +108,11 @@ ERROR:  role "regress_priv_user5" cannot be dropped because some objects depend
+ DETAIL:  privileges for membership of role regress_priv_user6 in role regress_priv_user1
+ DROP ROLE regress_priv_user1, regress_priv_user5; -- ok, despite order
+ -- recreate the roles we just dropped
+-CREATE USER regress_priv_user1;
+-CREATE USER regress_priv_user2;
+-CREATE USER regress_priv_user3;
+-CREATE USER regress_priv_user4;
+-CREATE USER regress_priv_user5;
++CREATE USER regress_priv_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user4 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT pg_read_all_data TO regress_priv_user6;
+ GRANT pg_write_all_data TO regress_priv_user7;
+ GRANT pg_read_all_settings TO regress_priv_user8 WITH ADMIN OPTION;
+@@ -212,8 +212,8 @@ REVOKE pg_read_all_settings FROM regress_priv_user8;
+ DROP USER regress_priv_user10;
+ DROP USER regress_priv_user9;
+ DROP USER regress_priv_user8;
+-CREATE GROUP regress_priv_group1;
+-CREATE GROUP regress_priv_group2 WITH ADMIN regress_priv_user1 USER regress_priv_user2;
++CREATE GROUP regress_priv_group1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE GROUP regress_priv_group2 WITH ADMIN regress_priv_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER USER regress_priv_user2;
+ ALTER GROUP regress_priv_group1 ADD USER regress_priv_user4;
+ GRANT regress_priv_group2 TO regress_priv_user2 GRANTED BY regress_priv_user1;
+ SET SESSION AUTHORIZATION regress_priv_user1;
+@@ -239,12 +239,16 @@ GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY regre
+ ERROR:  permission denied to grant privileges as role "regress_priv_role"
+ DETAIL:  The grantor must have the ADMIN option on role "regress_priv_role".
+ GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY CURRENT_ROLE;
++ERROR:  permission denied to grant privileges as role "neondb_owner"
++DETAIL:  The grantor must have the ADMIN option on role "regress_priv_role".
+ REVOKE ADMIN OPTION FOR regress_priv_role FROM regress_priv_user1 GRANTED BY foo; -- error
+ ERROR:  role "foo" does not exist
+ REVOKE ADMIN OPTION FOR regress_priv_role FROM regress_priv_user1 GRANTED BY regress_priv_user2; -- warning, noop
+ WARNING:  role "regress_priv_user1" has not been granted membership in role "regress_priv_role" by role "regress_priv_user2"
+ REVOKE ADMIN OPTION FOR regress_priv_role FROM regress_priv_user1 GRANTED BY CURRENT_USER;
++WARNING:  role "regress_priv_user1" has not been granted membership in role "regress_priv_role" by role "neondb_owner"
+ REVOKE regress_priv_role FROM regress_priv_user1 GRANTED BY CURRENT_ROLE;
++WARNING:  role "regress_priv_user1" has not been granted membership in role "regress_priv_role" by role "neondb_owner"
+ DROP ROLE regress_priv_role;
+ SET SESSION AUTHORIZATION regress_priv_user1;
+ SELECT session_user, current_user;
+@@ -1776,7 +1780,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP
+ 
+ -- security-restricted operations
+ \c -
+-CREATE ROLE regress_sro_user;
++CREATE ROLE regress_sro_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- Check that index expressions and predicates are run as the table's owner
+ -- A dummy index function checking current_user
+ CREATE FUNCTION sro_ifun(int) RETURNS int AS $$
+@@ -2668,8 +2672,8 @@ drop cascades to function testns.priv_testagg(integer)
+ drop cascades to function testns.priv_testproc(integer)
+ -- Change owner of the schema & and rename of new schema owner
+ \c -
+-CREATE ROLE regress_schemauser1 superuser login;
+-CREATE ROLE regress_schemauser2 superuser login;
++CREATE ROLE regress_schemauser1 superuser login PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_schemauser2 superuser login PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION ROLE regress_schemauser1;
+ CREATE SCHEMA testns;
+ SELECT nspname, rolname FROM pg_namespace, pg_roles WHERE pg_namespace.nspname = 'testns' AND pg_namespace.nspowner = pg_roles.oid;
+@@ -2792,7 +2796,7 @@ DROP USER regress_priv_user7;
+ DROP USER regress_priv_user8; -- does not exist
+ ERROR:  role "regress_priv_user8" does not exist
+ -- permissions with LOCK TABLE
+-CREATE USER regress_locktable_user;
++CREATE USER regress_locktable_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE lock_table (a int);
+ -- LOCK TABLE and SELECT permission
+ GRANT SELECT ON lock_table TO regress_locktable_user;
+@@ -2888,7 +2892,7 @@ DROP USER regress_locktable_user;
+ -- pg_backend_memory_contexts.
+ -- switch to superuser
+ \c -
+-CREATE ROLE regress_readallstats;
++CREATE ROLE regress_readallstats PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no
+  has_table_privilege 
+ ---------------------
+@@ -2932,10 +2936,10 @@ RESET ROLE;
+ -- clean up
+ DROP ROLE regress_readallstats;
+ -- test role grantor machinery
+-CREATE ROLE regress_group;
+-CREATE ROLE regress_group_direct_manager;
+-CREATE ROLE regress_group_indirect_manager;
+-CREATE ROLE regress_group_member;
++CREATE ROLE regress_group PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_group_direct_manager PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_group_indirect_manager PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_group_member PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT regress_group TO regress_group_direct_manager WITH INHERIT FALSE, ADMIN TRUE;
+ GRANT regress_group_direct_manager TO regress_group_indirect_manager;
+ SET SESSION AUTHORIZATION regress_group_direct_manager;
+@@ -2964,9 +2968,9 @@ DROP ROLE regress_group_direct_manager;
+ DROP ROLE regress_group_indirect_manager;
+ DROP ROLE regress_group_member;
+ -- test SET and INHERIT options with object ownership changes
+-CREATE ROLE regress_roleoption_protagonist;
+-CREATE ROLE regress_roleoption_donor;
+-CREATE ROLE regress_roleoption_recipient;
++CREATE ROLE regress_roleoption_protagonist PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_roleoption_donor PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_roleoption_recipient PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE SCHEMA regress_roleoption;
+ GRANT CREATE, USAGE ON SCHEMA regress_roleoption TO PUBLIC;
+ GRANT regress_roleoption_donor TO regress_roleoption_protagonist WITH INHERIT TRUE, SET FALSE;
+@@ -2995,9 +2999,9 @@ DROP ROLE regress_roleoption_protagonist;
+ DROP ROLE regress_roleoption_donor;
+ DROP ROLE regress_roleoption_recipient;
+ -- MAINTAIN
+-CREATE ROLE regress_no_maintain;
+-CREATE ROLE regress_maintain;
+-CREATE ROLE regress_maintain_all IN ROLE pg_maintain;
++CREATE ROLE regress_no_maintain PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_maintain PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_maintain_all IN ROLE pg_maintain PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE maintain_test (a INT);
+ CREATE INDEX ON maintain_test (a);
+ GRANT MAINTAIN ON maintain_test TO regress_maintain;
+diff --git a/src/test/regress/expected/psql.out b/src/test/regress/expected/psql.out
+index 3bbe4c5f97..e742a46a63 100644
+--- a/src/test/regress/expected/psql.out
++++ b/src/test/regress/expected/psql.out
+@@ -2862,7 +2862,7 @@ Type                | func
+ -- check conditional am display
+ \pset expanded off
+ CREATE SCHEMA tableam_display;
+-CREATE ROLE regress_display_role;
++CREATE ROLE regress_display_role PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ ALTER SCHEMA tableam_display OWNER TO regress_display_role;
+ SET search_path TO tableam_display;
+ CREATE ACCESS METHOD heap_psql TYPE TABLE HANDLER heap_tableam_handler;
+@@ -4817,7 +4817,7 @@ last error code: 22012
+ reset debug_parallel_query;
+ \unset FETCH_COUNT
+ create schema testpart;
+-create role regress_partitioning_role;
++create role regress_partitioning_role PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ alter schema testpart owner to regress_partitioning_role;
+ set role to regress_partitioning_role;
+ -- run test inside own schema and hide other partitions
+@@ -5269,7 +5269,7 @@ reset work_mem;
+ 
+ -- check \df+
+ -- we have to use functions with a predictable owner name, so make a role
+-create role regress_psql_user superuser;
++create role regress_psql_user superuser PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ begin;
+ set session authorization regress_psql_user;
+ create function psql_df_internal (float8)
+@@ -5557,11 +5557,14 @@ CREATE TEMPORARY TABLE reload_output(
+   line text
+ );
+ SELECT 1 AS a \g :g_out_file
+-COPY reload_output(line) FROM :'g_out_file';
++\set command '\\COPY reload_output(line) FROM ' :'g_out_file';
++:command
+ SELECT 2 AS b\; SELECT 3 AS c\; SELECT 4 AS d \g :g_out_file
+-COPY reload_output(line) FROM :'g_out_file';
++\set command '\\COPY reload_output(line) FROM ' :'g_out_file';
++:command
+ COPY (SELECT 'foo') TO STDOUT \; COPY (SELECT 'bar') TO STDOUT \g :g_out_file
+-COPY reload_output(line) FROM :'g_out_file';
++\set command '\\COPY reload_output(line) FROM ' :'g_out_file';
++:command
+ SELECT line FROM reload_output ORDER BY lineno;
+   line   
+ ---------
+@@ -5600,13 +5603,15 @@ SELECT 1 AS a\; SELECT 2 AS b\; SELECT 3 AS c;
+ -- COPY TO file
+ -- The data goes to :g_out_file and the status to :o_out_file
+ \set QUIET false
+-COPY (SELECT unique1 FROM onek ORDER BY unique1 LIMIT 10) TO :'g_out_file';
++\set command '\\COPY (SELECT unique1 FROM onek ORDER BY unique1 LIMIT 10) TO ' :'g_out_file';
++:command
+ -- DML command status
+ UPDATE onek SET unique1 = unique1 WHERE false;
+ \set QUIET true
+ \o
+ -- Check the contents of the files generated.
+-COPY reload_output(line) FROM :'g_out_file';
++\set command '\\COPY reload_output(line) FROM ' :'g_out_file';
++:command
+ SELECT line FROM reload_output ORDER BY lineno;
+  line 
+ ------
+@@ -5623,7 +5628,8 @@ SELECT line FROM reload_output ORDER BY lineno;
+ (10 rows)
+ 
+ TRUNCATE TABLE reload_output;
+-COPY reload_output(line) FROM :'o_out_file';
++\set command '\\COPY reload_output(line) FROM ' :'o_out_file';
++:command
+ SELECT line FROM reload_output ORDER BY lineno;
+    line   
+ ----------
+@@ -5660,7 +5666,8 @@ COPY (SELECT 'foo1') TO STDOUT \; COPY (SELECT 'bar1') TO STDOUT;
+ COPY (SELECT 'foo2') TO STDOUT \; COPY (SELECT 'bar2') TO STDOUT \g :g_out_file
+ \o
+ -- Check the contents of the files generated.
+-COPY reload_output(line) FROM :'g_out_file';
++\set command '\\COPY reload_output(line) FROM ' :'g_out_file';
++:command
+ SELECT line FROM reload_output ORDER BY lineno;
+  line 
+ ------
+@@ -5669,7 +5676,8 @@ SELECT line FROM reload_output ORDER BY lineno;
+ (2 rows)
+ 
+ TRUNCATE TABLE reload_output;
+-COPY reload_output(line) FROM :'o_out_file';
++\set command '\\COPY reload_output(line) FROM ' :'o_out_file';
++:command
+ SELECT line FROM reload_output ORDER BY lineno;
+  line 
+ ------
+@@ -6633,10 +6641,10 @@ cross-database references are not implemented: "no.such.database"."no.such.schem
+ \dX "no.such.database"."no.such.schema"."no.such.extended.statistics"
+ cross-database references are not implemented: "no.such.database"."no.such.schema"."no.such.extended.statistics"
+ -- check \drg and \du
+-CREATE ROLE regress_du_role0;
+-CREATE ROLE regress_du_role1;
+-CREATE ROLE regress_du_role2;
+-CREATE ROLE regress_du_admin;
++CREATE ROLE regress_du_role0 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_du_role1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_du_role2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_du_admin PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT regress_du_role0 TO regress_du_admin WITH ADMIN TRUE;
+ GRANT regress_du_role1 TO regress_du_admin WITH ADMIN TRUE;
+ GRANT regress_du_role2 TO regress_du_admin WITH ADMIN TRUE;
+diff --git a/src/test/regress/expected/publication.out b/src/test/regress/expected/publication.out
+index 30b6371134..cc01076c22 100644
+--- a/src/test/regress/expected/publication.out
++++ b/src/test/regress/expected/publication.out
+@@ -1,9 +1,9 @@
+ --
+ -- PUBLICATION
+ --
+-CREATE ROLE regress_publication_user LOGIN SUPERUSER;
+-CREATE ROLE regress_publication_user2;
+-CREATE ROLE regress_publication_user_dummy LOGIN NOSUPERUSER;
++CREATE ROLE regress_publication_user LOGIN SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_publication_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_publication_user_dummy LOGIN NOSUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION 'regress_publication_user';
+ -- suppress warning that depends on wal_level
+ SET client_min_messages = 'ERROR';
+@@ -1221,7 +1221,7 @@ ALTER PUBLICATION testpub2 ADD TABLE testpub_tbl1;  -- ok
+ DROP PUBLICATION testpub2;
+ DROP PUBLICATION testpub3;
+ SET ROLE regress_publication_user;
+-CREATE ROLE regress_publication_user3;
++CREATE ROLE regress_publication_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT regress_publication_user2 TO regress_publication_user3;
+ SET client_min_messages = 'ERROR';
+ CREATE PUBLICATION testpub4 FOR TABLES IN SCHEMA pub_test;
+diff --git a/src/test/regress/expected/regproc.out b/src/test/regress/expected/regproc.out
+index 97b917502c..e9428535cb 100644
+--- a/src/test/regress/expected/regproc.out
++++ b/src/test/regress/expected/regproc.out
+@@ -2,7 +2,7 @@
+ -- regproc
+ --
+ /* If objects exist, return oids */
+-CREATE ROLE regress_regrole_test;
++CREATE ROLE regress_regrole_test PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- without schemaname
+ SELECT regoper('||/');
+  regoper 
+diff --git a/src/test/regress/expected/roleattributes.out b/src/test/regress/expected/roleattributes.out
+index 5e6969b173..2c4d52237f 100644
+--- a/src/test/regress/expected/roleattributes.out
++++ b/src/test/regress/expected/roleattributes.out
+@@ -1,233 +1,233 @@
+ -- default for superuser is false
+-CREATE ROLE regress_test_def_superuser;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_superuser';
+-          rolname           | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_def_superuser | f        | t          | f             | f           | f           | f              | f            |           -1 |             | 
++CREATE ROLE regress_test_def_superuser PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_superuser';
++          rolname           | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_def_superuser | f        | t          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+-CREATE ROLE regress_test_superuser WITH SUPERUSER;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser';
+-        rolname         | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_superuser | t        | t          | f             | f           | f           | f              | f            |           -1 |             | 
++CREATE ROLE regress_test_superuser WITH SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser';
++        rolname         | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_superuser | t        | t          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ ALTER ROLE regress_test_superuser WITH NOSUPERUSER;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser';
+-        rolname         | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_superuser | f        | t          | f             | f           | f           | f              | f            |           -1 |             | 
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser';
++        rolname         | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_superuser | f        | t          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ ALTER ROLE regress_test_superuser WITH SUPERUSER;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser';
+-        rolname         | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_superuser | t        | t          | f             | f           | f           | f              | f            |           -1 |             | 
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser';
++        rolname         | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_superuser | t        | t          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ -- default for inherit is true
+-CREATE ROLE regress_test_def_inherit;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_inherit';
+-         rolname          | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+---------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_def_inherit | f        | t          | f             | f           | f           | f              | f            |           -1 |             | 
++CREATE ROLE regress_test_def_inherit PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_inherit';
++         rolname          | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++--------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_def_inherit | f        | t          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+-CREATE ROLE regress_test_inherit WITH NOINHERIT;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit';
+-       rolname        | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_inherit | f        | f          | f             | f           | f           | f              | f            |           -1 |             | 
++CREATE ROLE regress_test_inherit WITH NOINHERIT PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit';
++       rolname        | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_inherit | f        | f          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ ALTER ROLE regress_test_inherit WITH INHERIT;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit';
+-       rolname        | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_inherit | f        | t          | f             | f           | f           | f              | f            |           -1 |             | 
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit';
++       rolname        | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_inherit | f        | t          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ ALTER ROLE regress_test_inherit WITH NOINHERIT;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit';
+-       rolname        | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_inherit | f        | f          | f             | f           | f           | f              | f            |           -1 |             | 
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit';
++       rolname        | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_inherit | f        | f          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ -- default for create role is false
+-CREATE ROLE regress_test_def_createrole;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createrole';
+-           rolname           | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+------------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_def_createrole | f        | t          | f             | f           | f           | f              | f            |           -1 |             | 
++CREATE ROLE regress_test_def_createrole PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createrole';
++           rolname           | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++-----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_def_createrole | f        | t          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+-CREATE ROLE regress_test_createrole WITH CREATEROLE;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole';
+-         rolname         | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+--------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_createrole | f        | t          | t             | f           | f           | f              | f            |           -1 |             | 
++CREATE ROLE regress_test_createrole WITH CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole';
++         rolname         | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++-------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_createrole | f        | t          | t             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ ALTER ROLE regress_test_createrole WITH NOCREATEROLE;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole';
+-         rolname         | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+--------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_createrole | f        | t          | f             | f           | f           | f              | f            |           -1 |             | 
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole';
++         rolname         | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++-------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_createrole | f        | t          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ ALTER ROLE regress_test_createrole WITH CREATEROLE;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole';
+-         rolname         | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+--------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_createrole | f        | t          | t             | f           | f           | f              | f            |           -1 |             | 
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole';
++         rolname         | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++-------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_createrole | f        | t          | t             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ -- default for create database is false
+-CREATE ROLE regress_test_def_createdb;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createdb';
+-          rolname          | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_def_createdb | f        | t          | f             | f           | f           | f              | f            |           -1 |             | 
++CREATE ROLE regress_test_def_createdb PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createdb';
++          rolname          | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++---------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_def_createdb | f        | t          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+-CREATE ROLE regress_test_createdb WITH CREATEDB;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb';
+-        rolname        | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_createdb | f        | t          | f             | t           | f           | f              | f            |           -1 |             | 
++CREATE ROLE regress_test_createdb WITH CREATEDB PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb';
++        rolname        | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++-----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_createdb | f        | t          | f             | t           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ ALTER ROLE regress_test_createdb WITH NOCREATEDB;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb';
+-        rolname        | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_createdb | f        | t          | f             | f           | f           | f              | f            |           -1 |             | 
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb';
++        rolname        | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++-----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_createdb | f        | t          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ ALTER ROLE regress_test_createdb WITH CREATEDB;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb';
+-        rolname        | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_createdb | f        | t          | f             | t           | f           | f              | f            |           -1 |             | 
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb';
++        rolname        | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++-----------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_createdb | f        | t          | f             | t           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ -- default for can login is false for role
+-CREATE ROLE regress_test_def_role_canlogin;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_role_canlogin';
+-            rolname             | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+---------------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_def_role_canlogin | f        | t          | f             | f           | f           | f              | f            |           -1 |             | 
++CREATE ROLE regress_test_def_role_canlogin PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_role_canlogin';
++            rolname             | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++--------------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_def_role_canlogin | f        | t          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+-CREATE ROLE regress_test_role_canlogin WITH LOGIN;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin';
+-          rolname           | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_role_canlogin | f        | t          | f             | f           | t           | f              | f            |           -1 |             | 
++CREATE ROLE regress_test_role_canlogin WITH LOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin';
++          rolname           | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_role_canlogin | f        | t          | f             | f           | t           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ ALTER ROLE regress_test_role_canlogin WITH NOLOGIN;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin';
+-          rolname           | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_role_canlogin | f        | t          | f             | f           | f           | f              | f            |           -1 |             | 
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin';
++          rolname           | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_role_canlogin | f        | t          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ ALTER ROLE regress_test_role_canlogin WITH LOGIN;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin';
+-          rolname           | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_role_canlogin | f        | t          | f             | f           | t           | f              | f            |           -1 |             | 
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin';
++          rolname           | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_role_canlogin | f        | t          | f             | f           | t           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ -- default for can login is true for user
+-CREATE USER regress_test_def_user_canlogin;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_user_canlogin';
+-            rolname             | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+---------------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_def_user_canlogin | f        | t          | f             | f           | t           | f              | f            |           -1 |             | 
++CREATE USER regress_test_def_user_canlogin PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_user_canlogin';
++            rolname             | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++--------------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_def_user_canlogin | f        | t          | f             | f           | t           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+-CREATE USER regress_test_user_canlogin WITH NOLOGIN;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin';
+-          rolname           | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_user_canlogin | f        | t          | f             | f           | f           | f              | f            |           -1 |             | 
++CREATE USER regress_test_user_canlogin WITH NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin';
++          rolname           | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_user_canlogin | f        | t          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ ALTER USER regress_test_user_canlogin WITH LOGIN;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin';
+-          rolname           | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_user_canlogin | f        | t          | f             | f           | t           | f              | f            |           -1 |             | 
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin';
++          rolname           | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_user_canlogin | f        | t          | f             | f           | t           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ ALTER USER regress_test_user_canlogin WITH NOLOGIN;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin';
+-          rolname           | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_user_canlogin | f        | t          | f             | f           | f           | f              | f            |           -1 |             | 
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin';
++          rolname           | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_user_canlogin | f        | t          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ -- default for replication is false
+-CREATE ROLE regress_test_def_replication;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_replication';
+-           rolname            | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-------------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_def_replication | f        | t          | f             | f           | f           | f              | f            |           -1 |             | 
++CREATE ROLE regress_test_def_replication PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_replication';
++           rolname            | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++------------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_def_replication | f        | t          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+-CREATE ROLE regress_test_replication WITH REPLICATION;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication';
+-         rolname          | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+---------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_replication | f        | t          | f             | f           | f           | t              | f            |           -1 |             | 
++CREATE ROLE regress_test_replication WITH REPLICATION PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication';
++         rolname          | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++--------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_replication | f        | t          | f             | f           | f           | t              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ ALTER ROLE regress_test_replication WITH NOREPLICATION;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication';
+-         rolname          | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+---------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_replication | f        | t          | f             | f           | f           | f              | f            |           -1 |             | 
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication';
++         rolname          | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++--------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_replication | f        | t          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ ALTER ROLE regress_test_replication WITH REPLICATION;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication';
+-         rolname          | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+---------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_replication | f        | t          | f             | f           | f           | t              | f            |           -1 |             | 
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication';
++         rolname          | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++--------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_replication | f        | t          | f             | f           | f           | t              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ -- default for bypassrls is false
+-CREATE ROLE regress_test_def_bypassrls;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_bypassrls';
+-          rolname           | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_def_bypassrls | f        | t          | f             | f           | f           | f              | f            |           -1 |             | 
++CREATE ROLE regress_test_def_bypassrls PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_bypassrls';
++          rolname           | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++----------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_def_bypassrls | f        | t          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+-CREATE ROLE regress_test_bypassrls WITH BYPASSRLS;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls';
+-        rolname         | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_bypassrls | f        | t          | f             | f           | f           | f              | t            |           -1 |             | 
++CREATE ROLE regress_test_bypassrls WITH BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls';
++        rolname         | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_bypassrls | f        | t          | f             | f           | f           | f              | t            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ ALTER ROLE regress_test_bypassrls WITH NOBYPASSRLS;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls';
+-        rolname         | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_bypassrls | f        | t          | f             | f           | f           | f              | f            |           -1 |             | 
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls';
++        rolname         | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_bypassrls | f        | t          | f             | f           | f           | f              | f            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ ALTER ROLE regress_test_bypassrls WITH BYPASSRLS;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls';
+-        rolname         | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit | rolpassword | rolvaliduntil 
+-------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+-------------+---------------
+- regress_test_bypassrls | f        | t          | f             | f           | f           | f              | t            |           -1 |             | 
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls';
++        rolname         | rolsuper | rolinherit | rolcreaterole | rolcreatedb | rolcanlogin | rolreplication | rolbypassrls | rolconnlimit |                  regexp_replace                   | rolvaliduntil 
++------------------------+----------+------------+---------------+-------------+-------------+----------------+--------------+--------------+---------------------------------------------------+---------------
++ regress_test_bypassrls | f        | t          | f             | f           | f           | f              | t            |           -1 | SCRAM-SHA-256$4096:<salt>$<storedkey>:<serverkey> | 
+ (1 row)
+ 
+ -- clean up roles
+diff --git a/src/test/regress/expected/rowsecurity.out b/src/test/regress/expected/rowsecurity.out
+index 51bba175ec..45355a9c66 100644
+--- a/src/test/regress/expected/rowsecurity.out
++++ b/src/test/regress/expected/rowsecurity.out
+@@ -14,13 +14,13 @@ DROP ROLE IF EXISTS regress_rls_group2;
+ DROP SCHEMA IF EXISTS regress_rls_schema CASCADE;
+ RESET client_min_messages;
+ -- initial setup
+-CREATE USER regress_rls_alice NOLOGIN;
+-CREATE USER regress_rls_bob NOLOGIN;
+-CREATE USER regress_rls_carol NOLOGIN;
+-CREATE USER regress_rls_dave NOLOGIN;
+-CREATE USER regress_rls_exempt_user BYPASSRLS NOLOGIN;
+-CREATE ROLE regress_rls_group1 NOLOGIN;
+-CREATE ROLE regress_rls_group2 NOLOGIN;
++CREATE USER regress_rls_alice NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_rls_bob NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_rls_carol NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_rls_dave NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_rls_exempt_user BYPASSRLS NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_rls_group1 NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_rls_group2 NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT regress_rls_group1 TO regress_rls_bob;
+ GRANT regress_rls_group2 TO regress_rls_carol;
+ CREATE SCHEMA regress_rls_schema;
+@@ -4423,8 +4423,8 @@ SELECT count(*) = 0 FROM pg_depend
+ 
+ -- DROP OWNED BY testing
+ RESET SESSION AUTHORIZATION;
+-CREATE ROLE regress_rls_dob_role1;
+-CREATE ROLE regress_rls_dob_role2;
++CREATE ROLE regress_rls_dob_role1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_rls_dob_role2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE dob_t1 (c1 int);
+ CREATE TABLE dob_t2 (c1 int) PARTITION BY RANGE (c1);
+ CREATE POLICY p1 ON dob_t1 TO regress_rls_dob_role1 USING (true);
+diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
+index 13178e2b3d..9a3ebfea3c 100644
+--- a/src/test/regress/expected/rules.out
++++ b/src/test/regress/expected/rules.out
+@@ -3799,7 +3799,7 @@ DROP TABLE ruletest2;
+ -- Test non-SELECT rule on security invoker view.
+ -- Should use view owner's permissions.
+ --
+-CREATE USER regress_rule_user1;
++CREATE USER regress_rule_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE ruletest_t1 (x int);
+ CREATE TABLE ruletest_t2 (x int);
+ CREATE VIEW ruletest_v1 WITH (security_invoker=true) AS
+diff --git a/src/test/regress/expected/security_label.out b/src/test/regress/expected/security_label.out
+index a8e01a6220..83543b250a 100644
+--- a/src/test/regress/expected/security_label.out
++++ b/src/test/regress/expected/security_label.out
+@@ -6,8 +6,8 @@ SET client_min_messages TO 'warning';
+ DROP ROLE IF EXISTS regress_seclabel_user1;
+ DROP ROLE IF EXISTS regress_seclabel_user2;
+ RESET client_min_messages;
+-CREATE USER regress_seclabel_user1 WITH CREATEROLE;
+-CREATE USER regress_seclabel_user2;
++CREATE USER regress_seclabel_user1 WITH CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_seclabel_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE seclabel_tbl1 (a int, b text);
+ CREATE TABLE seclabel_tbl2 (x int, y text);
+ CREATE VIEW seclabel_view1 AS SELECT * FROM seclabel_tbl2;
+diff --git a/src/test/regress/expected/select_into.out b/src/test/regress/expected/select_into.out
+index b79fe9a1c0..e29fab88ab 100644
+--- a/src/test/regress/expected/select_into.out
++++ b/src/test/regress/expected/select_into.out
+@@ -15,7 +15,7 @@ DROP TABLE sitmp1;
+ -- SELECT INTO and INSERT permission, if owner is not allowed to insert.
+ --
+ CREATE SCHEMA selinto_schema;
+-CREATE USER regress_selinto_user;
++CREATE USER regress_selinto_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ ALTER DEFAULT PRIVILEGES FOR ROLE regress_selinto_user
+ 	  REVOKE INSERT ON TABLES FROM regress_selinto_user;
+ GRANT ALL ON SCHEMA selinto_schema TO public;
+diff --git a/src/test/regress/expected/select_parallel.out b/src/test/regress/expected/select_parallel.out
+index 496ddb1289..a4fea8e367 100644
+--- a/src/test/regress/expected/select_parallel.out
++++ b/src/test/regress/expected/select_parallel.out
+@@ -1295,7 +1295,7 @@ SELECT 1 FROM tenk1_vw_sec
+ 
+ rollback;
+ -- test that function option SET ROLE works in parallel workers.
+-create role regress_parallel_worker;
++create role regress_parallel_worker PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ create function set_and_report_role() returns text as
+   $$ select current_setting('role') $$ language sql parallel safe
+   set role = regress_parallel_worker;
+diff --git a/src/test/regress/expected/select_views.out b/src/test/regress/expected/select_views.out
+index 1aeed8452b..7d9427d070 100644
+--- a/src/test/regress/expected/select_views.out
++++ b/src/test/regress/expected/select_views.out
+@@ -1250,7 +1250,7 @@ SELECT * FROM toyemp WHERE name = 'sharon';
+ --
+ -- Test for Leaky view scenario
+ --
+-CREATE ROLE regress_alice;
++CREATE ROLE regress_alice PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE FUNCTION f_leak (text)
+        RETURNS bool LANGUAGE 'plpgsql' COST 0.0000001
+        AS 'BEGIN RAISE NOTICE ''f_leak => %'', $1; RETURN true; END';
+diff --git a/src/test/regress/expected/sequence.out b/src/test/regress/expected/sequence.out
+index fa8059dbcd..190d41afc7 100644
+--- a/src/test/regress/expected/sequence.out
++++ b/src/test/regress/expected/sequence.out
+@@ -22,7 +22,7 @@ CREATE SEQUENCE sequence_testx OWNED BY pg_class_oid_index.oid;  -- not a table
+ ERROR:  sequence cannot be owned by relation "pg_class_oid_index"
+ DETAIL:  This operation is not supported for indexes.
+ CREATE SEQUENCE sequence_testx OWNED BY pg_class.relname;  -- not same schema
+-ERROR:  sequence must be in same schema as table it is linked to
++ERROR:  sequence must have same owner as table it is linked to
+ CREATE TABLE sequence_test_table (a int);
+ CREATE SEQUENCE sequence_testx OWNED BY sequence_test_table.b;  -- wrong column
+ ERROR:  column "b" of relation "sequence_test_table" does not exist
+@@ -640,7 +640,7 @@ SELECT setval('sequence_test2', 1);  -- error
+ ERROR:  cannot execute setval() in a read-only transaction
+ ROLLBACK;
+ -- privileges tests
+-CREATE USER regress_seq_user;
++CREATE USER regress_seq_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- nextval
+ BEGIN;
+ SET LOCAL SESSION AUTHORIZATION regress_seq_user;
+diff --git a/src/test/regress/expected/stats.out b/src/test/regress/expected/stats.out
+index 6e08898b18..7eb5385b7a 100644
+--- a/src/test/regress/expected/stats.out
++++ b/src/test/regress/expected/stats.out
+@@ -1301,37 +1301,6 @@ SELECT current_setting('fsync') = 'off'
+  t
+ (1 row)
+ 
+--- Change the tablespace so that the table is rewritten directly, then SELECT
+--- from it to cause it to be read back into shared buffers.
+-SELECT sum(reads) AS io_sum_shared_before_reads
+-  FROM pg_stat_io WHERE context = 'normal' AND object = 'relation' \gset
+--- Do this in a transaction to prevent spurious failures due to concurrent accesses to our newly
+--- rewritten table, e.g. by autovacuum.
+-BEGIN;
+-ALTER TABLE test_io_shared SET TABLESPACE regress_tblspace;
+--- SELECT from the table so that the data is read into shared buffers and
+--- context 'normal', object 'relation' reads are counted.
+-SELECT COUNT(*) FROM test_io_shared;
+- count 
+--------
+-   100
+-(1 row)
+-
+-COMMIT;
+-SELECT pg_stat_force_next_flush();
+- pg_stat_force_next_flush 
+---------------------------
+- 
+-(1 row)
+-
+-SELECT sum(reads) AS io_sum_shared_after_reads
+-  FROM pg_stat_io WHERE context = 'normal' AND object = 'relation'  \gset
+-SELECT :io_sum_shared_after_reads > :io_sum_shared_before_reads;
+- ?column? 
+-----------
+- t
+-(1 row)
+-
+ SELECT sum(hits) AS io_sum_shared_before_hits
+   FROM pg_stat_io WHERE context = 'normal' AND object = 'relation' \gset
+ -- Select from the table again to count hits.
+@@ -1433,6 +1402,7 @@ SELECT :io_sum_local_after_evictions > :io_sum_local_before_evictions,
+ -- local buffers, exercising a different codepath than standard local buffer
+ -- writes.
+ ALTER TABLE test_io_local SET TABLESPACE regress_tblspace;
++ERROR:  tablespace "regress_tblspace" does not exist
+ SELECT pg_stat_force_next_flush();
+  pg_stat_force_next_flush 
+ --------------------------
+@@ -1444,7 +1414,7 @@ SELECT sum(writes) AS io_sum_local_new_tblspc_writes
+ SELECT :io_sum_local_new_tblspc_writes > :io_sum_local_after_writes;
+  ?column? 
+ ----------
+- t
++ f
+ (1 row)
+ 
+ RESET temp_buffers;
+diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out
+index 8c4da95508..346961f92a 100644
+--- a/src/test/regress/expected/stats_ext.out
++++ b/src/test/regress/expected/stats_ext.out
+@@ -70,7 +70,7 @@ DROP TABLE ext_stats_test;
+ CREATE TABLE ab1 (a INTEGER, b INTEGER, c INTEGER);
+ CREATE STATISTICS IF NOT EXISTS ab1_a_b_stats ON a, b FROM ab1;
+ COMMENT ON STATISTICS ab1_a_b_stats IS 'new comment';
+-CREATE ROLE regress_stats_ext;
++CREATE ROLE regress_stats_ext PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION regress_stats_ext;
+ COMMENT ON STATISTICS ab1_a_b_stats IS 'changed comment';
+ ERROR:  must be owner of statistics object ab1_a_b_stats
+@@ -3214,7 +3214,7 @@ set search_path to public, stts_s1;
+  stts_s1 | stts_foo               | col1, col2 FROM stts_t3                                          | defined   | defined      | defined
+ (10 rows)
+ 
+-create role regress_stats_ext nosuperuser;
++create role regress_stats_ext nosuperuser PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ set role regress_stats_ext;
+ \dX
+                                                        List of extended statistics
+@@ -3237,7 +3237,7 @@ drop schema stts_s1, stts_s2 cascade;
+ drop user regress_stats_ext;
+ reset search_path;
+ -- User with no access
+-CREATE USER regress_stats_user1;
++CREATE USER regress_stats_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT USAGE ON SCHEMA tststats TO regress_stats_user1;
+ SET SESSION AUTHORIZATION regress_stats_user1;
+ SELECT * FROM tststats.priv_test_tbl; -- Permission denied
+diff --git a/src/test/regress/expected/subscription.out b/src/test/regress/expected/subscription.out
+index 0f2a25cdc1..de168e39d9 100644
+--- a/src/test/regress/expected/subscription.out
++++ b/src/test/regress/expected/subscription.out
+@@ -1,10 +1,10 @@
+ --
+ -- SUBSCRIPTION
+ --
+-CREATE ROLE regress_subscription_user LOGIN SUPERUSER;
+-CREATE ROLE regress_subscription_user2;
+-CREATE ROLE regress_subscription_user3 IN ROLE pg_create_subscription;
+-CREATE ROLE regress_subscription_user_dummy LOGIN NOSUPERUSER;
++CREATE ROLE regress_subscription_user LOGIN SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_subscription_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_subscription_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_create_subscription;
++CREATE ROLE regress_subscription_user_dummy LOGIN NOSUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION 'regress_subscription_user';
+ -- fail - no publications
+ CREATE SUBSCRIPTION regress_testsub CONNECTION 'foo';
+diff --git a/src/test/regress/expected/test_setup.out b/src/test/regress/expected/test_setup.out
+index 3d0eeec996..2c3932139d 100644
+--- a/src/test/regress/expected/test_setup.out
++++ b/src/test/regress/expected/test_setup.out
+@@ -21,6 +21,7 @@ GRANT ALL ON SCHEMA public TO public;
+ -- Create a tablespace we can use in tests.
+ SET allow_in_place_tablespaces = true;
+ CREATE TABLESPACE regress_tblspace LOCATION '';
++ERROR:  CREATE TABLESPACE is not supported on Neon
+ --
+ -- These tables have traditionally been referenced by many tests,
+ -- so create and populate them.  Insert only non-error values here.
+@@ -111,7 +112,8 @@ CREATE TABLE onek (
+ 	string4		name
+ );
+ \set filename :abs_srcdir '/data/onek.data'
+-COPY onek FROM :'filename';
++\set command '\\copy onek FROM ' :'filename';
++:command
+ VACUUM ANALYZE onek;
+ CREATE TABLE onek2 AS SELECT * FROM onek;
+ VACUUM ANALYZE onek2;
+@@ -134,7 +136,8 @@ CREATE TABLE tenk1 (
+ 	string4		name
+ );
+ \set filename :abs_srcdir '/data/tenk.data'
+-COPY tenk1 FROM :'filename';
++\set command '\\copy tenk1 FROM ' :'filename';
++:command
+ VACUUM ANALYZE tenk1;
+ CREATE TABLE tenk2 AS SELECT * FROM tenk1;
+ VACUUM ANALYZE tenk2;
+@@ -144,20 +147,23 @@ CREATE TABLE person (
+ 	location 	point
+ );
+ \set filename :abs_srcdir '/data/person.data'
+-COPY person FROM :'filename';
++\set command '\\copy person FROM ' :'filename';
++:command
+ VACUUM ANALYZE person;
+ CREATE TABLE emp (
+ 	salary 		int4,
+ 	manager 	name
+ ) INHERITS (person);
+ \set filename :abs_srcdir '/data/emp.data'
+-COPY emp FROM :'filename';
++\set command '\\copy emp FROM ' :'filename';
++:command
+ VACUUM ANALYZE emp;
+ CREATE TABLE student (
+ 	gpa 		float8
+ ) INHERITS (person);
+ \set filename :abs_srcdir '/data/student.data'
+-COPY student FROM :'filename';
++\set command '\\copy student FROM ' :'filename';
++:command
+ VACUUM ANALYZE student;
+ CREATE TABLE stud_emp (
+ 	percent 	int4
+@@ -166,14 +172,16 @@ NOTICE:  merging multiple inherited definitions of column "name"
+ NOTICE:  merging multiple inherited definitions of column "age"
+ NOTICE:  merging multiple inherited definitions of column "location"
+ \set filename :abs_srcdir '/data/stud_emp.data'
+-COPY stud_emp FROM :'filename';
++\set command '\\copy stud_emp FROM ' :'filename';
++:command
+ VACUUM ANALYZE stud_emp;
+ CREATE TABLE road (
+ 	name		text,
+ 	thepath 	path
+ );
+ \set filename :abs_srcdir '/data/streets.data'
+-COPY road FROM :'filename';
++\set command '\\copy road FROM ' :'filename';
++:command
+ VACUUM ANALYZE road;
+ CREATE TABLE ihighway () INHERITS (road);
+ INSERT INTO ihighway
+diff --git a/src/test/regress/expected/tsearch.out b/src/test/regress/expected/tsearch.out
+index 9fad6c8b04..a1b8e82389 100644
+--- a/src/test/regress/expected/tsearch.out
++++ b/src/test/regress/expected/tsearch.out
+@@ -63,7 +63,8 @@ CREATE TABLE test_tsvector(
+ 	a tsvector
+ );
+ \set filename :abs_srcdir '/data/tsearch.data'
+-COPY test_tsvector FROM :'filename';
++\set command '\\copy test_tsvector FROM ' :'filename';
++:command
+ ANALYZE test_tsvector;
+ -- test basic text search behavior without indexes, then with
+ SELECT count(*) FROM test_tsvector WHERE a @@ 'wr|qh';
+diff --git a/src/test/regress/expected/updatable_views.out b/src/test/regress/expected/updatable_views.out
+index 442b55120c..7224709d6f 100644
+--- a/src/test/regress/expected/updatable_views.out
++++ b/src/test/regress/expected/updatable_views.out
+@@ -1338,9 +1338,9 @@ NOTICE:  drop cascades to 2 other objects
+ DETAIL:  drop cascades to view rw_view1
+ drop cascades to function rw_view1_aa(rw_view1)
+ -- permissions checks
+-CREATE USER regress_view_user1;
+-CREATE USER regress_view_user2;
+-CREATE USER regress_view_user3;
++CREATE USER regress_view_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_view_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_view_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION regress_view_user1;
+ CREATE TABLE base_tbl(a int, b text, c float);
+ INSERT INTO base_tbl VALUES (1, 'Row 1', 1.0);
+@@ -3734,8 +3734,8 @@ DETAIL:  View columns that are not columns of their base relation are not updata
+ drop view uv_iocu_view;
+ drop table uv_iocu_tab;
+ -- ON CONFLICT DO UPDATE permissions checks
+-create user regress_view_user1;
+-create user regress_view_user2;
++create user regress_view_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++create user regress_view_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ set session authorization regress_view_user1;
+ create table base_tbl(a int unique, b text, c float);
+ insert into base_tbl values (1,'xxx',1.0);
+diff --git a/src/test/regress/expected/update.out b/src/test/regress/expected/update.out
+index 1b27d132d7..25b109d609 100644
+--- a/src/test/regress/expected/update.out
++++ b/src/test/regress/expected/update.out
+@@ -608,7 +608,7 @@ DROP FUNCTION func_parted_mod_b();
+ -- RLS policies with update-row-movement
+ -----------------------------------------
+ ALTER TABLE range_parted ENABLE ROW LEVEL SECURITY;
+-CREATE USER regress_range_parted_user;
++CREATE USER regress_range_parted_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT ALL ON range_parted, mintab TO regress_range_parted_user;
+ CREATE POLICY seeall ON range_parted AS PERMISSIVE FOR SELECT USING (true);
+ CREATE POLICY policy_range_parted ON range_parted for UPDATE USING (true) WITH CHECK (c % 2 = 0);
+diff --git a/src/test/regress/expected/vacuum.out b/src/test/regress/expected/vacuum.out
+index 2eba712887..d46877aca9 100644
+--- a/src/test/regress/expected/vacuum.out
++++ b/src/test/regress/expected/vacuum.out
+@@ -433,7 +433,7 @@ CREATE TABLE vacowned (a int);
+ CREATE TABLE vacowned_parted (a int) PARTITION BY LIST (a);
+ CREATE TABLE vacowned_part1 PARTITION OF vacowned_parted FOR VALUES IN (1);
+ CREATE TABLE vacowned_part2 PARTITION OF vacowned_parted FOR VALUES IN (2);
+-CREATE ROLE regress_vacuum;
++CREATE ROLE regress_vacuum PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET ROLE regress_vacuum;
+ -- Simple table
+ VACUUM vacowned;
+diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule
+index f53a526f7c..c07b093476 100644
+--- a/src/test/regress/parallel_schedule
++++ b/src/test/regress/parallel_schedule
+@@ -135,4 +135,4 @@ test: fast_default
+ 
+ # run tablespace test at the end because it drops the tablespace created during
+ # setup that other tests may use.
+-test: tablespace
++#test: tablespace
+diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql
+index 1a18ca3d8f..b2009628d0 100644
+--- a/src/test/regress/sql/aggregates.sql
++++ b/src/test/regress/sql/aggregates.sql
+@@ -15,7 +15,8 @@ CREATE TABLE aggtest (
+ );
+ 
+ \set filename :abs_srcdir '/data/agg.data'
+-COPY aggtest FROM :'filename';
++\set command '\\copy aggtest FROM ' :'filename';
++:command
+ 
+ ANALYZE aggtest;
+ 
+diff --git a/src/test/regress/sql/alter_generic.sql b/src/test/regress/sql/alter_generic.sql
+index de58d268d3..9d38df7f42 100644
+--- a/src/test/regress/sql/alter_generic.sql
++++ b/src/test/regress/sql/alter_generic.sql
+@@ -22,9 +22,9 @@ DROP ROLE IF EXISTS regress_alter_generic_user3;
+ 
+ RESET client_min_messages;
+ 
+-CREATE USER regress_alter_generic_user3;
+-CREATE USER regress_alter_generic_user2;
+-CREATE USER regress_alter_generic_user1 IN ROLE regress_alter_generic_user3;
++CREATE USER regress_alter_generic_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_alter_generic_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_alter_generic_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE regress_alter_generic_user3;
+ 
+ CREATE SCHEMA alt_nsp1;
+ CREATE SCHEMA alt_nsp2;
+@@ -316,7 +316,7 @@ DROP OPERATOR FAMILY alt_opf4 USING btree;
+ 
+ -- Should fail. Need to be SUPERUSER to do ALTER OPERATOR FAMILY .. ADD / DROP
+ BEGIN TRANSACTION;
+-CREATE ROLE regress_alter_generic_user5 NOSUPERUSER;
++CREATE ROLE regress_alter_generic_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER NOSUPERUSER;
+ CREATE OPERATOR FAMILY alt_opf5 USING btree;
+ SET ROLE regress_alter_generic_user5;
+ ALTER OPERATOR FAMILY alt_opf5 USING btree ADD OPERATOR 1 < (int4, int2), FUNCTION 1 btint42cmp(int4, int2);
+@@ -326,7 +326,7 @@ ROLLBACK;
+ 
+ -- Should fail. Need rights to namespace for ALTER OPERATOR FAMILY .. ADD / DROP
+ BEGIN TRANSACTION;
+-CREATE ROLE regress_alter_generic_user6;
++CREATE ROLE regress_alter_generic_user6 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE SCHEMA alt_nsp6;
+ REVOKE ALL ON SCHEMA alt_nsp6 FROM regress_alter_generic_user6;
+ CREATE OPERATOR FAMILY alt_nsp6.alt_opf6 USING btree;
+diff --git a/src/test/regress/sql/alter_operator.sql b/src/test/regress/sql/alter_operator.sql
+index 8faecf7830..bb8b8e14ea 100644
+--- a/src/test/regress/sql/alter_operator.sql
++++ b/src/test/regress/sql/alter_operator.sql
+@@ -83,7 +83,7 @@ ALTER OPERATOR & (bit, bit) SET ("Restrict" = _int_contsel, "Join" = _int_contjo
+ --
+ -- Test permission check. Must be owner to ALTER OPERATOR.
+ --
+-CREATE USER regress_alter_op_user;
++CREATE USER regress_alter_op_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION regress_alter_op_user;
+ 
+ ALTER OPERATOR === (boolean, boolean) SET (RESTRICT = NONE);
+diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql
+index da12724473..86f5ae5444 100644
+--- a/src/test/regress/sql/alter_table.sql
++++ b/src/test/regress/sql/alter_table.sql
+@@ -7,7 +7,7 @@ SET client_min_messages TO 'warning';
+ DROP ROLE IF EXISTS regress_alter_table_user1;
+ RESET client_min_messages;
+ 
+-CREATE USER regress_alter_table_user1;
++CREATE USER regress_alter_table_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ --
+ -- add attribute
+@@ -2404,8 +2404,8 @@ DROP TABLE fail_part;
+ ALTER TABLE list_parted ATTACH PARTITION nonexistent FOR VALUES IN (1);
+ 
+ -- check ownership of the source table
+-CREATE ROLE regress_test_me;
+-CREATE ROLE regress_test_not_me;
++CREATE ROLE regress_test_me PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_test_not_me PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE not_owned_by_me (LIKE list_parted);
+ ALTER TABLE not_owned_by_me OWNER TO regress_test_not_me;
+ SET SESSION AUTHORIZATION regress_test_me;
+diff --git a/src/test/regress/sql/arrays.sql b/src/test/regress/sql/arrays.sql
+index 47058dfde5..f8962592e4 100644
+--- a/src/test/regress/sql/arrays.sql
++++ b/src/test/regress/sql/arrays.sql
+@@ -22,7 +22,8 @@ CREATE TABLE array_op_test (
+ );
+ 
+ \set filename :abs_srcdir '/data/array.data'
+-COPY array_op_test FROM :'filename';
++\set command '\\copy array_op_test FROM ' :'filename';
++:command
+ ANALYZE array_op_test;
+ 
+ --
+diff --git a/src/test/regress/sql/btree_index.sql b/src/test/regress/sql/btree_index.sql
+index 0d2a33f370..df86e6b050 100644
+--- a/src/test/regress/sql/btree_index.sql
++++ b/src/test/regress/sql/btree_index.sql
+@@ -26,16 +26,20 @@ CREATE TABLE bt_f8_heap (
+ );
+ 
+ \set filename :abs_srcdir '/data/desc.data'
+-COPY bt_i4_heap FROM :'filename';
++\set command '\\copy bt_i4_heap FROM ' :'filename';
++:command
+ 
+ \set filename :abs_srcdir '/data/hash.data'
+-COPY bt_name_heap FROM :'filename';
++\set command '\\copy bt_name_heap FROM ' :'filename';
++:command
+ 
+ \set filename :abs_srcdir '/data/desc.data'
+-COPY bt_txt_heap FROM :'filename';
++\set command '\\copy bt_txt_heap FROM ' :'filename';
++:command
+ 
+ \set filename :abs_srcdir '/data/hash.data'
+-COPY bt_f8_heap FROM :'filename';
++\set command '\\copy bt_f8_heap FROM ' :'filename';
++:command
+ 
+ ANALYZE bt_i4_heap;
+ ANALYZE bt_name_heap;
+diff --git a/src/test/regress/sql/cluster.sql b/src/test/regress/sql/cluster.sql
+index b7115f8610..a753f2c794 100644
+--- a/src/test/regress/sql/cluster.sql
++++ b/src/test/regress/sql/cluster.sql
+@@ -108,7 +108,7 @@ WHERE pg_class.oid=indexrelid
+ CLUSTER pg_toast.pg_toast_826 USING pg_toast_826_index;
+ 
+ -- Verify that clustering all tables does in fact cluster the right ones
+-CREATE USER regress_clstr_user;
++CREATE USER regress_clstr_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE clstr_1 (a INT PRIMARY KEY);
+ CREATE TABLE clstr_2 (a INT PRIMARY KEY);
+ CREATE TABLE clstr_3 (a INT PRIMARY KEY);
+@@ -235,7 +235,7 @@ DROP TABLE clstrpart;
+ CREATE TABLE ptnowner(i int unique) PARTITION BY LIST (i);
+ CREATE INDEX ptnowner_i_idx ON ptnowner(i);
+ CREATE TABLE ptnowner1 PARTITION OF ptnowner FOR VALUES IN (1);
+-CREATE ROLE regress_ptnowner;
++CREATE ROLE regress_ptnowner PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE ptnowner2 PARTITION OF ptnowner FOR VALUES IN (2);
+ ALTER TABLE ptnowner1 OWNER TO regress_ptnowner;
+ SET SESSION AUTHORIZATION regress_ptnowner;
+diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql
+index 4eb1adf028..28636ec711 100644
+--- a/src/test/regress/sql/collate.icu.utf8.sql
++++ b/src/test/regress/sql/collate.icu.utf8.sql
+@@ -353,7 +353,7 @@ reset enable_seqscan;
+ 
+ -- schema manipulation commands
+ 
+-CREATE ROLE regress_test_role;
++CREATE ROLE regress_test_role PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE SCHEMA test_schema;
+ 
+ -- We need to do this this way to cope with varying names for encodings:
+diff --git a/src/test/regress/sql/constraints.sql b/src/test/regress/sql/constraints.sql
+index e3e3bea709..fa86ddc326 100644
+--- a/src/test/regress/sql/constraints.sql
++++ b/src/test/regress/sql/constraints.sql
+@@ -243,12 +243,14 @@ CREATE TABLE COPY_TBL (x INT, y TEXT, z INT,
+ 	CHECK (x > 3 AND y <> 'check failed' AND x < 7 ));
+ 
+ \set filename :abs_srcdir '/data/constro.data'
+-COPY COPY_TBL FROM :'filename';
++\set command '\\copy COPY_TBL FROM ' :'filename';
++:command
+ 
+ SELECT * FROM COPY_TBL;
+ 
+ \set filename :abs_srcdir '/data/constrf.data'
+-COPY COPY_TBL FROM :'filename';
++\set command '\\copy COPY_TBL FROM ' :'filename';
++:command
+ 
+ SELECT * FROM COPY_TBL;
+ 
+@@ -599,7 +601,7 @@ DROP TABLE deferred_excl;
+ 
+ -- Comments
+ -- Setup a low-level role to enforce non-superuser checks.
+-CREATE ROLE regress_constraint_comments;
++CREATE ROLE regress_constraint_comments PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION regress_constraint_comments;
+ 
+ CREATE TABLE constraint_comments_tbl (a int CONSTRAINT the_constraint CHECK (a > 0));
+@@ -621,7 +623,7 @@ COMMENT ON CONSTRAINT the_constraint ON DOMAIN constraint_comments_dom IS NULL;
+ 
+ -- unauthorized user
+ RESET SESSION AUTHORIZATION;
+-CREATE ROLE regress_constraint_comments_noaccess;
++CREATE ROLE regress_constraint_comments_noaccess PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION regress_constraint_comments_noaccess;
+ COMMENT ON CONSTRAINT the_constraint ON constraint_comments_tbl IS 'no, the comment';
+ COMMENT ON CONSTRAINT the_constraint ON DOMAIN constraint_comments_dom IS 'no, another comment';
+diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql
+index 9a65fca91f..58431a3056 100644
+--- a/src/test/regress/sql/conversion.sql
++++ b/src/test/regress/sql/conversion.sql
+@@ -12,7 +12,7 @@ CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, r
+     AS :'regresslib', 'test_enc_conversion'
+     LANGUAGE C STRICT;
+ 
+-CREATE USER regress_conversion_user WITH NOCREATEDB NOCREATEROLE;
++CREATE USER regress_conversion_user WITH NOCREATEDB NOCREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION regress_conversion_user;
+ CREATE CONVERSION myconv FOR 'LATIN1' TO 'UTF8' FROM iso8859_1_to_utf8;
+ --
+diff --git a/src/test/regress/sql/copy.sql b/src/test/regress/sql/copy.sql
+index e2dd24cb35..4a186750f8 100644
+--- a/src/test/regress/sql/copy.sql
++++ b/src/test/regress/sql/copy.sql
+@@ -20,11 +20,13 @@ insert into copytest values('Mac',E'abc\rdef',3);
+ insert into copytest values(E'esc\\ape',E'a\\r\\\r\\\n\\nb',4);
+ 
+ \set filename :abs_builddir '/results/copytest.csv'
+-copy copytest to :'filename' csv;
++\set command '\\copy copytest to ' :'filename' csv;
++:command
+ 
+ create temp table copytest2 (like copytest);
+ 
+-copy copytest2 from :'filename' csv;
++\set command '\\copy copytest2 from ' :'filename' csv;
++:command
+ 
+ select * from copytest except select * from copytest2;
+ 
+@@ -32,9 +34,11 @@ truncate copytest2;
+ 
+ --- same test but with an escape char different from quote char
+ 
+-copy copytest to :'filename' csv quote '''' escape E'\\';
++\set command '\\copy copytest to ' :'filename' ' csv quote ' '\'\'\'\'' ' escape ' 'E\'' '\\\\\'';
++:command
+ 
+-copy copytest2 from :'filename' csv quote '''' escape E'\\';
++\set command '\\copy copytest2 from ' :'filename' ' csv quote ' '\'\'\'\'' ' escape ' 'E\'' '\\\\\'';
++:command
+ 
+ select * from copytest except select * from copytest2;
+ 
+@@ -86,16 +90,19 @@ insert into parted_copytest select x,2,'Two' from generate_series(1001,1010) x;
+ insert into parted_copytest select x,1,'One' from generate_series(1011,1020) x;
+ 
+ \set filename :abs_builddir '/results/parted_copytest.csv'
+-copy (select * from parted_copytest order by a) to :'filename';
++\set command '\\copy (select * from parted_copytest order by a) to ' :'filename';
++:command
+ 
+ truncate parted_copytest;
+ 
+-copy parted_copytest from :'filename';
++\set command '\\copy parted_copytest from ' :'filename';
++:command
+ 
+ -- Ensure COPY FREEZE errors for partitioned tables.
+ begin;
+ truncate parted_copytest;
+-copy parted_copytest from :'filename' (freeze);
++\set command '\\copy parted_copytest from ' :'filename' (freeze);
++:command
+ rollback;
+ 
+ select tableoid::regclass,count(*),sum(a) from parted_copytest
+@@ -115,7 +122,8 @@ create trigger part_ins_trig
+ 	for each row
+ 	execute procedure part_ins_func();
+ 
+-copy parted_copytest from :'filename';
++\set command '\\copy parted_copytest from ' :'filename';
++:command
+ 
+ select tableoid::regclass,count(*),sum(a) from parted_copytest
+ group by tableoid order by tableoid::regclass::name;
+@@ -124,7 +132,8 @@ truncate table parted_copytest;
+ create index on parted_copytest (b);
+ drop trigger part_ins_trig on parted_copytest_a2;
+ 
+-copy parted_copytest from stdin;
++\set command '\\copy parted_copytest from ' stdin;
++:command
+ 1	1	str1
+ 2	2	str2
+ \.
+@@ -191,8 +200,8 @@ bill	20	(11,10)	1000	sharon
+ -- Generate COPY FROM report with FILE, with some excluded tuples.
+ truncate tab_progress_reporting;
+ \set filename :abs_srcdir '/data/emp.data'
+-copy tab_progress_reporting from :'filename'
+-	where (salary < 2000);
++\set command '\\copy tab_progress_reporting from ' :'filename' 'where (salary < 2000)';
++:command
+ 
+ drop trigger check_after_tab_progress_reporting on tab_progress_reporting;
+ drop function notice_after_tab_progress_reporting();
+@@ -311,7 +320,8 @@ CREATE TABLE parted_si_p_odd PARTITION OF parted_si FOR VALUES IN (1);
+ -- https://postgr.es/m/18130-7a86a7356a75209d%40postgresql.org
+ -- https://postgr.es/m/257696.1695670946%40sss.pgh.pa.us
+ \set filename :abs_srcdir '/data/desc.data'
+-COPY parted_si(id, data) FROM :'filename';
++\set command '\\COPY parted_si(id, data) FROM ' :'filename';
++:command
+ 
+ -- An earlier bug (see commit b1ecb9b3fcf) could end up using a buffer from
+ -- the wrong partition. This test is *not* guaranteed to trigger that bug, but
+diff --git a/src/test/regress/sql/copy2.sql b/src/test/regress/sql/copy2.sql
+index 6b75b6c7ea..f3655b413c 100644
+--- a/src/test/regress/sql/copy2.sql
++++ b/src/test/regress/sql/copy2.sql
+@@ -407,8 +407,8 @@ copy check_con_tbl from stdin;
+ select * from check_con_tbl;
+ 
+ -- test with RLS enabled.
+-CREATE ROLE regress_rls_copy_user;
+-CREATE ROLE regress_rls_copy_user_colperms;
++CREATE ROLE regress_rls_copy_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_rls_copy_user_colperms PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE rls_t1 (a int, b int, c int);
+ 
+ COPY rls_t1 (a, b, c) from stdin;
+diff --git a/src/test/regress/sql/create_function_sql.sql b/src/test/regress/sql/create_function_sql.sql
+index 89e9af3a49..2b86fe2285 100644
+--- a/src/test/regress/sql/create_function_sql.sql
++++ b/src/test/regress/sql/create_function_sql.sql
+@@ -6,7 +6,7 @@
+ 
+ -- All objects made in this test are in temp_func_test schema
+ 
+-CREATE USER regress_unpriv_user;
++CREATE USER regress_unpriv_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ CREATE SCHEMA temp_func_test;
+ GRANT ALL ON SCHEMA temp_func_test TO public;
+diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql
+index e296891cab..70cea565e4 100644
+--- a/src/test/regress/sql/create_index.sql
++++ b/src/test/regress/sql/create_index.sql
+@@ -71,7 +71,8 @@ CREATE TABLE fast_emp4000 (
+ );
+ 
+ \set filename :abs_srcdir '/data/rect.data'
+-COPY slow_emp4000 FROM :'filename';
++\set command '\\copy slow_emp4000 FROM ' :'filename';
++:command
+ 
+ INSERT INTO fast_emp4000 SELECT * FROM slow_emp4000;
+ 
+@@ -269,7 +270,8 @@ CREATE TABLE array_index_op_test (
+ );
+ 
+ \set filename :abs_srcdir '/data/array.data'
+-COPY array_index_op_test FROM :'filename';
++\set command '\\copy array_index_op_test FROM ' :'filename';
++:command
+ ANALYZE array_index_op_test;
+ 
+ SELECT * FROM array_index_op_test WHERE i = '{NULL}' ORDER BY seqno;
+@@ -1298,7 +1300,7 @@ END;
+ REINDEX SCHEMA CONCURRENTLY schema_to_reindex;
+ 
+ -- Failure for unauthorized user
+-CREATE ROLE regress_reindexuser NOLOGIN;
++CREATE ROLE regress_reindexuser NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION ROLE regress_reindexuser;
+ REINDEX SCHEMA schema_to_reindex;
+ -- Permission failures with toast tables and indexes (pg_authid here)
+diff --git a/src/test/regress/sql/create_procedure.sql b/src/test/regress/sql/create_procedure.sql
+index 069a3727ce..faeeb3f744 100644
+--- a/src/test/regress/sql/create_procedure.sql
++++ b/src/test/regress/sql/create_procedure.sql
+@@ -255,7 +255,7 @@ DROP PROCEDURE nonexistent();
+ 
+ -- privileges
+ 
+-CREATE USER regress_cp_user1;
++CREATE USER regress_cp_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT INSERT ON cp_test TO regress_cp_user1;
+ REVOKE EXECUTE ON PROCEDURE ptest1(text) FROM PUBLIC;
+ SET ROLE regress_cp_user1;
+diff --git a/src/test/regress/sql/create_role.sql b/src/test/regress/sql/create_role.sql
+index 4491a28a8a..3045434865 100644
+--- a/src/test/regress/sql/create_role.sql
++++ b/src/test/regress/sql/create_role.sql
+@@ -1,20 +1,20 @@
+ -- ok, superuser can create users with any set of privileges
+-CREATE ROLE regress_role_super SUPERUSER;
+-CREATE ROLE regress_role_admin CREATEDB CREATEROLE REPLICATION BYPASSRLS;
++CREATE ROLE regress_role_super SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_role_admin CREATEDB CREATEROLE REPLICATION BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT CREATE ON DATABASE regression TO regress_role_admin WITH GRANT OPTION;
+-CREATE ROLE regress_role_limited_admin CREATEROLE;
+-CREATE ROLE regress_role_normal;
++CREATE ROLE regress_role_limited_admin CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_role_normal PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- fail, CREATEROLE user can't give away role attributes without having them
+ SET SESSION AUTHORIZATION regress_role_limited_admin;
+-CREATE ROLE regress_nosuch_superuser SUPERUSER;
+-CREATE ROLE regress_nosuch_replication_bypassrls REPLICATION BYPASSRLS;
+-CREATE ROLE regress_nosuch_replication REPLICATION;
+-CREATE ROLE regress_nosuch_bypassrls BYPASSRLS;
+-CREATE ROLE regress_nosuch_createdb CREATEDB;
++CREATE ROLE regress_nosuch_superuser SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_nosuch_replication_bypassrls REPLICATION BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_nosuch_replication REPLICATION PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_nosuch_bypassrls BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_nosuch_createdb CREATEDB PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- ok, can create a role without any special attributes
+-CREATE ROLE regress_role_limited;
++CREATE ROLE regress_role_limited PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- fail, can't give it in any of the restricted attributes
+ ALTER ROLE regress_role_limited SUPERUSER;
+@@ -25,10 +25,10 @@ DROP ROLE regress_role_limited;
+ 
+ -- ok, can give away these role attributes if you have them
+ SET SESSION AUTHORIZATION regress_role_admin;
+-CREATE ROLE regress_replication_bypassrls REPLICATION BYPASSRLS;
+-CREATE ROLE regress_replication REPLICATION;
+-CREATE ROLE regress_bypassrls BYPASSRLS;
+-CREATE ROLE regress_createdb CREATEDB;
++CREATE ROLE regress_replication_bypassrls REPLICATION BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_replication REPLICATION PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_bypassrls BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_createdb CREATEDB PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- ok, can toggle these role attributes off and on if you have them
+ ALTER ROLE regress_replication NOREPLICATION;
+@@ -43,52 +43,52 @@ ALTER ROLE regress_createdb SUPERUSER;
+ ALTER ROLE regress_createdb NOSUPERUSER;
+ 
+ -- ok, having CREATEROLE is enough to create users with these privileges
+-CREATE ROLE regress_createrole CREATEROLE NOINHERIT;
++CREATE ROLE regress_createrole CREATEROLE NOINHERIT PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT CREATE ON DATABASE regression TO regress_createrole WITH GRANT OPTION;
+-CREATE ROLE regress_login LOGIN;
+-CREATE ROLE regress_inherit INHERIT;
+-CREATE ROLE regress_connection_limit CONNECTION LIMIT 5;
+-CREATE ROLE regress_encrypted_password ENCRYPTED PASSWORD 'foo';
+-CREATE ROLE regress_password_null PASSWORD NULL;
++CREATE ROLE regress_login LOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_inherit INHERIT PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_connection_limit CONNECTION LIMIT 5 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_encrypted_password ENCRYPTED PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_password_null PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- ok, backwards compatible noise words should be ignored
+-CREATE ROLE regress_noiseword SYSID 12345;
++CREATE ROLE regress_noiseword SYSID 12345 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- fail, cannot grant membership in superuser role
+-CREATE ROLE regress_nosuch_super IN ROLE regress_role_super;
++CREATE ROLE regress_nosuch_super IN ROLE regress_role_super PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- fail, database owner cannot have members
+-CREATE ROLE regress_nosuch_dbowner IN ROLE pg_database_owner;
++CREATE ROLE regress_nosuch_dbowner IN ROLE pg_database_owner PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- ok, can grant other users into a role
+ CREATE ROLE regress_inroles ROLE
+ 	regress_role_super, regress_createdb, regress_createrole, regress_login,
+-	regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null;
++	regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- fail, cannot grant a role into itself
+-CREATE ROLE regress_nosuch_recursive ROLE regress_nosuch_recursive;
++CREATE ROLE regress_nosuch_recursive ROLE regress_nosuch_recursive PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- ok, can grant other users into a role with admin option
+ CREATE ROLE regress_adminroles ADMIN
+ 	regress_role_super, regress_createdb, regress_createrole, regress_login,
+-	regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null;
++	regress_inherit, regress_connection_limit, regress_encrypted_password, regress_password_null PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- fail, cannot grant a role into itself with admin option
+-CREATE ROLE regress_nosuch_admin_recursive ADMIN regress_nosuch_admin_recursive;
++CREATE ROLE regress_nosuch_admin_recursive ADMIN regress_nosuch_admin_recursive PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- fail, regress_createrole does not have CREATEDB privilege
+ SET SESSION AUTHORIZATION regress_createrole;
+ CREATE DATABASE regress_nosuch_db;
+ 
+ -- ok, regress_createrole can create new roles
+-CREATE ROLE regress_plainrole;
++CREATE ROLE regress_plainrole PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- ok, roles with CREATEROLE can create new roles with it
+-CREATE ROLE regress_rolecreator CREATEROLE;
++CREATE ROLE regress_rolecreator CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- ok, roles with CREATEROLE can create new roles with different role
+ -- attributes, including CREATEROLE
+-CREATE ROLE regress_hasprivs CREATEROLE LOGIN INHERIT CONNECTION LIMIT 5;
++CREATE ROLE regress_hasprivs CREATEROLE LOGIN INHERIT CONNECTION LIMIT 5 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- ok, we should be able to modify a role we created
+ COMMENT ON ROLE regress_hasprivs IS 'some comment';
+@@ -123,7 +123,7 @@ REASSIGN OWNED BY regress_tenant TO regress_createrole;
+ 
+ -- ok, create a role with a value for createrole_self_grant
+ SET createrole_self_grant = 'set, inherit';
+-CREATE ROLE regress_tenant2;
++CREATE ROLE regress_tenant2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT CREATE ON DATABASE regression TO regress_tenant2;
+ 
+ -- ok, regress_tenant2 can create objects within the database
+@@ -150,16 +150,16 @@ ALTER TABLE tenant2_table OWNER TO regress_tenant2;
+ DROP TABLE tenant2_table;
+ 
+ -- fail, CREATEROLE is not enough to create roles in privileged roles
+-CREATE ROLE regress_read_all_data IN ROLE pg_read_all_data;
+-CREATE ROLE regress_write_all_data IN ROLE pg_write_all_data;
+-CREATE ROLE regress_monitor IN ROLE pg_monitor;
+-CREATE ROLE regress_read_all_settings IN ROLE pg_read_all_settings;
+-CREATE ROLE regress_read_all_stats IN ROLE pg_read_all_stats;
+-CREATE ROLE regress_stat_scan_tables IN ROLE pg_stat_scan_tables;
+-CREATE ROLE regress_read_server_files IN ROLE pg_read_server_files;
+-CREATE ROLE regress_write_server_files IN ROLE pg_write_server_files;
+-CREATE ROLE regress_execute_server_program IN ROLE pg_execute_server_program;
+-CREATE ROLE regress_signal_backend IN ROLE pg_signal_backend;
++CREATE ROLE regress_read_all_data PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_all_data;
++CREATE ROLE regress_write_all_data PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_write_all_data;
++CREATE ROLE regress_monitor PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_monitor;
++CREATE ROLE regress_read_all_settings PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_all_settings;
++CREATE ROLE regress_read_all_stats PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_all_stats;
++CREATE ROLE regress_stat_scan_tables PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_stat_scan_tables;
++CREATE ROLE regress_read_server_files PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_read_server_files;
++CREATE ROLE regress_write_server_files PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_write_server_files;
++CREATE ROLE regress_execute_server_program PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_execute_server_program;
++CREATE ROLE regress_signal_backend PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_signal_backend;
+ 
+ -- fail, role still owns database objects
+ DROP ROLE regress_tenant;
+diff --git a/src/test/regress/sql/create_schema.sql b/src/test/regress/sql/create_schema.sql
+index 1b7064247a..be5b662ce1 100644
+--- a/src/test/regress/sql/create_schema.sql
++++ b/src/test/regress/sql/create_schema.sql
+@@ -4,7 +4,7 @@
+ 
+ -- Schema creation with elements.
+ 
+-CREATE ROLE regress_create_schema_role SUPERUSER;
++CREATE ROLE regress_create_schema_role SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- Cases where schema creation fails as objects are qualified with a schema
+ -- that does not match with what's expected.
+diff --git a/src/test/regress/sql/create_view.sql b/src/test/regress/sql/create_view.sql
+index ae6841308b..47bc792e30 100644
+--- a/src/test/regress/sql/create_view.sql
++++ b/src/test/regress/sql/create_view.sql
+@@ -23,7 +23,8 @@ CREATE TABLE real_city (
+ );
+ 
+ \set filename :abs_srcdir '/data/real_city.data'
+-COPY real_city FROM :'filename';
++\set command '\\copy real_city FROM ' :'filename';
++:command
+ ANALYZE real_city;
+ 
+ SELECT *
+diff --git a/src/test/regress/sql/database.sql b/src/test/regress/sql/database.sql
+index 0367c0e37a..a23b98c4bd 100644
+--- a/src/test/regress/sql/database.sql
++++ b/src/test/regress/sql/database.sql
+@@ -1,8 +1,6 @@
+ CREATE DATABASE regression_tbd
+ 	ENCODING utf8 LC_COLLATE "C" LC_CTYPE "C" TEMPLATE template0;
+ ALTER DATABASE regression_tbd RENAME TO regression_utf8;
+-ALTER DATABASE regression_utf8 SET TABLESPACE regress_tblspace;
+-ALTER DATABASE regression_utf8 RESET TABLESPACE;
+ ALTER DATABASE regression_utf8 CONNECTION_LIMIT 123;
+ 
+ -- Test PgDatabaseToastTable.  Doing this with GRANT would be slow.
+diff --git a/src/test/regress/sql/dependency.sql b/src/test/regress/sql/dependency.sql
+index 8d74ed7122..293194615e 100644
+--- a/src/test/regress/sql/dependency.sql
++++ b/src/test/regress/sql/dependency.sql
+@@ -2,10 +2,10 @@
+ -- DEPENDENCIES
+ --
+ 
+-CREATE USER regress_dep_user;
+-CREATE USER regress_dep_user2;
+-CREATE USER regress_dep_user3;
+-CREATE GROUP regress_dep_group;
++CREATE USER regress_dep_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_dep_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_dep_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE GROUP regress_dep_group PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ CREATE TABLE deptest (f1 serial primary key, f2 text);
+ 
+@@ -45,9 +45,9 @@ DROP TABLE deptest;
+ DROP USER regress_dep_user3;
+ 
+ -- Test DROP OWNED
+-CREATE USER regress_dep_user0;
+-CREATE USER regress_dep_user1;
+-CREATE USER regress_dep_user2;
++CREATE USER regress_dep_user0 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_dep_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_dep_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION regress_dep_user0;
+ -- permission denied
+ DROP OWNED BY regress_dep_user1;
+diff --git a/src/test/regress/sql/drop_if_exists.sql b/src/test/regress/sql/drop_if_exists.sql
+index ac6168b91f..4270062ec7 100644
+--- a/src/test/regress/sql/drop_if_exists.sql
++++ b/src/test/regress/sql/drop_if_exists.sql
+@@ -86,9 +86,9 @@ DROP DOMAIN test_domain_exists;
+ --- role/user/group
+ ---
+ 
+-CREATE USER regress_test_u1;
+-CREATE ROLE regress_test_r1;
+-CREATE GROUP regress_test_g1;
++CREATE USER regress_test_u1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_test_r1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE GROUP regress_test_g1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ DROP USER regress_test_u2;
+ 
+diff --git a/src/test/regress/sql/equivclass.sql b/src/test/regress/sql/equivclass.sql
+index 247b0a3105..bf018fd3a1 100644
+--- a/src/test/regress/sql/equivclass.sql
++++ b/src/test/regress/sql/equivclass.sql
+@@ -230,7 +230,7 @@ set enable_mergejoin = off;
+ alter table ec1 enable row level security;
+ create policy p1 on ec1 using (f1 < '5'::int8alias1);
+ 
+-create user regress_user_ectest;
++create user regress_user_ectest PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ grant select on ec0 to regress_user_ectest;
+ grant select on ec1 to regress_user_ectest;
+ 
+diff --git a/src/test/regress/sql/event_trigger.sql b/src/test/regress/sql/event_trigger.sql
+index 013546b830..616a46da1d 100644
+--- a/src/test/regress/sql/event_trigger.sql
++++ b/src/test/regress/sql/event_trigger.sql
+@@ -86,7 +86,7 @@ create event trigger regress_event_trigger2 on ddl_command_start
+ comment on event trigger regress_event_trigger is 'test comment';
+ 
+ -- drop as non-superuser should fail
+-create role regress_evt_user;
++create role regress_evt_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ set role regress_evt_user;
+ create event trigger regress_event_trigger_noperms on ddl_command_start
+    execute procedure test_event_trigger();
+diff --git a/src/test/regress/sql/foreign_data.sql b/src/test/regress/sql/foreign_data.sql
+index aa147b14a9..370e0dd570 100644
+--- a/src/test/regress/sql/foreign_data.sql
++++ b/src/test/regress/sql/foreign_data.sql
+@@ -22,14 +22,14 @@ DROP ROLE IF EXISTS regress_foreign_data_user, regress_test_role, regress_test_r
+ 
+ RESET client_min_messages;
+ 
+-CREATE ROLE regress_foreign_data_user LOGIN SUPERUSER;
++CREATE ROLE regress_foreign_data_user LOGIN SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION 'regress_foreign_data_user';
+ 
+-CREATE ROLE regress_test_role;
+-CREATE ROLE regress_test_role2;
+-CREATE ROLE regress_test_role_super SUPERUSER;
+-CREATE ROLE regress_test_indirect;
+-CREATE ROLE regress_unprivileged_role;
++CREATE ROLE regress_test_role PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_test_role2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_test_role_super SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_test_indirect PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_unprivileged_role PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ CREATE FOREIGN DATA WRAPPER dummy;
+ COMMENT ON FOREIGN DATA WRAPPER dummy IS 'useless';
+diff --git a/src/test/regress/sql/foreign_key.sql b/src/test/regress/sql/foreign_key.sql
+index 2e710e419c..89cd481a54 100644
+--- a/src/test/regress/sql/foreign_key.sql
++++ b/src/test/regress/sql/foreign_key.sql
+@@ -1435,7 +1435,7 @@ ALTER TABLE fk_partitioned_fk_6 ATTACH PARTITION fk_partitioned_pk_6 FOR VALUES
+ DROP TABLE fk_partitioned_pk_6, fk_partitioned_fk_6;
+ 
+ -- test the case when the referenced table is owned by a different user
+-create role regress_other_partitioned_fk_owner;
++create role regress_other_partitioned_fk_owner PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ grant references on fk_notpartitioned_pk to regress_other_partitioned_fk_owner;
+ set role regress_other_partitioned_fk_owner;
+ create table other_partitioned_fk(a int, b int) partition by list (a);
+diff --git a/src/test/regress/sql/generated.sql b/src/test/regress/sql/generated.sql
+index cb55d77821..9c15ae954c 100644
+--- a/src/test/regress/sql/generated.sql
++++ b/src/test/regress/sql/generated.sql
+@@ -263,7 +263,7 @@ ALTER TABLE gtest10a DROP COLUMN b;
+ INSERT INTO gtest10a (a) VALUES (1);
+ 
+ -- privileges
+-CREATE USER regress_user11;
++CREATE USER regress_user11 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ CREATE TABLE gtest11s (a int PRIMARY KEY, b int, c int GENERATED ALWAYS AS (b * 2) STORED);
+ INSERT INTO gtest11s VALUES (1, 10), (2, 20);
+diff --git a/src/test/regress/sql/guc.sql b/src/test/regress/sql/guc.sql
+index dc79761955..a9ead75349 100644
+--- a/src/test/regress/sql/guc.sql
++++ b/src/test/regress/sql/guc.sql
+@@ -188,7 +188,7 @@ PREPARE foo AS SELECT 1;
+ LISTEN foo_event;
+ SET vacuum_cost_delay = 13;
+ CREATE TEMP TABLE tmp_foo (data text) ON COMMIT DELETE ROWS;
+-CREATE ROLE regress_guc_user;
++CREATE ROLE regress_guc_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION regress_guc_user;
+ -- look changes
+ SELECT pg_listening_channels();
+diff --git a/src/test/regress/sql/hash_index.sql b/src/test/regress/sql/hash_index.sql
+index 219da82981..bf99d2ec4c 100644
+--- a/src/test/regress/sql/hash_index.sql
++++ b/src/test/regress/sql/hash_index.sql
+@@ -26,10 +26,14 @@ CREATE TABLE hash_f8_heap (
+ );
+ 
+ \set filename :abs_srcdir '/data/hash.data'
+-COPY hash_i4_heap FROM :'filename';
+-COPY hash_name_heap FROM :'filename';
+-COPY hash_txt_heap FROM :'filename';
+-COPY hash_f8_heap FROM :'filename';
++\set command '\\copy hash_i4_heap FROM ' :'filename';
++:command
++\set command '\\copy hash_name_heap FROM ' :'filename';
++:command
++\set command '\\copy hash_txt_heap FROM ' :'filename';
++:command
++\set command '\\copy hash_f8_heap FROM ' :'filename';
++:command
+ 
+ -- the data in this file has a lot of duplicates in the index key
+ -- fields, leading to long bucket chains and lots of table expansion.
+diff --git a/src/test/regress/sql/identity.sql b/src/test/regress/sql/identity.sql
+index cb0e05a2f1..b11492bd31 100644
+--- a/src/test/regress/sql/identity.sql
++++ b/src/test/regress/sql/identity.sql
+@@ -287,7 +287,7 @@ ALTER TABLE itest7 ALTER COLUMN a RESTART;
+ ALTER TABLE itest7 ALTER COLUMN a DROP IDENTITY;
+ 
+ -- privileges
+-CREATE USER regress_identity_user1;
++CREATE USER regress_identity_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE itest8 (a int GENERATED ALWAYS AS IDENTITY, b text);
+ GRANT SELECT, INSERT ON itest8 TO regress_identity_user1;
+ SET ROLE regress_identity_user1;
+diff --git a/src/test/regress/sql/inherit.sql b/src/test/regress/sql/inherit.sql
+index 51251b0e51..3492f1cfef 100644
+--- a/src/test/regress/sql/inherit.sql
++++ b/src/test/regress/sql/inherit.sql
+@@ -770,8 +770,8 @@ drop table cnullparent cascade;
+ --
+ -- Mixed ownership inheritance tree
+ --
+-create role regress_alice;
+-create role regress_bob;
++create role regress_alice password NEON_PASSWORD_PLACEHOLDER;
++create role regress_bob password NEON_PASSWORD_PLACEHOLDER;
+ grant all on schema public to regress_alice, regress_bob;
+ grant regress_alice to regress_bob;
+ set session authorization regress_alice;
+@@ -1031,7 +1031,7 @@ create index on permtest_parent (left(c, 3));
+ insert into permtest_parent
+   select 1, 'a', left(fipshash(i::text), 5) from generate_series(0, 100) i;
+ analyze permtest_parent;
+-create role regress_no_child_access;
++create role regress_no_child_access PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ revoke all on permtest_grandchild from regress_no_child_access;
+ grant select on permtest_parent to regress_no_child_access;
+ set session authorization regress_no_child_access;
+diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql
+index 2b086eeb6d..913d8a0aed 100644
+--- a/src/test/regress/sql/insert.sql
++++ b/src/test/regress/sql/insert.sql
+@@ -513,7 +513,7 @@ drop table mlparted5;
+ create table key_desc (a int, b int) partition by list ((a+0));
+ create table key_desc_1 partition of key_desc for values in (1) partition by range (b);
+ 
+-create user regress_insert_other_user;
++create user regress_insert_other_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ grant select (a) on key_desc_1 to regress_insert_other_user;
+ grant insert on key_desc to regress_insert_other_user;
+ 
+@@ -597,7 +597,7 @@ insert into brtrigpartcon1 values (1, 'hi there');
+ -- check that the message shows the appropriate column description in a
+ -- situation where the partitioned table is not the primary ModifyTable node
+ create table inserttest3 (f1 text default 'foo', f2 text default 'bar', f3 int);
+-create role regress_coldesc_role;
++create role regress_coldesc_role PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ grant insert on inserttest3 to regress_coldesc_role;
+ grant insert on brtrigpartcon to regress_coldesc_role;
+ revoke select on brtrigpartcon from regress_coldesc_role;
+diff --git a/src/test/regress/sql/jsonb.sql b/src/test/regress/sql/jsonb.sql
+index 97bc2242a1..88c8b1dcdb 100644
+--- a/src/test/regress/sql/jsonb.sql
++++ b/src/test/regress/sql/jsonb.sql
+@@ -6,7 +6,8 @@ CREATE TABLE testjsonb (
+ );
+ 
+ \set filename :abs_srcdir '/data/jsonb.data'
+-COPY testjsonb FROM :'filename';
++\set command '\\copy testjsonb FROM ' :'filename';
++:command
+ 
+ -- Strings.
+ SELECT '""'::jsonb;				-- OK.
+diff --git a/src/test/regress/sql/largeobject.sql b/src/test/regress/sql/largeobject.sql
+index a4aee02e3a..8839c9496a 100644
+--- a/src/test/regress/sql/largeobject.sql
++++ b/src/test/regress/sql/largeobject.sql
+@@ -10,7 +10,7 @@
+ SET bytea_output TO escape;
+ 
+ -- Test ALTER LARGE OBJECT OWNER
+-CREATE ROLE regress_lo_user;
++CREATE ROLE regress_lo_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SELECT lo_create(42);
+ ALTER LARGE OBJECT 42 OWNER TO regress_lo_user;
+ 
+@@ -189,7 +189,8 @@ SELECT lo_unlink(loid) from lotest_stash_values;
+ TRUNCATE lotest_stash_values;
+ 
+ \set filename :abs_srcdir '/data/tenk.data'
+-INSERT INTO lotest_stash_values (loid) SELECT lo_import(:'filename');
++\lo_import :filename
++INSERT INTO lotest_stash_values (loid) VALUES (:LASTOID);
+ 
+ BEGIN;
+ UPDATE lotest_stash_values SET fd=lo_open(loid, CAST(x'20000' | x'40000' AS integer));
+@@ -219,8 +220,8 @@ SELECT lo_close(fd) FROM lotest_stash_values;
+ END;
+ 
+ \set filename :abs_builddir '/results/lotest.txt'
+-SELECT lo_export(loid, :'filename') FROM lotest_stash_values;
+-
++SELECT loid FROM lotest_stash_values \gset
++\lo_export :loid, :filename
+ \lo_import :filename
+ 
+ \set newloid :LASTOID
+diff --git a/src/test/regress/sql/lock.sql b/src/test/regress/sql/lock.sql
+index b88488c6d0..78b31e6dd3 100644
+--- a/src/test/regress/sql/lock.sql
++++ b/src/test/regress/sql/lock.sql
+@@ -19,7 +19,7 @@ CREATE VIEW lock_view3 AS SELECT * from lock_view2;
+ CREATE VIEW lock_view4 AS SELECT (select a from lock_tbl1a limit 1) from lock_tbl1;
+ CREATE VIEW lock_view5 AS SELECT * from lock_tbl1 where a in (select * from lock_tbl1a);
+ CREATE VIEW lock_view6 AS SELECT * from (select * from lock_tbl1) sub;
+-CREATE ROLE regress_rol_lock1;
++CREATE ROLE regress_rol_lock1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ ALTER ROLE regress_rol_lock1 SET search_path = lock_schema1;
+ GRANT USAGE ON SCHEMA lock_schema1 TO regress_rol_lock1;
+ 
+diff --git a/src/test/regress/sql/matview.sql b/src/test/regress/sql/matview.sql
+index b74ee305e0..33b8b690fc 100644
+--- a/src/test/regress/sql/matview.sql
++++ b/src/test/regress/sql/matview.sql
+@@ -209,7 +209,7 @@ SELECT * FROM mvtest_mv_v;
+ DROP TABLE mvtest_v CASCADE;
+ 
+ -- make sure running as superuser works when MV owned by another role (bug #11208)
+-CREATE ROLE regress_user_mvtest;
++CREATE ROLE regress_user_mvtest PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET ROLE regress_user_mvtest;
+ -- this test case also checks for ambiguity in the queries issued by
+ -- refresh_by_match_merge(), by choosing column names that intentionally
+@@ -266,7 +266,7 @@ ROLLBACK;
+ 
+ -- INSERT privileges if relation owner is not allowed to insert.
+ CREATE SCHEMA matview_schema;
+-CREATE USER regress_matview_user;
++CREATE USER regress_matview_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ ALTER DEFAULT PRIVILEGES FOR ROLE regress_matview_user
+   REVOKE INSERT ON TABLES FROM regress_matview_user;
+ GRANT ALL ON SCHEMA matview_schema TO public;
+diff --git a/src/test/regress/sql/merge.sql b/src/test/regress/sql/merge.sql
+index 5ddcca84f8..99f4cef9ef 100644
+--- a/src/test/regress/sql/merge.sql
++++ b/src/test/regress/sql/merge.sql
+@@ -2,9 +2,9 @@
+ -- MERGE
+ --
+ 
+-CREATE USER regress_merge_privs;
+-CREATE USER regress_merge_no_privs;
+-CREATE USER regress_merge_none;
++CREATE USER regress_merge_privs PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_merge_no_privs PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_merge_none PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ DROP TABLE IF EXISTS target;
+ DROP TABLE IF EXISTS source;
+diff --git a/src/test/regress/sql/misc.sql b/src/test/regress/sql/misc.sql
+index 165a2e175f..08d7096e2c 100644
+--- a/src/test/regress/sql/misc.sql
++++ b/src/test/regress/sql/misc.sql
+@@ -74,22 +74,26 @@ DROP TABLE tmp;
+ -- copy
+ --
+ \set filename :abs_builddir '/results/onek.data'
+-COPY onek TO :'filename';
++\set command '\\copy onek TO ' :'filename';
++:command
+ 
+ CREATE TEMP TABLE onek_copy (LIKE onek);
+ 
+-COPY onek_copy FROM :'filename';
++\set command '\\copy onek_copy FROM ' :'filename';
++:command
+ 
+ SELECT * FROM onek EXCEPT ALL SELECT * FROM onek_copy;
+ 
+ SELECT * FROM onek_copy EXCEPT ALL SELECT * FROM onek;
+ 
+ \set filename :abs_builddir '/results/stud_emp.data'
+-COPY BINARY stud_emp TO :'filename';
++\set command '\\COPY BINARY stud_emp TO ' :'filename';
++:command
+ 
+ CREATE TEMP TABLE stud_emp_copy (LIKE stud_emp);
+ 
+-COPY BINARY stud_emp_copy FROM :'filename';
++\set command '\\COPY BINARY stud_emp_copy FROM ' :'filename';
++:command
+ 
+ SELECT * FROM stud_emp_copy;
+ 
+diff --git a/src/test/regress/sql/misc_functions.sql b/src/test/regress/sql/misc_functions.sql
+index 76470fcb3f..09746de223 100644
+--- a/src/test/regress/sql/misc_functions.sql
++++ b/src/test/regress/sql/misc_functions.sql
+@@ -82,7 +82,7 @@ SELECT pg_log_backend_memory_contexts(pg_backend_pid());
+ SELECT pg_log_backend_memory_contexts(pid) FROM pg_stat_activity
+   WHERE backend_type = 'checkpointer';
+ 
+-CREATE ROLE regress_log_memory;
++CREATE ROLE regress_log_memory PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ SELECT has_function_privilege('regress_log_memory',
+   'pg_log_backend_memory_contexts(integer)', 'EXECUTE'); -- no
+@@ -169,7 +169,7 @@ select count(*) > 0 from
+ --
+ -- Test replication slot directory functions
+ --
+-CREATE ROLE regress_slot_dir_funcs;
++CREATE ROLE regress_slot_dir_funcs PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- Not available by default.
+ SELECT has_function_privilege('regress_slot_dir_funcs',
+   'pg_ls_logicalsnapdir()', 'EXECUTE');
+@@ -252,7 +252,7 @@ FROM pg_walfile_name_offset('0/0'::pg_lsn + :segment_size - 1),
+      pg_split_walfile_name(file_name);
+ 
+ -- pg_current_logfile
+-CREATE ROLE regress_current_logfile;
++CREATE ROLE regress_current_logfile PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- not available by default
+ SELECT has_function_privilege('regress_current_logfile',
+   'pg_current_logfile()', 'EXECUTE');
+diff --git a/src/test/regress/sql/multirangetypes.sql b/src/test/regress/sql/multirangetypes.sql
+index 41d5524285..373be031a2 100644
+--- a/src/test/regress/sql/multirangetypes.sql
++++ b/src/test/regress/sql/multirangetypes.sql
+@@ -704,7 +704,7 @@ drop type textrange2;
+ -- Multiranges don't have their own ownership or permissions.
+ --
+ create type textrange1 as range(subtype=text, multirange_type_name=multitextrange1, collation="C");
+-create role regress_multirange_owner;
++create role regress_multirange_owner password NEON_PASSWORD_PLACEHOLDER;
+ 
+ alter type multitextrange1 owner to regress_multirange_owner;  -- fail
+ alter type textrange1 owner to regress_multirange_owner;
+diff --git a/src/test/regress/sql/object_address.sql b/src/test/regress/sql/object_address.sql
+index 1a6c61f49d..1c31ac6a53 100644
+--- a/src/test/regress/sql/object_address.sql
++++ b/src/test/regress/sql/object_address.sql
+@@ -7,7 +7,7 @@ SET client_min_messages TO 'warning';
+ DROP ROLE IF EXISTS regress_addr_user;
+ RESET client_min_messages;
+ 
+-CREATE USER regress_addr_user;
++CREATE USER regress_addr_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- Test generic object addressing/identification functions
+ CREATE SCHEMA addr_nsp;
+diff --git a/src/test/regress/sql/password.sql b/src/test/regress/sql/password.sql
+index bb82aa4aa2..7424c91b10 100644
+--- a/src/test/regress/sql/password.sql
++++ b/src/test/regress/sql/password.sql
+@@ -10,13 +10,13 @@ SET password_encryption = 'scram-sha-256'; -- ok
+ 
+ -- consistency of password entries
+ SET password_encryption = 'md5';
+-CREATE ROLE regress_passwd1;
+-ALTER ROLE regress_passwd1 PASSWORD 'role_pwd1';
+-CREATE ROLE regress_passwd2;
+-ALTER ROLE regress_passwd2 PASSWORD 'role_pwd2';
++CREATE ROLE regress_passwd1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++ALTER ROLE regress_passwd1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET password_encryption = 'scram-sha-256';
+-CREATE ROLE regress_passwd3 PASSWORD 'role_pwd3';
+-CREATE ROLE regress_passwd4 PASSWORD NULL;
++CREATE ROLE regress_passwd3 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- check list of created entries
+ --
+@@ -44,14 +44,14 @@ ALTER ROLE regress_passwd2_new RENAME TO regress_passwd2;
+ SET password_encryption = 'md5';
+ 
+ -- encrypt with MD5
+-ALTER ROLE regress_passwd2 PASSWORD 'foo';
++ALTER ROLE regress_passwd2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- already encrypted, use as they are
+ ALTER ROLE regress_passwd1 PASSWORD 'md5cd3578025fe2c3d7ed1b9a9b26238b70';
+ ALTER ROLE regress_passwd3 PASSWORD 'SCRAM-SHA-256$4096:VLK4RMaQLCvNtQ==$6YtlR4t69SguDiwFvbVgVZtuz6gpJQQqUMZ7IQJK5yI=:ps75jrHeYU4lXCcXI4O8oIdJ3eO8o2jirjruw9phBTo=';
+ 
+ SET password_encryption = 'scram-sha-256';
+ -- create SCRAM secret
+-ALTER ROLE  regress_passwd4 PASSWORD 'foo';
++ALTER ROLE  regress_passwd4 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ -- already encrypted with MD5, use as it is
+ CREATE ROLE regress_passwd5 PASSWORD 'md5e73a4b11df52a6068f8b39f90be36023';
+ 
+diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql
+index 5880bc018d..27aa952b18 100644
+--- a/src/test/regress/sql/privileges.sql
++++ b/src/test/regress/sql/privileges.sql
+@@ -24,18 +24,18 @@ RESET client_min_messages;
+ 
+ -- test proper begins here
+ 
+-CREATE USER regress_priv_user1;
+-CREATE USER regress_priv_user2;
+-CREATE USER regress_priv_user3;
+-CREATE USER regress_priv_user4;
+-CREATE USER regress_priv_user5;
+-CREATE USER regress_priv_user5;	-- duplicate
+-CREATE USER regress_priv_user6;
+-CREATE USER regress_priv_user7;
+-CREATE USER regress_priv_user8;
+-CREATE USER regress_priv_user9;
+-CREATE USER regress_priv_user10;
+-CREATE ROLE regress_priv_role;
++CREATE USER regress_priv_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user4 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER;	-- duplicate
++CREATE USER regress_priv_user6 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user7 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user8 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user9 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user10 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_priv_role PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- circular ADMIN OPTION grants should be disallowed
+ GRANT regress_priv_user1 TO regress_priv_user2 WITH ADMIN OPTION;
+@@ -84,11 +84,11 @@ DROP ROLE regress_priv_user5; -- should fail, dependency
+ DROP ROLE regress_priv_user1, regress_priv_user5; -- ok, despite order
+ 
+ -- recreate the roles we just dropped
+-CREATE USER regress_priv_user1;
+-CREATE USER regress_priv_user2;
+-CREATE USER regress_priv_user3;
+-CREATE USER regress_priv_user4;
+-CREATE USER regress_priv_user5;
++CREATE USER regress_priv_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user4 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_priv_user5 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ GRANT pg_read_all_data TO regress_priv_user6;
+ GRANT pg_write_all_data TO regress_priv_user7;
+@@ -163,8 +163,8 @@ DROP USER regress_priv_user10;
+ DROP USER regress_priv_user9;
+ DROP USER regress_priv_user8;
+ 
+-CREATE GROUP regress_priv_group1;
+-CREATE GROUP regress_priv_group2 WITH ADMIN regress_priv_user1 USER regress_priv_user2;
++CREATE GROUP regress_priv_group1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE GROUP regress_priv_group2 WITH ADMIN regress_priv_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER USER regress_priv_user2;
+ 
+ ALTER GROUP regress_priv_group1 ADD USER regress_priv_user4;
+ 
+@@ -1157,7 +1157,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP
+ 
+ -- security-restricted operations
+ \c -
+-CREATE ROLE regress_sro_user;
++CREATE ROLE regress_sro_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- Check that index expressions and predicates are run as the table's owner
+ 
+@@ -1653,8 +1653,8 @@ DROP SCHEMA testns CASCADE;
+ -- Change owner of the schema & and rename of new schema owner
+ \c -
+ 
+-CREATE ROLE regress_schemauser1 superuser login;
+-CREATE ROLE regress_schemauser2 superuser login;
++CREATE ROLE regress_schemauser1 superuser login PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_schemauser2 superuser login PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ SET SESSION ROLE regress_schemauser1;
+ CREATE SCHEMA testns;
+@@ -1748,7 +1748,7 @@ DROP USER regress_priv_user8; -- does not exist
+ 
+ 
+ -- permissions with LOCK TABLE
+-CREATE USER regress_locktable_user;
++CREATE USER regress_locktable_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE lock_table (a int);
+ 
+ -- LOCK TABLE and SELECT permission
+@@ -1851,7 +1851,7 @@ DROP USER regress_locktable_user;
+ -- switch to superuser
+ \c -
+ 
+-CREATE ROLE regress_readallstats;
++CREATE ROLE regress_readallstats PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no
+ SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- no
+@@ -1871,10 +1871,10 @@ RESET ROLE;
+ DROP ROLE regress_readallstats;
+ 
+ -- test role grantor machinery
+-CREATE ROLE regress_group;
+-CREATE ROLE regress_group_direct_manager;
+-CREATE ROLE regress_group_indirect_manager;
+-CREATE ROLE regress_group_member;
++CREATE ROLE regress_group PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_group_direct_manager PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_group_indirect_manager PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_group_member PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ GRANT regress_group TO regress_group_direct_manager WITH INHERIT FALSE, ADMIN TRUE;
+ GRANT regress_group_direct_manager TO regress_group_indirect_manager;
+@@ -1896,9 +1896,9 @@ DROP ROLE regress_group_indirect_manager;
+ DROP ROLE regress_group_member;
+ 
+ -- test SET and INHERIT options with object ownership changes
+-CREATE ROLE regress_roleoption_protagonist;
+-CREATE ROLE regress_roleoption_donor;
+-CREATE ROLE regress_roleoption_recipient;
++CREATE ROLE regress_roleoption_protagonist PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_roleoption_donor PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_roleoption_recipient PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE SCHEMA regress_roleoption;
+ GRANT CREATE, USAGE ON SCHEMA regress_roleoption TO PUBLIC;
+ GRANT regress_roleoption_donor TO regress_roleoption_protagonist WITH INHERIT TRUE, SET FALSE;
+@@ -1926,9 +1926,9 @@ DROP ROLE regress_roleoption_donor;
+ DROP ROLE regress_roleoption_recipient;
+ 
+ -- MAINTAIN
+-CREATE ROLE regress_no_maintain;
+-CREATE ROLE regress_maintain;
+-CREATE ROLE regress_maintain_all IN ROLE pg_maintain;
++CREATE ROLE regress_no_maintain PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_maintain PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_maintain_all IN ROLE pg_maintain PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ CREATE TABLE maintain_test (a INT);
+ CREATE INDEX ON maintain_test (a);
+ GRANT MAINTAIN ON maintain_test TO regress_maintain;
+diff --git a/src/test/regress/sql/psql.sql b/src/test/regress/sql/psql.sql
+index 3b3c6f6e29..b09d6231f8 100644
+--- a/src/test/regress/sql/psql.sql
++++ b/src/test/regress/sql/psql.sql
+@@ -500,7 +500,7 @@ select 1 where false;
+ \pset expanded off
+ 
+ CREATE SCHEMA tableam_display;
+-CREATE ROLE regress_display_role;
++CREATE ROLE regress_display_role PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ ALTER SCHEMA tableam_display OWNER TO regress_display_role;
+ SET search_path TO tableam_display;
+ CREATE ACCESS METHOD heap_psql TYPE TABLE HANDLER heap_tableam_handler;
+@@ -1182,7 +1182,7 @@ reset debug_parallel_query;
+ \unset FETCH_COUNT
+ 
+ create schema testpart;
+-create role regress_partitioning_role;
++create role regress_partitioning_role PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ alter schema testpart owner to regress_partitioning_role;
+ 
+@@ -1293,7 +1293,7 @@ reset work_mem;
+ 
+ -- check \df+
+ -- we have to use functions with a predictable owner name, so make a role
+-create role regress_psql_user superuser;
++create role regress_psql_user superuser PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ begin;
+ set session authorization regress_psql_user;
+ 
+@@ -1439,11 +1439,14 @@ CREATE TEMPORARY TABLE reload_output(
+ );
+ 
+ SELECT 1 AS a \g :g_out_file
+-COPY reload_output(line) FROM :'g_out_file';
++\set command '\\COPY reload_output(line) FROM ' :'g_out_file';
++:command
+ SELECT 2 AS b\; SELECT 3 AS c\; SELECT 4 AS d \g :g_out_file
+-COPY reload_output(line) FROM :'g_out_file';
++\set command '\\COPY reload_output(line) FROM ' :'g_out_file';
++:command
+ COPY (SELECT 'foo') TO STDOUT \; COPY (SELECT 'bar') TO STDOUT \g :g_out_file
+-COPY reload_output(line) FROM :'g_out_file';
++\set command '\\COPY reload_output(line) FROM ' :'g_out_file';
++:command
+ 
+ SELECT line FROM reload_output ORDER BY lineno;
+ TRUNCATE TABLE reload_output;
+@@ -1460,17 +1463,20 @@ SELECT 1 AS a\; SELECT 2 AS b\; SELECT 3 AS c;
+ -- COPY TO file
+ -- The data goes to :g_out_file and the status to :o_out_file
+ \set QUIET false
+-COPY (SELECT unique1 FROM onek ORDER BY unique1 LIMIT 10) TO :'g_out_file';
++\set command '\\COPY (SELECT unique1 FROM onek ORDER BY unique1 LIMIT 10) TO ' :'g_out_file';
++:command
+ -- DML command status
+ UPDATE onek SET unique1 = unique1 WHERE false;
+ \set QUIET true
+ \o
+ 
+ -- Check the contents of the files generated.
+-COPY reload_output(line) FROM :'g_out_file';
++\set command '\\COPY reload_output(line) FROM ' :'g_out_file';
++:command
+ SELECT line FROM reload_output ORDER BY lineno;
+ TRUNCATE TABLE reload_output;
+-COPY reload_output(line) FROM :'o_out_file';
++\set command '\\COPY reload_output(line) FROM ' :'o_out_file';
++:command
+ SELECT line FROM reload_output ORDER BY lineno;
+ TRUNCATE TABLE reload_output;
+ 
+@@ -1483,10 +1489,12 @@ COPY (SELECT 'foo2') TO STDOUT \; COPY (SELECT 'bar2') TO STDOUT \g :g_out_file
+ \o
+ 
+ -- Check the contents of the files generated.
+-COPY reload_output(line) FROM :'g_out_file';
++\set command '\\COPY reload_output(line) FROM ' :'g_out_file';
++:command
+ SELECT line FROM reload_output ORDER BY lineno;
+ TRUNCATE TABLE reload_output;
+-COPY reload_output(line) FROM :'o_out_file';
++\set command '\\COPY reload_output(line) FROM ' :'o_out_file';
++:command
+ SELECT line FROM reload_output ORDER BY lineno;
+ 
+ DROP TABLE reload_output;
+@@ -1834,10 +1842,10 @@ DROP FUNCTION psql_error;
+ \dX "no.such.database"."no.such.schema"."no.such.extended.statistics"
+ 
+ -- check \drg and \du
+-CREATE ROLE regress_du_role0;
+-CREATE ROLE regress_du_role1;
+-CREATE ROLE regress_du_role2;
+-CREATE ROLE regress_du_admin;
++CREATE ROLE regress_du_role0 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_du_role1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_du_role2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_du_admin PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ GRANT regress_du_role0 TO regress_du_admin WITH ADMIN TRUE;
+ GRANT regress_du_role1 TO regress_du_admin WITH ADMIN TRUE;
+diff --git a/src/test/regress/sql/publication.sql b/src/test/regress/sql/publication.sql
+index 479d4f3264..6d348a93e7 100644
+--- a/src/test/regress/sql/publication.sql
++++ b/src/test/regress/sql/publication.sql
+@@ -1,9 +1,9 @@
+ --
+ -- PUBLICATION
+ --
+-CREATE ROLE regress_publication_user LOGIN SUPERUSER;
+-CREATE ROLE regress_publication_user2;
+-CREATE ROLE regress_publication_user_dummy LOGIN NOSUPERUSER;
++CREATE ROLE regress_publication_user LOGIN SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_publication_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_publication_user_dummy LOGIN NOSUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION 'regress_publication_user';
+ 
+ -- suppress warning that depends on wal_level
+@@ -810,7 +810,7 @@ DROP PUBLICATION testpub2;
+ DROP PUBLICATION testpub3;
+ 
+ SET ROLE regress_publication_user;
+-CREATE ROLE regress_publication_user3;
++CREATE ROLE regress_publication_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT regress_publication_user2 TO regress_publication_user3;
+ SET client_min_messages = 'ERROR';
+ CREATE PUBLICATION testpub4 FOR TABLES IN SCHEMA pub_test;
+diff --git a/src/test/regress/sql/regproc.sql b/src/test/regress/sql/regproc.sql
+index 232289ac39..d967ef0cd3 100644
+--- a/src/test/regress/sql/regproc.sql
++++ b/src/test/regress/sql/regproc.sql
+@@ -4,7 +4,7 @@
+ 
+ /* If objects exist, return oids */
+ 
+-CREATE ROLE regress_regrole_test;
++CREATE ROLE regress_regrole_test PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- without schemaname
+ 
+diff --git a/src/test/regress/sql/roleattributes.sql b/src/test/regress/sql/roleattributes.sql
+index c961b2d730..0859b89c4f 100644
+--- a/src/test/regress/sql/roleattributes.sql
++++ b/src/test/regress/sql/roleattributes.sql
+@@ -1,83 +1,83 @@
+ -- default for superuser is false
+-CREATE ROLE regress_test_def_superuser;
++CREATE ROLE regress_test_def_superuser PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_superuser';
+-CREATE ROLE regress_test_superuser WITH SUPERUSER;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser';
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_superuser';
++CREATE ROLE regress_test_superuser WITH SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser';
+ ALTER ROLE regress_test_superuser WITH NOSUPERUSER;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser';
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser';
+ ALTER ROLE regress_test_superuser WITH SUPERUSER;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser';
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_superuser';
+ 
+ -- default for inherit is true
+-CREATE ROLE regress_test_def_inherit;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_inherit';
+-CREATE ROLE regress_test_inherit WITH NOINHERIT;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit';
++CREATE ROLE regress_test_def_inherit PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_inherit';
++CREATE ROLE regress_test_inherit WITH NOINHERIT PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit';
+ ALTER ROLE regress_test_inherit WITH INHERIT;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit';
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit';
+ ALTER ROLE regress_test_inherit WITH NOINHERIT;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit';
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_inherit';
+ 
+ -- default for create role is false
+-CREATE ROLE regress_test_def_createrole;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createrole';
+-CREATE ROLE regress_test_createrole WITH CREATEROLE;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole';
++CREATE ROLE regress_test_def_createrole PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createrole';
++CREATE ROLE regress_test_createrole WITH CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole';
+ ALTER ROLE regress_test_createrole WITH NOCREATEROLE;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole';
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole';
+ ALTER ROLE regress_test_createrole WITH CREATEROLE;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole';
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createrole';
+ 
+ -- default for create database is false
+-CREATE ROLE regress_test_def_createdb;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createdb';
+-CREATE ROLE regress_test_createdb WITH CREATEDB;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb';
++CREATE ROLE regress_test_def_createdb PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_createdb';
++CREATE ROLE regress_test_createdb WITH CREATEDB PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb';
+ ALTER ROLE regress_test_createdb WITH NOCREATEDB;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb';
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb';
+ ALTER ROLE regress_test_createdb WITH CREATEDB;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb';
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_createdb';
+ 
+ -- default for can login is false for role
+-CREATE ROLE regress_test_def_role_canlogin;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_role_canlogin';
+-CREATE ROLE regress_test_role_canlogin WITH LOGIN;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin';
++CREATE ROLE regress_test_def_role_canlogin PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_role_canlogin';
++CREATE ROLE regress_test_role_canlogin WITH LOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin';
+ ALTER ROLE regress_test_role_canlogin WITH NOLOGIN;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin';
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin';
+ ALTER ROLE regress_test_role_canlogin WITH LOGIN;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin';
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_role_canlogin';
+ 
+ -- default for can login is true for user
+-CREATE USER regress_test_def_user_canlogin;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_user_canlogin';
+-CREATE USER regress_test_user_canlogin WITH NOLOGIN;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin';
++CREATE USER regress_test_def_user_canlogin PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_user_canlogin';
++CREATE USER regress_test_user_canlogin WITH NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin';
+ ALTER USER regress_test_user_canlogin WITH LOGIN;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin';
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin';
+ ALTER USER regress_test_user_canlogin WITH NOLOGIN;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin';
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_user_canlogin';
+ 
+ -- default for replication is false
+-CREATE ROLE regress_test_def_replication;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_replication';
+-CREATE ROLE regress_test_replication WITH REPLICATION;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication';
++CREATE ROLE regress_test_def_replication PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_replication';
++CREATE ROLE regress_test_replication WITH REPLICATION PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication';
+ ALTER ROLE regress_test_replication WITH NOREPLICATION;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication';
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication';
+ ALTER ROLE regress_test_replication WITH REPLICATION;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication';
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_replication';
+ 
+ -- default for bypassrls is false
+-CREATE ROLE regress_test_def_bypassrls;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_bypassrls';
+-CREATE ROLE regress_test_bypassrls WITH BYPASSRLS;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls';
++CREATE ROLE regress_test_def_bypassrls PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_def_bypassrls';
++CREATE ROLE regress_test_bypassrls WITH BYPASSRLS PASSWORD NEON_PASSWORD_PLACEHOLDER;
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls';
+ ALTER ROLE regress_test_bypassrls WITH NOBYPASSRLS;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls';
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls';
+ ALTER ROLE regress_test_bypassrls WITH BYPASSRLS;
+-SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, rolpassword, rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls';
++SELECT rolname, rolsuper, rolinherit, rolcreaterole, rolcreatedb, rolcanlogin, rolreplication, rolbypassrls, rolconnlimit, regexp_replace(rolpassword, '(SCRAM-SHA-256)\$(\d+):([a-zA-Z0-9+/=]+)\$([a-zA-Z0-9+=/]+):([a-zA-Z0-9+/=]+)', '\1$\2:<salt>$<storedkey>:<serverkey>'), rolvaliduntil FROM pg_authid WHERE rolname = 'regress_test_bypassrls';
+ 
+ -- clean up roles
+ DROP ROLE regress_test_def_superuser;
+diff --git a/src/test/regress/sql/rowsecurity.sql b/src/test/regress/sql/rowsecurity.sql
+index eab7d99003..0cf1139e01 100644
+--- a/src/test/regress/sql/rowsecurity.sql
++++ b/src/test/regress/sql/rowsecurity.sql
+@@ -20,13 +20,13 @@ DROP SCHEMA IF EXISTS regress_rls_schema CASCADE;
+ RESET client_min_messages;
+ 
+ -- initial setup
+-CREATE USER regress_rls_alice NOLOGIN;
+-CREATE USER regress_rls_bob NOLOGIN;
+-CREATE USER regress_rls_carol NOLOGIN;
+-CREATE USER regress_rls_dave NOLOGIN;
+-CREATE USER regress_rls_exempt_user BYPASSRLS NOLOGIN;
+-CREATE ROLE regress_rls_group1 NOLOGIN;
+-CREATE ROLE regress_rls_group2 NOLOGIN;
++CREATE USER regress_rls_alice NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_rls_bob NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_rls_carol NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_rls_dave NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_rls_exempt_user BYPASSRLS NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_rls_group1 NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_rls_group2 NOLOGIN PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ GRANT regress_rls_group1 TO regress_rls_bob;
+ GRANT regress_rls_group2 TO regress_rls_carol;
+@@ -2105,8 +2105,8 @@ SELECT count(*) = 0 FROM pg_depend
+ -- DROP OWNED BY testing
+ RESET SESSION AUTHORIZATION;
+ 
+-CREATE ROLE regress_rls_dob_role1;
+-CREATE ROLE regress_rls_dob_role2;
++CREATE ROLE regress_rls_dob_role1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_rls_dob_role2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ CREATE TABLE dob_t1 (c1 int);
+ CREATE TABLE dob_t2 (c1 int) PARTITION BY RANGE (c1);
+diff --git a/src/test/regress/sql/rules.sql b/src/test/regress/sql/rules.sql
+index 4a5fa50585..a9e9eab77d 100644
+--- a/src/test/regress/sql/rules.sql
++++ b/src/test/regress/sql/rules.sql
+@@ -1390,7 +1390,7 @@ DROP TABLE ruletest2;
+ -- Test non-SELECT rule on security invoker view.
+ -- Should use view owner's permissions.
+ --
+-CREATE USER regress_rule_user1;
++CREATE USER regress_rule_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ CREATE TABLE ruletest_t1 (x int);
+ CREATE TABLE ruletest_t2 (x int);
+diff --git a/src/test/regress/sql/security_label.sql b/src/test/regress/sql/security_label.sql
+index 98e6a5f211..68c868fef2 100644
+--- a/src/test/regress/sql/security_label.sql
++++ b/src/test/regress/sql/security_label.sql
+@@ -10,8 +10,8 @@ DROP ROLE IF EXISTS regress_seclabel_user2;
+ 
+ RESET client_min_messages;
+ 
+-CREATE USER regress_seclabel_user1 WITH CREATEROLE;
+-CREATE USER regress_seclabel_user2;
++CREATE USER regress_seclabel_user1 WITH CREATEROLE PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_seclabel_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ CREATE TABLE seclabel_tbl1 (a int, b text);
+ CREATE TABLE seclabel_tbl2 (x int, y text);
+diff --git a/src/test/regress/sql/select_into.sql b/src/test/regress/sql/select_into.sql
+index 689c448cc2..223ceb1d75 100644
+--- a/src/test/regress/sql/select_into.sql
++++ b/src/test/regress/sql/select_into.sql
+@@ -20,7 +20,7 @@ DROP TABLE sitmp1;
+ -- SELECT INTO and INSERT permission, if owner is not allowed to insert.
+ --
+ CREATE SCHEMA selinto_schema;
+-CREATE USER regress_selinto_user;
++CREATE USER regress_selinto_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ ALTER DEFAULT PRIVILEGES FOR ROLE regress_selinto_user
+ 	  REVOKE INSERT ON TABLES FROM regress_selinto_user;
+ GRANT ALL ON SCHEMA selinto_schema TO public;
+diff --git a/src/test/regress/sql/select_parallel.sql b/src/test/regress/sql/select_parallel.sql
+index 3e4bfcb71f..99757eff3c 100644
+--- a/src/test/regress/sql/select_parallel.sql
++++ b/src/test/regress/sql/select_parallel.sql
+@@ -498,7 +498,7 @@ SELECT 1 FROM tenk1_vw_sec
+ rollback;
+ 
+ -- test that function option SET ROLE works in parallel workers.
+-create role regress_parallel_worker;
++create role regress_parallel_worker PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ create function set_and_report_role() returns text as
+   $$ select current_setting('role') $$ language sql parallel safe
+diff --git a/src/test/regress/sql/select_views.sql b/src/test/regress/sql/select_views.sql
+index e742f13699..7bd0255df8 100644
+--- a/src/test/regress/sql/select_views.sql
++++ b/src/test/regress/sql/select_views.sql
+@@ -12,7 +12,7 @@ SELECT * FROM toyemp WHERE name = 'sharon';
+ --
+ -- Test for Leaky view scenario
+ --
+-CREATE ROLE regress_alice;
++CREATE ROLE regress_alice PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ CREATE FUNCTION f_leak (text)
+        RETURNS bool LANGUAGE 'plpgsql' COST 0.0000001
+diff --git a/src/test/regress/sql/sequence.sql b/src/test/regress/sql/sequence.sql
+index 793f1415f6..ec07c1f193 100644
+--- a/src/test/regress/sql/sequence.sql
++++ b/src/test/regress/sql/sequence.sql
+@@ -293,7 +293,7 @@ ROLLBACK;
+ 
+ -- privileges tests
+ 
+-CREATE USER regress_seq_user;
++CREATE USER regress_seq_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ -- nextval
+ BEGIN;
+diff --git a/src/test/regress/sql/stats.sql b/src/test/regress/sql/stats.sql
+index d8ac0d06f4..c9cfcea208 100644
+--- a/src/test/regress/sql/stats.sql
++++ b/src/test/regress/sql/stats.sql
+@@ -631,23 +631,6 @@ SELECT :io_sum_shared_after_writes > :io_sum_shared_before_writes;
+ SELECT current_setting('fsync') = 'off'
+   OR :io_sum_shared_after_fsyncs > :io_sum_shared_before_fsyncs;
+ 
+--- Change the tablespace so that the table is rewritten directly, then SELECT
+--- from it to cause it to be read back into shared buffers.
+-SELECT sum(reads) AS io_sum_shared_before_reads
+-  FROM pg_stat_io WHERE context = 'normal' AND object = 'relation' \gset
+--- Do this in a transaction to prevent spurious failures due to concurrent accesses to our newly
+--- rewritten table, e.g. by autovacuum.
+-BEGIN;
+-ALTER TABLE test_io_shared SET TABLESPACE regress_tblspace;
+--- SELECT from the table so that the data is read into shared buffers and
+--- context 'normal', object 'relation' reads are counted.
+-SELECT COUNT(*) FROM test_io_shared;
+-COMMIT;
+-SELECT pg_stat_force_next_flush();
+-SELECT sum(reads) AS io_sum_shared_after_reads
+-  FROM pg_stat_io WHERE context = 'normal' AND object = 'relation'  \gset
+-SELECT :io_sum_shared_after_reads > :io_sum_shared_before_reads;
+-
+ SELECT sum(hits) AS io_sum_shared_before_hits
+   FROM pg_stat_io WHERE context = 'normal' AND object = 'relation' \gset
+ -- Select from the table again to count hits.
+diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql
+index 0c08a6cc42..7a5b1036d8 100644
+--- a/src/test/regress/sql/stats_ext.sql
++++ b/src/test/regress/sql/stats_ext.sql
+@@ -50,7 +50,7 @@ DROP TABLE ext_stats_test;
+ CREATE TABLE ab1 (a INTEGER, b INTEGER, c INTEGER);
+ CREATE STATISTICS IF NOT EXISTS ab1_a_b_stats ON a, b FROM ab1;
+ COMMENT ON STATISTICS ab1_a_b_stats IS 'new comment';
+-CREATE ROLE regress_stats_ext;
++CREATE ROLE regress_stats_ext PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION regress_stats_ext;
+ COMMENT ON STATISTICS ab1_a_b_stats IS 'changed comment';
+ DROP STATISTICS ab1_a_b_stats;
+@@ -1607,7 +1607,7 @@ drop statistics stts_t1_expr_expr_stat;
+ set search_path to public, stts_s1;
+ \dX
+ 
+-create role regress_stats_ext nosuperuser;
++create role regress_stats_ext nosuperuser PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ set role regress_stats_ext;
+ \dX
+ reset role;
+@@ -1618,7 +1618,7 @@ drop user regress_stats_ext;
+ reset search_path;
+ 
+ -- User with no access
+-CREATE USER regress_stats_user1;
++CREATE USER regress_stats_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT USAGE ON SCHEMA tststats TO regress_stats_user1;
+ SET SESSION AUTHORIZATION regress_stats_user1;
+ SELECT * FROM tststats.priv_test_tbl; -- Permission denied
+diff --git a/src/test/regress/sql/subscription.sql b/src/test/regress/sql/subscription.sql
+index 3e5ba4cb8c..a35f030908 100644
+--- a/src/test/regress/sql/subscription.sql
++++ b/src/test/regress/sql/subscription.sql
+@@ -2,10 +2,10 @@
+ -- SUBSCRIPTION
+ --
+ 
+-CREATE ROLE regress_subscription_user LOGIN SUPERUSER;
+-CREATE ROLE regress_subscription_user2;
+-CREATE ROLE regress_subscription_user3 IN ROLE pg_create_subscription;
+-CREATE ROLE regress_subscription_user_dummy LOGIN NOSUPERUSER;
++CREATE ROLE regress_subscription_user LOGIN SUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_subscription_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE ROLE regress_subscription_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER IN ROLE pg_create_subscription;
++CREATE ROLE regress_subscription_user_dummy LOGIN NOSUPERUSER PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET SESSION AUTHORIZATION 'regress_subscription_user';
+ 
+ -- fail - no publications
+diff --git a/src/test/regress/sql/test_setup.sql b/src/test/regress/sql/test_setup.sql
+index 06b0e2121f..01444f9426 100644
+--- a/src/test/regress/sql/test_setup.sql
++++ b/src/test/regress/sql/test_setup.sql
+@@ -135,7 +135,8 @@ CREATE TABLE onek (
+ );
+ 
+ \set filename :abs_srcdir '/data/onek.data'
+-COPY onek FROM :'filename';
++\set command '\\copy onek FROM ' :'filename';
++:command
+ VACUUM ANALYZE onek;
+ 
+ CREATE TABLE onek2 AS SELECT * FROM onek;
+@@ -161,7 +162,8 @@ CREATE TABLE tenk1 (
+ );
+ 
+ \set filename :abs_srcdir '/data/tenk.data'
+-COPY tenk1 FROM :'filename';
++\set command '\\copy tenk1 FROM ' :'filename';
++:command
+ VACUUM ANALYZE tenk1;
+ 
+ CREATE TABLE tenk2 AS SELECT * FROM tenk1;
+@@ -174,7 +176,8 @@ CREATE TABLE person (
+ );
+ 
+ \set filename :abs_srcdir '/data/person.data'
+-COPY person FROM :'filename';
++\set command '\\copy person FROM ' :'filename';
++:command
+ VACUUM ANALYZE person;
+ 
+ CREATE TABLE emp (
+@@ -183,7 +186,8 @@ CREATE TABLE emp (
+ ) INHERITS (person);
+ 
+ \set filename :abs_srcdir '/data/emp.data'
+-COPY emp FROM :'filename';
++\set command '\\copy emp FROM ' :'filename';
++:command
+ VACUUM ANALYZE emp;
+ 
+ CREATE TABLE student (
+@@ -191,7 +195,8 @@ CREATE TABLE student (
+ ) INHERITS (person);
+ 
+ \set filename :abs_srcdir '/data/student.data'
+-COPY student FROM :'filename';
++\set command '\\copy student FROM ' :'filename';
++:command
+ VACUUM ANALYZE student;
+ 
+ CREATE TABLE stud_emp (
+@@ -199,7 +204,8 @@ CREATE TABLE stud_emp (
+ ) INHERITS (emp, student);
+ 
+ \set filename :abs_srcdir '/data/stud_emp.data'
+-COPY stud_emp FROM :'filename';
++\set command '\\copy stud_emp FROM ' :'filename';
++:command
+ VACUUM ANALYZE stud_emp;
+ 
+ CREATE TABLE road (
+@@ -208,7 +214,8 @@ CREATE TABLE road (
+ );
+ 
+ \set filename :abs_srcdir '/data/streets.data'
+-COPY road FROM :'filename';
++\set command '\\copy road FROM ' :'filename';
++:command
+ VACUUM ANALYZE road;
+ 
+ CREATE TABLE ihighway () INHERITS (road);
+diff --git a/src/test/regress/sql/tsearch.sql b/src/test/regress/sql/tsearch.sql
+index fbd26cdba4..7ec2d78eee 100644
+--- a/src/test/regress/sql/tsearch.sql
++++ b/src/test/regress/sql/tsearch.sql
+@@ -49,7 +49,8 @@ CREATE TABLE test_tsvector(
+ );
+ 
+ \set filename :abs_srcdir '/data/tsearch.data'
+-COPY test_tsvector FROM :'filename';
++\set command '\\copy test_tsvector FROM ' :'filename';
++:command
+ 
+ ANALYZE test_tsvector;
+ 
+diff --git a/src/test/regress/sql/updatable_views.sql b/src/test/regress/sql/updatable_views.sql
+index 93b693ae83..2983475265 100644
+--- a/src/test/regress/sql/updatable_views.sql
++++ b/src/test/regress/sql/updatable_views.sql
+@@ -569,9 +569,9 @@ DROP TABLE base_tbl CASCADE;
+ 
+ -- permissions checks
+ 
+-CREATE USER regress_view_user1;
+-CREATE USER regress_view_user2;
+-CREATE USER regress_view_user3;
++CREATE USER regress_view_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_view_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++CREATE USER regress_view_user3 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ SET SESSION AUTHORIZATION regress_view_user1;
+ CREATE TABLE base_tbl(a int, b text, c float);
+@@ -1909,8 +1909,8 @@ drop view uv_iocu_view;
+ drop table uv_iocu_tab;
+ 
+ -- ON CONFLICT DO UPDATE permissions checks
+-create user regress_view_user1;
+-create user regress_view_user2;
++create user regress_view_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER;
++create user regress_view_user2 PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ 
+ set session authorization regress_view_user1;
+ create table base_tbl(a int unique, b text, c float);
+diff --git a/src/test/regress/sql/update.sql b/src/test/regress/sql/update.sql
+index 8b4707eb9c..b9041f8134 100644
+--- a/src/test/regress/sql/update.sql
++++ b/src/test/regress/sql/update.sql
+@@ -342,7 +342,7 @@ DROP FUNCTION func_parted_mod_b();
+ -----------------------------------------
+ 
+ ALTER TABLE range_parted ENABLE ROW LEVEL SECURITY;
+-CREATE USER regress_range_parted_user;
++CREATE USER regress_range_parted_user PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ GRANT ALL ON range_parted, mintab TO regress_range_parted_user;
+ CREATE POLICY seeall ON range_parted AS PERMISSIVE FOR SELECT USING (true);
+ CREATE POLICY policy_range_parted ON range_parted for UPDATE USING (true) WITH CHECK (c % 2 = 0);
+diff --git a/src/test/regress/sql/vacuum.sql b/src/test/regress/sql/vacuum.sql
+index 548cd7acca..5b15d4dab0 100644
+--- a/src/test/regress/sql/vacuum.sql
++++ b/src/test/regress/sql/vacuum.sql
+@@ -335,7 +335,7 @@ CREATE TABLE vacowned (a int);
+ CREATE TABLE vacowned_parted (a int) PARTITION BY LIST (a);
+ CREATE TABLE vacowned_part1 PARTITION OF vacowned_parted FOR VALUES IN (1);
+ CREATE TABLE vacowned_part2 PARTITION OF vacowned_parted FOR VALUES IN (2);
+-CREATE ROLE regress_vacuum;
++CREATE ROLE regress_vacuum PASSWORD NEON_PASSWORD_PLACEHOLDER;
+ SET ROLE regress_vacuum;
+ -- Simple table
+ VACUUM vacowned;
diff --git a/test_runner/cloud_regress/README.md b/test_runner/cloud_regress/README.md
new file mode 100644
index 0000000000..9c460e2764
--- /dev/null
+++ b/test_runner/cloud_regress/README.md
@@ -0,0 +1,21 @@
+# How to run the `pg_regress` tests on a cloud Neon instance.
+
+* Create a Neon project on staging.
+* Grant the superuser privileges to the DB user.
+* (Optional) create a branch for testing
+* Configure the endpoint by updating the control-plane database with the following settings:
+  * `Timeone`: `America/Los_Angeles`
+  * `DateStyle`: `Postgres,MDY`
+  * `compute_query_id`: `off`
+* Checkout the actual `Neon` sources
+* Patch the sql and expected files for the specific PostgreSQL version, e.g. for v17:
+```bash
+$ cd vendor/postgres-v17
+$ patch -p1 <../../compute/patches/cloud_regress_pg17.patch
+```
+* Set the environment variable `BENCHMARK_CONNSTR` to the connection URI of your project.
+* Set the environment variable `PG_VERSION` to the version of your project.
+* Run 
+```bash
+$ pytest -m remote_cluster -k cloud_regress
+```
\ No newline at end of file
diff --git a/test_runner/cloud_regress/test_cloud_regress.py b/test_runner/cloud_regress/test_cloud_regress.py
index 715d4a4881..63427c1912 100644
--- a/test_runner/cloud_regress/test_cloud_regress.py
+++ b/test_runner/cloud_regress/test_cloud_regress.py
@@ -5,68 +5,15 @@ Run the regression tests on the cloud instance of Neon
 from __future__ import annotations
 
 from pathlib import Path
-from typing import Any
 
-import psycopg2
 import pytest
-from fixtures.log_helper import log
 from fixtures.neon_fixtures import RemotePostgres
 from fixtures.pg_version import PgVersion
 
 
-@pytest.fixture
-def setup(remote_pg: RemotePostgres):
-    """
-    Setup and teardown of the tests
-    """
-    with psycopg2.connect(remote_pg.connstr()) as conn:
-        with conn.cursor() as cur:
-            log.info("Creating the extension")
-            cur.execute("CREATE EXTENSION IF NOT EXISTS regress_so")
-            conn.commit()
-            # TODO: Migrate to branches and remove this code
-            log.info("Looking for subscriptions in the regress database")
-            cur.execute(
-                "SELECT subname FROM pg_catalog.pg_subscription WHERE "
-                "subdbid = (SELECT oid FROM pg_catalog.pg_database WHERE datname='regression');"
-            )
-            if cur.rowcount > 0:
-                with psycopg2.connect(
-                    dbname="regression",
-                    host=remote_pg.default_options["host"],
-                    user=remote_pg.default_options["user"],
-                    password=remote_pg.default_options["password"],
-                ) as regress_conn:
-                    with regress_conn.cursor() as regress_cur:
-                        for sub in cur:
-                            regress_cur.execute(f"ALTER SUBSCRIPTION {sub[0]} DISABLE")
-                            regress_cur.execute(
-                                f"ALTER SUBSCRIPTION {sub[0]} SET (slot_name = NONE)"
-                            )
-                            regress_cur.execute(f"DROP SUBSCRIPTION {sub[0]}")
-                        regress_conn.commit()
-
-    yield
-    # TODO: Migrate to branches and remove this code
-    log.info("Looking for extra roles...")
-    with psycopg2.connect(remote_pg.connstr()) as conn:
-        with conn.cursor() as cur:
-            cur.execute(
-                "SELECT rolname FROM pg_catalog.pg_roles WHERE oid > 16384 AND rolname <> 'neondb_owner'"
-            )
-            roles: list[Any] = []
-            for role in cur:
-                log.info("Role found: %s", role[0])
-                roles.append(role[0])
-            for role in roles:
-                cur.execute(f"DROP ROLE {role}")
-            conn.commit()
-
-
 @pytest.mark.timeout(7200)
 @pytest.mark.remote_cluster
 def test_cloud_regress(
-    setup,
     remote_pg: RemotePostgres,
     pg_version: PgVersion,
     pg_distrib_dir: Path,

From 4c4cb80186de6d319716897117eaf5c434d5180f Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Mon, 9 Dec 2024 15:06:06 -0500
Subject: [PATCH 090/117] fix(pageserver): fix gc-compaction racing with legacy
 gc (#10052)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Problem

close https://github.com/neondatabase/neon/issues/10049, close
https://github.com/neondatabase/neon/issues/10030, close
https://github.com/neondatabase/neon/issues/8861

part of https://github.com/neondatabase/neon/issues/9114

The legacy gc process calls `get_latest_gc_cutoff`, which uses a Rcu
different than the gc_info struct. In the gc_compaction_smoke test case,
the "latest" cutoff could be lower than the gc_info struct, causing
gc-compaction to collect data that could be accessed by
`latest_gc_cutoff`. Technically speaking, there's nothing wrong with
gc-compaction using gc_info without considering latest_gc_cutoff,
because gc_info is the source of truth. But anyways, let's fix it.

## Summary of changes

* gc-compaction uses `latest_gc_cutoff` instead of gc_info to determine
the gc horizon.
* if a gc-compaction is scheduled via tenant compaction iteration, it
will take the gc_block lock to avoid racing with functionalities like
detach ancestor (if it's triggered via manual compaction API without
scheduling, then it won't take the lock)

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
---
 pageserver/src/http/routes.rs                |  2 +-
 pageserver/src/tenant.rs                     | 96 ++++++++++++++++++--
 pageserver/src/tenant/gc_block.rs            | 12 +--
 pageserver/src/tenant/timeline/compaction.rs | 16 +++-
 test_runner/regress/test_compaction.py       | 33 +++----
 5 files changed, 121 insertions(+), 38 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 0f11bbc507..75d25d0a6a 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -2061,7 +2061,7 @@ async fn timeline_compact_handler(
             let tenant = state
                 .tenant_manager
                 .get_attached_tenant_shard(tenant_shard_id)?;
-            let rx = tenant.schedule_compaction(timeline_id, options).await;
+            let rx = tenant.schedule_compaction(timeline_id, options).await.map_err(ApiError::InternalServerError)?;
             if wait_until_scheduled_compaction_done {
                 // It is possible that this will take a long time, dropping the HTTP request will not cancel the compaction.
                 rx.await.ok();
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 4a9c44aefd..e71a56ed40 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3028,14 +3028,23 @@ impl Tenant {
                                 let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
                                 let tline_pending_tasks = guard.entry(*timeline_id).or_default();
                                 for (idx, job) in jobs.into_iter().enumerate() {
-                                    tline_pending_tasks.push_back(ScheduledCompactionTask {
-                                        options: job,
-                                        result_tx: if idx == jobs_len - 1 {
-                                            // The last compaction job sends the completion signal
-                                            next_scheduled_compaction_task.result_tx.take()
-                                        } else {
-                                            None
-                                        },
+                                    tline_pending_tasks.push_back(if idx == jobs_len - 1 {
+                                        ScheduledCompactionTask {
+                                            options: job,
+                                            // The last job in the queue sends the signal and releases the gc guard
+                                            result_tx: next_scheduled_compaction_task
+                                                .result_tx
+                                                .take(),
+                                            gc_block: next_scheduled_compaction_task
+                                                .gc_block
+                                                .take(),
+                                        }
+                                    } else {
+                                        ScheduledCompactionTask {
+                                            options: job,
+                                            result_tx: None,
+                                            gc_block: None,
+                                        }
                                     });
                                 }
                                 info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len);
@@ -3095,15 +3104,22 @@ impl Tenant {
         &self,
         timeline_id: TimelineId,
         options: CompactOptions,
-    ) -> tokio::sync::oneshot::Receiver<()> {
+    ) -> anyhow::Result<tokio::sync::oneshot::Receiver<()>> {
+        let gc_guard = match self.gc_block.start().await {
+            Ok(guard) => guard,
+            Err(e) => {
+                bail!("cannot run gc-compaction because gc is blocked: {}", e);
+            }
+        };
         let (tx, rx) = tokio::sync::oneshot::channel();
         let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
         let tline_pending_tasks = guard.entry(timeline_id).or_default();
         tline_pending_tasks.push_back(ScheduledCompactionTask {
             options,
             result_tx: Some(tx),
+            gc_block: Some(gc_guard),
         });
-        rx
+        Ok(rx)
     }
 
     // Call through to all timelines to freeze ephemeral layers if needed.  Usually
@@ -8150,6 +8166,12 @@ mod tests {
             )
             .await?;
         {
+            tline
+                .latest_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x30))
+                .wait()
+                .await;
             // Update GC info
             let mut guard = tline.gc_info.write().unwrap();
             guard.cutoffs.time = Lsn(0x30);
@@ -8252,6 +8274,12 @@ mod tests {
 
         // increase GC horizon and compact again
         {
+            tline
+                .latest_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x40))
+                .wait()
+                .await;
             // Update GC info
             let mut guard = tline.gc_info.write().unwrap();
             guard.cutoffs.time = Lsn(0x40);
@@ -8632,6 +8660,12 @@ mod tests {
                 .await?
         };
         {
+            tline
+                .latest_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x30))
+                .wait()
+                .await;
             // Update GC info
             let mut guard = tline.gc_info.write().unwrap();
             *guard = GcInfo {
@@ -8713,6 +8747,12 @@ mod tests {
 
         // increase GC horizon and compact again
         {
+            tline
+                .latest_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x40))
+                .wait()
+                .await;
             // Update GC info
             let mut guard = tline.gc_info.write().unwrap();
             guard.cutoffs.time = Lsn(0x40);
@@ -9160,6 +9200,12 @@ mod tests {
             )
             .await?;
         {
+            tline
+                .latest_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x30))
+                .wait()
+                .await;
             // Update GC info
             let mut guard = tline.gc_info.write().unwrap();
             *guard = GcInfo {
@@ -9302,6 +9348,12 @@ mod tests {
 
         // increase GC horizon and compact again
         {
+            tline
+                .latest_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x38))
+                .wait()
+                .await;
             // Update GC info
             let mut guard = tline.gc_info.write().unwrap();
             guard.cutoffs.time = Lsn(0x38);
@@ -9397,6 +9449,12 @@ mod tests {
             )
             .await?;
         {
+            tline
+                .latest_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x30))
+                .wait()
+                .await;
             // Update GC info
             let mut guard = tline.gc_info.write().unwrap();
             *guard = GcInfo {
@@ -9641,6 +9699,12 @@ mod tests {
         branch_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10)));
 
         {
+            parent_tline
+                .latest_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x10))
+                .wait()
+                .await;
             // Update GC info
             let mut guard = parent_tline.gc_info.write().unwrap();
             *guard = GcInfo {
@@ -9655,6 +9719,12 @@ mod tests {
         }
 
         {
+            branch_tline
+                .latest_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x50))
+                .wait()
+                .await;
             // Update GC info
             let mut guard = branch_tline.gc_info.write().unwrap();
             *guard = GcInfo {
@@ -9984,6 +10054,12 @@ mod tests {
             .await?;
 
         {
+            tline
+                .latest_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x30))
+                .wait()
+                .await;
             // Update GC info
             let mut guard = tline.gc_info.write().unwrap();
             *guard = GcInfo {
diff --git a/pageserver/src/tenant/gc_block.rs b/pageserver/src/tenant/gc_block.rs
index 373779ddb8..af73acb2be 100644
--- a/pageserver/src/tenant/gc_block.rs
+++ b/pageserver/src/tenant/gc_block.rs
@@ -1,4 +1,4 @@
-use std::collections::HashMap;
+use std::{collections::HashMap, sync::Arc};
 
 use utils::id::TimelineId;
 
@@ -20,7 +20,7 @@ pub(crate) struct GcBlock {
     /// Do not add any more features taking and forbidding taking this lock. It should be
     /// `tokio::sync::Notify`, but that is rarely used. On the other side, [`GcBlock::insert`]
     /// synchronizes with gc attempts by locking and unlocking this mutex.
-    blocking: tokio::sync::Mutex<()>,
+    blocking: Arc<tokio::sync::Mutex<()>>,
 }
 
 impl GcBlock {
@@ -30,7 +30,7 @@ impl GcBlock {
     /// it's ending, or if not currently possible, a value describing the reasons why not.
     ///
     /// Cancellation safe.
-    pub(super) async fn start(&self) -> Result<Guard<'_>, BlockingReasons> {
+    pub(super) async fn start(&self) -> Result<Guard, BlockingReasons> {
         let reasons = {
             let g = self.reasons.lock().unwrap();
 
@@ -44,7 +44,7 @@ impl GcBlock {
             Err(reasons)
         } else {
             Ok(Guard {
-                _inner: self.blocking.lock().await,
+                _inner: self.blocking.clone().lock_owned().await,
             })
         }
     }
@@ -170,8 +170,8 @@ impl GcBlock {
     }
 }
 
-pub(super) struct Guard<'a> {
-    _inner: tokio::sync::MutexGuard<'a, ()>,
+pub(crate) struct Guard {
+    _inner: tokio::sync::OwnedMutexGuard<()>,
 }
 
 #[derive(Debug)]
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index a18e157d37..fa924d23b0 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -41,7 +41,7 @@ use crate::tenant::storage_layer::{
 use crate::tenant::timeline::ImageLayerCreationOutcome;
 use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter};
 use crate::tenant::timeline::{Layer, ResidentLayer};
-use crate::tenant::{DeltaLayer, MaybeOffloaded};
+use crate::tenant::{gc_block, DeltaLayer, MaybeOffloaded};
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};
 use pageserver_api::config::tenant_conf_defaults::{
     DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD,
@@ -63,9 +63,12 @@ use super::CompactionError;
 const COMPACTION_DELTA_THRESHOLD: usize = 5;
 
 /// A scheduled compaction task.
-pub struct ScheduledCompactionTask {
+pub(crate) struct ScheduledCompactionTask {
     pub options: CompactOptions,
+    /// The channel to send the compaction result. If this is a subcompaction, the last compaction job holds the sender.
     pub result_tx: Option<tokio::sync::oneshot::Sender<()>>,
+    /// Hold the GC block. If this is a subcompaction, the last compaction job holds the gc block guard.
+    pub gc_block: Option<gc_block::Guard>,
 }
 
 pub struct GcCompactionJobDescription {
@@ -1768,8 +1771,7 @@ impl Timeline {
         let compact_below_lsn = if let Some(compact_below_lsn) = options.compact_below_lsn {
             compact_below_lsn
         } else {
-            let gc_info = self.gc_info.read().unwrap();
-            gc_info.cutoffs.select_min() // use the real gc cutoff
+            *self.get_latest_gc_cutoff_lsn() // use the real gc cutoff
         };
         let mut compact_jobs = Vec::new();
         // For now, we simply use the key partitioning information; we should do a more fine-grained partitioning
@@ -1962,7 +1964,11 @@ impl Timeline {
             let gc_info = self.gc_info.read().unwrap();
             let mut retain_lsns_below_horizon = Vec::new();
             let gc_cutoff = {
-                let real_gc_cutoff = gc_info.cutoffs.select_min();
+                // Currently, gc-compaction only kicks in after the legacy gc has updated the gc_cutoff.
+                // Therefore, it can only clean up data that cannot be cleaned up with legacy gc, instead of
+                // cleaning everything that theoritically it could. In the future, it should use `self.gc_info`
+                // to get the truth data.
+                let real_gc_cutoff = *self.get_latest_gc_cutoff_lsn();
                 // The compaction algorithm will keep all keys above the gc_cutoff while keeping only necessary keys below the gc_cutoff for
                 // each of the retain_lsn. Therefore, if the user-provided `compact_below_lsn` is larger than the real gc cutoff, we will use
                 // the real cutoff.
diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index 881503046c..810a9723e0 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -121,9 +121,6 @@ page_cache_size=10
     assert vectored_average < 8
 
 
-@pytest.mark.skip(
-    "This is being fixed and tracked in https://github.com/neondatabase/neon/issues/9114"
-)
 @skip_in_debug_build("only run with release build")
 def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder):
     SMOKE_CONF = {
@@ -156,20 +153,20 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder):
         if i % 10 == 0:
             log.info(f"Running churn round {i}/{churn_rounds} ...")
 
-        ps_http.timeline_compact(
-            tenant_id,
-            timeline_id,
-            enhanced_gc_bottom_most_compaction=True,
-            body={
-                "scheduled": True,
-                "sub_compaction": True,
-                "compact_range": {
-                    "start": "000000000000000000000000000000000000",
-                    # skip the SLRU range for now -- it races with get-lsn-by-timestamp, TODO: fix this
-                    "end": "010000000000000000000000000000000000",
+            # Run gc-compaction every 10 rounds to ensure the test doesn't take too long time.
+            ps_http.timeline_compact(
+                tenant_id,
+                timeline_id,
+                enhanced_gc_bottom_most_compaction=True,
+                body={
+                    "scheduled": True,
+                    "sub_compaction": True,
+                    "compact_range": {
+                        "start": "000000000000000000000000000000000000",
+                        "end": "030000000000000000000000000000000000",
+                    },
                 },
-            },
-        )
+            )
 
         workload.churn_rows(row_count, env.pageserver.id)
 
@@ -181,6 +178,10 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder):
     log.info("Validating at workload end ...")
     workload.validate(env.pageserver.id)
 
+    # Run a legacy compaction+gc to ensure gc-compaction can coexist with legacy compaction.
+    ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True)
+    ps_http.timeline_gc(tenant_id, timeline_id, None)
+
 
 # Stripe sizes in number of pages.
 TINY_STRIPES = 16

From b593e51eaea6ec53af22503332c49488ff9bb055 Mon Sep 17 00:00:00 2001
From: Evan Fleming <evan.gordon.fleming@gmail.com>
Date: Mon, 9 Dec 2024 13:09:20 -0800
Subject: [PATCH 091/117] safekeeper: use arc for global timelines and config
 (#10051)

Hello! I was interested in potentially making some contributions to Neon
and looking through the issue backlog I found
[8200](https://github.com/neondatabase/neon/issues/8200) which seemed
like a good first issue to attempt to tackle. I see it was assigned a
while ago so apologies if I'm stepping on any toes with this PR. I also
apologize for the size of this PR. I'm not sure if there is a simple way
to reduce it given the footprint of the components being changed.

## Problem
This PR is attempting to address part of the problem outlined in issue
[8200](https://github.com/neondatabase/neon/issues/8200). Namely to
remove global static usage of timeline state in favour of
`Arc<GlobalTimelines>` and to replace wasteful clones of
`SafeKeeperConf` with `Arc<SafeKeeperConf>`. I did not opt to tackle
`RemoteStorage` in this PR to minimize the amount of changes as this PR
is already quite large. I also did not opt to introduce an
`SafekeeperApp` wrapper struct to similarly minimize changes but I can
tackle either or both of these omissions in this PR if folks would like.

## Summary of changes
- Remove static usage of `GlobalTimelines` in favour of
`Arc<GlobalTimelines>`
- Wrap `SafeKeeperConf` in `Arc` to avoid wasteful clones of the
underlying struct

## Some additional thoughts
- We seem to currently store `SafeKeeperConf` in `GlobalTimelines` and
then expose it through a public`get_global_config` function which
requires locking. This seems needlessly wasteful and based on observed
usage we could remove this public accessor and force consumers to
acquire `SafeKeeperConf` through the new Arc reference.
---
 safekeeper/benches/benchutils.rs       |  10 +-
 safekeeper/src/bin/safekeeper.rs       |  36 ++++---
 safekeeper/src/broker.rs               |  37 ++++---
 safekeeper/src/copy_timeline.rs        |  26 ++---
 safekeeper/src/debug_dump.rs           |  18 ++--
 safekeeper/src/handler.rs              |  14 ++-
 safekeeper/src/http/mod.rs             |   8 +-
 safekeeper/src/http/routes.rs          |  74 +++++++++-----
 safekeeper/src/json_ctrl.rs            |  30 +++---
 safekeeper/src/metrics.rs              |  19 ++--
 safekeeper/src/pull_timeline.rs        |  12 ++-
 safekeeper/src/receive_wal.rs          |  11 ++-
 safekeeper/src/send_wal.rs             |   6 +-
 safekeeper/src/timeline.rs             |  25 +++--
 safekeeper/src/timelines_global_map.rs | 128 ++++++++++++-------------
 safekeeper/src/wal_service.rs          |  22 +++--
 16 files changed, 283 insertions(+), 193 deletions(-)

diff --git a/safekeeper/benches/benchutils.rs b/safekeeper/benches/benchutils.rs
index 4e8dc58c49..48d796221b 100644
--- a/safekeeper/benches/benchutils.rs
+++ b/safekeeper/benches/benchutils.rs
@@ -83,14 +83,20 @@ impl Env {
         node_id: NodeId,
         ttid: TenantTimelineId,
     ) -> anyhow::Result<Arc<Timeline>> {
-        let conf = self.make_conf(node_id);
+        let conf = Arc::new(self.make_conf(node_id));
         let timeline_dir = get_timeline_dir(&conf, &ttid);
         let remote_path = remote_timeline_path(&ttid)?;
 
         let safekeeper = self.make_safekeeper(node_id, ttid).await?;
         let shared_state = SharedState::new(StateSK::Loaded(safekeeper));
 
-        let timeline = Timeline::new(ttid, &timeline_dir, &remote_path, shared_state);
+        let timeline = Timeline::new(
+            ttid,
+            &timeline_dir,
+            &remote_path,
+            shared_state,
+            conf.clone(),
+        );
         timeline.bootstrap(
             &mut timeline.write_shared_state().await,
             &conf,
diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index 4dc7edef37..13f6e34575 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -338,7 +338,7 @@ async fn main() -> anyhow::Result<()> {
         }
     };
 
-    let conf = SafeKeeperConf {
+    let conf = Arc::new(SafeKeeperConf {
         workdir,
         my_id: id,
         listen_pg_addr: args.listen_pg,
@@ -368,7 +368,7 @@ async fn main() -> anyhow::Result<()> {
         control_file_save_interval: args.control_file_save_interval,
         partial_backup_concurrency: args.partial_backup_concurrency,
         eviction_min_resident: args.eviction_min_resident,
-    };
+    });
 
     // initialize sentry if SENTRY_DSN is provided
     let _sentry_guard = init_sentry(
@@ -382,7 +382,7 @@ async fn main() -> anyhow::Result<()> {
 /// complete, e.g. panicked, inner is error produced by task itself.
 type JoinTaskRes = Result<anyhow::Result<()>, JoinError>;
 
-async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
+async fn start_safekeeper(conf: Arc<SafeKeeperConf>) -> Result<()> {
     // fsync the datadir to make sure we have a consistent state on disk.
     if !conf.no_sync {
         let dfd = File::open(&conf.workdir).context("open datadir for syncfs")?;
@@ -428,9 +428,11 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
         e
     })?;
 
+    let global_timelines = Arc::new(GlobalTimelines::new(conf.clone()));
+
     // Register metrics collector for active timelines. It's important to do this
     // after daemonizing, otherwise process collector will be upset.
-    let timeline_collector = safekeeper::metrics::TimelineCollector::new();
+    let timeline_collector = safekeeper::metrics::TimelineCollector::new(global_timelines.clone());
     metrics::register_internal(Box::new(timeline_collector))?;
 
     wal_backup::init_remote_storage(&conf).await;
@@ -447,9 +449,8 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
         .then(|| Handle::try_current().expect("no runtime in main"));
 
     // Load all timelines from disk to memory.
-    GlobalTimelines::init(conf.clone()).await?;
+    global_timelines.init().await?;
 
-    let conf_ = conf.clone();
     // Run everything in current thread rt, if asked.
     if conf.current_thread_runtime {
         info!("running in current thread runtime");
@@ -459,14 +460,16 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
         .as_ref()
         .unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle())
         .spawn(wal_service::task_main(
-            conf_,
+            conf.clone(),
             pg_listener,
             Scope::SafekeeperData,
+            global_timelines.clone(),
         ))
         // wrap with task name for error reporting
         .map(|res| ("WAL service main".to_owned(), res));
     tasks_handles.push(Box::pin(wal_service_handle));
 
+    let global_timelines_ = global_timelines.clone();
     let timeline_housekeeping_handle = current_thread_rt
         .as_ref()
         .unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle())
@@ -474,40 +477,45 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
             const TOMBSTONE_TTL: Duration = Duration::from_secs(3600 * 24);
             loop {
                 tokio::time::sleep(TOMBSTONE_TTL).await;
-                GlobalTimelines::housekeeping(&TOMBSTONE_TTL);
+                global_timelines_.housekeeping(&TOMBSTONE_TTL);
             }
         })
         .map(|res| ("Timeline map housekeeping".to_owned(), res));
     tasks_handles.push(Box::pin(timeline_housekeeping_handle));
 
     if let Some(pg_listener_tenant_only) = pg_listener_tenant_only {
-        let conf_ = conf.clone();
         let wal_service_handle = current_thread_rt
             .as_ref()
             .unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle())
             .spawn(wal_service::task_main(
-                conf_,
+                conf.clone(),
                 pg_listener_tenant_only,
                 Scope::Tenant,
+                global_timelines.clone(),
             ))
             // wrap with task name for error reporting
             .map(|res| ("WAL service tenant only main".to_owned(), res));
         tasks_handles.push(Box::pin(wal_service_handle));
     }
 
-    let conf_ = conf.clone();
     let http_handle = current_thread_rt
         .as_ref()
         .unwrap_or_else(|| HTTP_RUNTIME.handle())
-        .spawn(http::task_main(conf_, http_listener))
+        .spawn(http::task_main(
+            conf.clone(),
+            http_listener,
+            global_timelines.clone(),
+        ))
         .map(|res| ("HTTP service main".to_owned(), res));
     tasks_handles.push(Box::pin(http_handle));
 
-    let conf_ = conf.clone();
     let broker_task_handle = current_thread_rt
         .as_ref()
         .unwrap_or_else(|| BROKER_RUNTIME.handle())
-        .spawn(broker::task_main(conf_).instrument(info_span!("broker")))
+        .spawn(
+            broker::task_main(conf.clone(), global_timelines.clone())
+                .instrument(info_span!("broker")),
+        )
         .map(|res| ("broker main".to_owned(), res));
     tasks_handles.push(Box::pin(broker_task_handle));
 
diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs
index 485816408f..4b091e2c29 100644
--- a/safekeeper/src/broker.rs
+++ b/safekeeper/src/broker.rs
@@ -39,14 +39,17 @@ const RETRY_INTERVAL_MSEC: u64 = 1000;
 const PUSH_INTERVAL_MSEC: u64 = 1000;
 
 /// Push once in a while data about all active timelines to the broker.
-async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
+async fn push_loop(
+    conf: Arc<SafeKeeperConf>,
+    global_timelines: Arc<GlobalTimelines>,
+) -> anyhow::Result<()> {
     if conf.disable_periodic_broker_push {
         info!("broker push_loop is disabled, doing nothing...");
         futures::future::pending::<()>().await; // sleep forever
         return Ok(());
     }
 
-    let active_timelines_set = GlobalTimelines::get_global_broker_active_set();
+    let active_timelines_set = global_timelines.get_global_broker_active_set();
 
     let mut client =
         storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
@@ -87,8 +90,13 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
 
 /// Subscribe and fetch all the interesting data from the broker.
 #[instrument(name = "broker_pull", skip_all)]
-async fn pull_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()> {
-    let mut client = storage_broker::connect(conf.broker_endpoint, conf.broker_keepalive_interval)?;
+async fn pull_loop(
+    conf: Arc<SafeKeeperConf>,
+    global_timelines: Arc<GlobalTimelines>,
+    stats: Arc<BrokerStats>,
+) -> Result<()> {
+    let mut client =
+        storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
 
     // TODO: subscribe only to local timelines instead of all
     let request = SubscribeSafekeeperInfoRequest {
@@ -113,7 +121,7 @@ async fn pull_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()>
             .as_ref()
             .ok_or_else(|| anyhow!("missing tenant_timeline_id"))?;
         let ttid = parse_proto_ttid(proto_ttid)?;
-        if let Ok(tli) = GlobalTimelines::get(ttid) {
+        if let Ok(tli) = global_timelines.get(ttid) {
             // Note that we also receive *our own* info. That's
             // important, as it is used as an indication of live
             // connection to the broker.
@@ -135,7 +143,11 @@ async fn pull_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()>
 
 /// Process incoming discover requests. This is done in a separate task to avoid
 /// interfering with the normal pull/push loops.
-async fn discover_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()> {
+async fn discover_loop(
+    conf: Arc<SafeKeeperConf>,
+    global_timelines: Arc<GlobalTimelines>,
+    stats: Arc<BrokerStats>,
+) -> Result<()> {
     let mut client =
         storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
 
@@ -171,7 +183,7 @@ async fn discover_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<
                     .as_ref()
                     .ok_or_else(|| anyhow!("missing tenant_timeline_id"))?;
                 let ttid = parse_proto_ttid(proto_ttid)?;
-                if let Ok(tli) = GlobalTimelines::get(ttid) {
+                if let Ok(tli) = global_timelines.get(ttid) {
                     // we received a discovery request for a timeline we know about
                     discover_counter.inc();
 
@@ -210,7 +222,10 @@ async fn discover_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<
     bail!("end of stream");
 }
 
-pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
+pub async fn task_main(
+    conf: Arc<SafeKeeperConf>,
+    global_timelines: Arc<GlobalTimelines>,
+) -> anyhow::Result<()> {
     info!("started, broker endpoint {:?}", conf.broker_endpoint);
 
     let mut ticker = tokio::time::interval(Duration::from_millis(RETRY_INTERVAL_MSEC));
@@ -261,13 +276,13 @@ pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
                 },
                 _ = ticker.tick() => {
                     if push_handle.is_none() {
-                        push_handle = Some(tokio::spawn(push_loop(conf.clone())));
+                        push_handle = Some(tokio::spawn(push_loop(conf.clone(), global_timelines.clone())));
                     }
                     if pull_handle.is_none() {
-                        pull_handle = Some(tokio::spawn(pull_loop(conf.clone(), stats.clone())));
+                        pull_handle = Some(tokio::spawn(pull_loop(conf.clone(), global_timelines.clone(), stats.clone())));
                     }
                     if discover_handle.is_none() {
-                        discover_handle = Some(tokio::spawn(discover_loop(conf.clone(), stats.clone())));
+                        discover_handle = Some(tokio::spawn(discover_loop(conf.clone(), global_timelines.clone(), stats.clone())));
                     }
                 },
                 _ = &mut stats_task => {}
diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs
index 07fa98212f..28ef2b1d23 100644
--- a/safekeeper/src/copy_timeline.rs
+++ b/safekeeper/src/copy_timeline.rs
@@ -1,9 +1,7 @@
-use std::sync::Arc;
-
 use anyhow::{bail, Result};
 use camino::Utf8PathBuf;
-
 use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE};
+use std::sync::Arc;
 use tokio::{
     fs::OpenOptions,
     io::{AsyncSeekExt, AsyncWriteExt},
@@ -14,7 +12,7 @@ use utils::{id::TenantTimelineId, lsn::Lsn};
 use crate::{
     control_file::FileStorage,
     state::TimelinePersistentState,
-    timeline::{Timeline, TimelineError, WalResidentTimeline},
+    timeline::{TimelineError, WalResidentTimeline},
     timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline},
     wal_backup::copy_s3_segments,
     wal_storage::{wal_file_paths, WalReader},
@@ -25,16 +23,19 @@ use crate::{
 const MAX_BACKUP_LAG: u64 = 10 * WAL_SEGMENT_SIZE as u64;
 
 pub struct Request {
-    pub source: Arc<Timeline>,
+    pub source_ttid: TenantTimelineId,
     pub until_lsn: Lsn,
     pub destination_ttid: TenantTimelineId,
 }
 
-pub async fn handle_request(request: Request) -> Result<()> {
+pub async fn handle_request(
+    request: Request,
+    global_timelines: Arc<GlobalTimelines>,
+) -> Result<()> {
     // TODO: request.until_lsn MUST be a valid LSN, and we cannot check it :(
     //   if LSN will point to the middle of a WAL record, timeline will be in "broken" state
 
-    match GlobalTimelines::get(request.destination_ttid) {
+    match global_timelines.get(request.destination_ttid) {
         // timeline already exists. would be good to check that this timeline is the copy
         // of the source timeline, but it isn't obvious how to do that
         Ok(_) => return Ok(()),
@@ -46,9 +47,10 @@ pub async fn handle_request(request: Request) -> Result<()> {
         }
     }
 
-    let source_tli = request.source.wal_residence_guard().await?;
+    let source = global_timelines.get(request.source_ttid)?;
+    let source_tli = source.wal_residence_guard().await?;
 
-    let conf = &GlobalTimelines::get_global_config();
+    let conf = &global_timelines.get_global_config();
     let ttid = request.destination_ttid;
 
     let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?;
@@ -127,7 +129,7 @@ pub async fn handle_request(request: Request) -> Result<()> {
 
     copy_s3_segments(
         wal_seg_size,
-        &request.source.ttid,
+        &request.source_ttid,
         &request.destination_ttid,
         first_segment,
         first_ondisk_segment,
@@ -158,7 +160,9 @@ pub async fn handle_request(request: Request) -> Result<()> {
 
     // now we have a ready timeline in a temp directory
     validate_temp_timeline(conf, request.destination_ttid, &tli_dir_path).await?;
-    GlobalTimelines::load_temp_timeline(request.destination_ttid, &tli_dir_path, true).await?;
+    global_timelines
+        .load_temp_timeline(request.destination_ttid, &tli_dir_path, true)
+        .await?;
 
     Ok(())
 }
diff --git a/safekeeper/src/debug_dump.rs b/safekeeper/src/debug_dump.rs
index a2d0c49768..93011eddec 100644
--- a/safekeeper/src/debug_dump.rs
+++ b/safekeeper/src/debug_dump.rs
@@ -207,23 +207,23 @@ pub struct FileInfo {
 }
 
 /// Build debug dump response, using the provided [`Args`] filters.
-pub async fn build(args: Args) -> Result<Response> {
+pub async fn build(args: Args, global_timelines: Arc<GlobalTimelines>) -> Result<Response> {
     let start_time = Utc::now();
-    let timelines_count = GlobalTimelines::timelines_count();
-    let config = GlobalTimelines::get_global_config();
+    let timelines_count = global_timelines.timelines_count();
+    let config = global_timelines.get_global_config();
 
     let ptrs_snapshot = if args.tenant_id.is_some() && args.timeline_id.is_some() {
         // If both tenant_id and timeline_id are specified, we can just get the
         // timeline directly, without taking a snapshot of the whole list.
         let ttid = TenantTimelineId::new(args.tenant_id.unwrap(), args.timeline_id.unwrap());
-        if let Ok(tli) = GlobalTimelines::get(ttid) {
+        if let Ok(tli) = global_timelines.get(ttid) {
             vec![tli]
         } else {
             vec![]
         }
     } else {
         // Otherwise, take a snapshot of the whole list.
-        GlobalTimelines::get_all()
+        global_timelines.get_all()
     };
 
     let mut timelines = Vec::new();
@@ -344,12 +344,12 @@ fn get_wal_last_modified(path: &Utf8Path) -> Result<Option<DateTime<Utc>>> {
 
 /// Converts SafeKeeperConf to Config, filtering out the fields that are not
 /// supposed to be exposed.
-fn build_config(config: SafeKeeperConf) -> Config {
+fn build_config(config: Arc<SafeKeeperConf>) -> Config {
     Config {
         id: config.my_id,
-        workdir: config.workdir.into(),
-        listen_pg_addr: config.listen_pg_addr,
-        listen_http_addr: config.listen_http_addr,
+        workdir: config.workdir.clone().into(),
+        listen_pg_addr: config.listen_pg_addr.clone(),
+        listen_http_addr: config.listen_http_addr.clone(),
         no_sync: config.no_sync,
         max_offloader_lag_bytes: config.max_offloader_lag_bytes,
         wal_backup_enabled: config.wal_backup_enabled,
diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs
index 8dd2929a03..2ca6333ba8 100644
--- a/safekeeper/src/handler.rs
+++ b/safekeeper/src/handler.rs
@@ -33,7 +33,7 @@ use utils::{
 
 /// Safekeeper handler of postgres commands
 pub struct SafekeeperPostgresHandler {
-    pub conf: SafeKeeperConf,
+    pub conf: Arc<SafeKeeperConf>,
     /// assigned application name
     pub appname: Option<String>,
     pub tenant_id: Option<TenantId>,
@@ -43,6 +43,7 @@ pub struct SafekeeperPostgresHandler {
     pub protocol: Option<PostgresClientProtocol>,
     /// Unique connection id is logged in spans for observability.
     pub conn_id: ConnectionId,
+    pub global_timelines: Arc<GlobalTimelines>,
     /// Auth scope allowed on the connections and public key used to check auth tokens. None if auth is not configured.
     auth: Option<(Scope, Arc<JwtAuth>)>,
     claims: Option<Claims>,
@@ -314,10 +315,11 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
 
 impl SafekeeperPostgresHandler {
     pub fn new(
-        conf: SafeKeeperConf,
+        conf: Arc<SafeKeeperConf>,
         conn_id: u32,
         io_metrics: Option<TrafficMetrics>,
         auth: Option<(Scope, Arc<JwtAuth>)>,
+        global_timelines: Arc<GlobalTimelines>,
     ) -> Self {
         SafekeeperPostgresHandler {
             conf,
@@ -331,6 +333,7 @@ impl SafekeeperPostgresHandler {
             claims: None,
             auth,
             io_metrics,
+            global_timelines,
         }
     }
 
@@ -360,7 +363,7 @@ impl SafekeeperPostgresHandler {
         pgb: &mut PostgresBackend<IO>,
     ) -> Result<(), QueryError> {
         // Get timeline, handling "not found" error
-        let tli = match GlobalTimelines::get(self.ttid) {
+        let tli = match self.global_timelines.get(self.ttid) {
             Ok(tli) => Ok(Some(tli)),
             Err(TimelineError::NotFound(_)) => Ok(None),
             Err(e) => Err(QueryError::Other(e.into())),
@@ -394,7 +397,10 @@ impl SafekeeperPostgresHandler {
         &mut self,
         pgb: &mut PostgresBackend<IO>,
     ) -> Result<(), QueryError> {
-        let tli = GlobalTimelines::get(self.ttid).map_err(|e| QueryError::Other(e.into()))?;
+        let tli = self
+            .global_timelines
+            .get(self.ttid)
+            .map_err(|e| QueryError::Other(e.into()))?;
 
         let lsn = if self.is_walproposer_recovery() {
             // walproposer should get all local WAL until flush_lsn
diff --git a/safekeeper/src/http/mod.rs b/safekeeper/src/http/mod.rs
index 52fb13ff5b..7229ccb739 100644
--- a/safekeeper/src/http/mod.rs
+++ b/safekeeper/src/http/mod.rs
@@ -3,14 +3,16 @@ pub mod routes;
 pub use routes::make_router;
 
 pub use safekeeper_api::models;
+use std::sync::Arc;
 
-use crate::SafeKeeperConf;
+use crate::{GlobalTimelines, SafeKeeperConf};
 
 pub async fn task_main(
-    conf: SafeKeeperConf,
+    conf: Arc<SafeKeeperConf>,
     http_listener: std::net::TcpListener,
+    global_timelines: Arc<GlobalTimelines>,
 ) -> anyhow::Result<()> {
-    let router = make_router(conf)
+    let router = make_router(conf, global_timelines)
         .build()
         .map_err(|err| anyhow::anyhow!(err))?;
     let service = utils::http::RouterService::new(router).unwrap();
diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index 69b775fd76..71c36f1d46 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -66,6 +66,13 @@ fn get_conf(request: &Request<Body>) -> &SafeKeeperConf {
         .as_ref()
 }
 
+fn get_global_timelines(request: &Request<Body>) -> Arc<GlobalTimelines> {
+    request
+        .data::<Arc<GlobalTimelines>>()
+        .expect("unknown state type")
+        .clone()
+}
+
 /// Same as TermLsn, but serializes LSN using display serializer
 /// in Postgres format, i.e. 0/FFFFFFFF. Used only for the API response.
 #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
@@ -123,9 +130,11 @@ async fn tenant_delete_handler(mut request: Request<Body>) -> Result<Response<Bo
     let only_local = parse_query_param(&request, "only_local")?.unwrap_or(false);
     check_permission(&request, Some(tenant_id))?;
     ensure_no_body(&mut request).await?;
+    let global_timelines = get_global_timelines(&request);
     // FIXME: `delete_force_all_for_tenant` can return an error for multiple different reasons;
     // Using an `InternalServerError` should be fixed when the types support it
-    let delete_info = GlobalTimelines::delete_force_all_for_tenant(&tenant_id, only_local)
+    let delete_info = global_timelines
+        .delete_force_all_for_tenant(&tenant_id, only_local)
         .await
         .map_err(ApiError::InternalServerError)?;
     json_response(
@@ -156,7 +165,9 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
             .commit_lsn
             .segment_lsn(server_info.wal_seg_size as usize)
     });
-    GlobalTimelines::create(ttid, server_info, request_data.commit_lsn, local_start_lsn)
+    let global_timelines = get_global_timelines(&request);
+    global_timelines
+        .create(ttid, server_info, request_data.commit_lsn, local_start_lsn)
         .await
         .map_err(ApiError::InternalServerError)?;
 
@@ -167,7 +178,9 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
 /// Note: it is possible to do the same with debug_dump.
 async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permission(&request, None)?;
-    let res: Vec<TenantTimelineId> = GlobalTimelines::get_all()
+    let global_timelines = get_global_timelines(&request);
+    let res: Vec<TenantTimelineId> = global_timelines
+        .get_all()
         .iter()
         .map(|tli| tli.ttid)
         .collect();
@@ -182,7 +195,8 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
     );
     check_permission(&request, Some(ttid.tenant_id))?;
 
-    let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
+    let global_timelines = get_global_timelines(&request);
+    let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
     let (inmem, state) = tli.get_state().await;
     let flush_lsn = tli.get_flush_lsn().await;
 
@@ -233,9 +247,11 @@ async fn timeline_delete_handler(mut request: Request<Body>) -> Result<Response<
     let only_local = parse_query_param(&request, "only_local")?.unwrap_or(false);
     check_permission(&request, Some(ttid.tenant_id))?;
     ensure_no_body(&mut request).await?;
+    let global_timelines = get_global_timelines(&request);
     // FIXME: `delete_force` can fail from both internal errors and bad requests. Add better
     // error handling here when we're able to.
-    let resp = GlobalTimelines::delete(&ttid, only_local)
+    let resp = global_timelines
+        .delete(&ttid, only_local)
         .await
         .map_err(ApiError::InternalServerError)?;
     json_response(StatusCode::OK, resp)
@@ -247,8 +263,9 @@ async fn timeline_pull_handler(mut request: Request<Body>) -> Result<Response<Bo
 
     let data: pull_timeline::Request = json_request(&mut request).await?;
     let conf = get_conf(&request);
+    let global_timelines = get_global_timelines(&request);
 
-    let resp = pull_timeline::handle_request(data, conf.sk_auth_token.clone())
+    let resp = pull_timeline::handle_request(data, conf.sk_auth_token.clone(), global_timelines)
         .await
         .map_err(ApiError::InternalServerError)?;
     json_response(StatusCode::OK, resp)
@@ -263,7 +280,8 @@ async fn timeline_snapshot_handler(request: Request<Body>) -> Result<Response<Bo
     );
     check_permission(&request, Some(ttid.tenant_id))?;
 
-    let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
+    let global_timelines = get_global_timelines(&request);
+    let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
 
     // To stream the body use wrap_stream which wants Stream of Result<Bytes>,
     // so create the chan and write to it in another task.
@@ -293,19 +311,19 @@ async fn timeline_copy_handler(mut request: Request<Body>) -> Result<Response<Bo
     check_permission(&request, None)?;
 
     let request_data: TimelineCopyRequest = json_request(&mut request).await?;
-    let ttid = TenantTimelineId::new(
+    let source_ttid = TenantTimelineId::new(
         parse_request_param(&request, "tenant_id")?,
         parse_request_param(&request, "source_timeline_id")?,
     );
 
-    let source = GlobalTimelines::get(ttid)?;
+    let global_timelines = get_global_timelines(&request);
 
     copy_timeline::handle_request(copy_timeline::Request{
-        source,
+        source_ttid,
         until_lsn: request_data.until_lsn,
-        destination_ttid: TenantTimelineId::new(ttid.tenant_id, request_data.target_timeline_id),
-    })
-        .instrument(info_span!("copy_timeline", from=%ttid, to=%request_data.target_timeline_id, until_lsn=%request_data.until_lsn))
+        destination_ttid: TenantTimelineId::new(source_ttid.tenant_id, request_data.target_timeline_id),
+    }, global_timelines)
+        .instrument(info_span!("copy_timeline", from=%source_ttid, to=%request_data.target_timeline_id, until_lsn=%request_data.until_lsn))
         .await
         .map_err(ApiError::InternalServerError)?;
 
@@ -322,7 +340,8 @@ async fn patch_control_file_handler(
         parse_request_param(&request, "timeline_id")?,
     );
 
-    let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
+    let global_timelines = get_global_timelines(&request);
+    let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
 
     let patch_request: patch_control_file::Request = json_request(&mut request).await?;
     let response = patch_control_file::handle_request(tli, patch_request)
@@ -341,7 +360,8 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
         parse_request_param(&request, "timeline_id")?,
     );
 
-    let tli = GlobalTimelines::get(ttid)?;
+    let global_timelines = get_global_timelines(&request);
+    let tli = global_timelines.get(ttid)?;
     tli.write_shared_state()
         .await
         .sk
@@ -359,6 +379,7 @@ async fn timeline_digest_handler(request: Request<Body>) -> Result<Response<Body
     );
     check_permission(&request, Some(ttid.tenant_id))?;
 
+    let global_timelines = get_global_timelines(&request);
     let from_lsn: Option<Lsn> = parse_query_param(&request, "from_lsn")?;
     let until_lsn: Option<Lsn> = parse_query_param(&request, "until_lsn")?;
 
@@ -371,7 +392,7 @@ async fn timeline_digest_handler(request: Request<Body>) -> Result<Response<Body
         )))?,
     };
 
-    let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
+    let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
     let tli = tli
         .wal_residence_guard()
         .await
@@ -393,7 +414,8 @@ async fn timeline_backup_partial_reset(request: Request<Body>) -> Result<Respons
     );
     check_permission(&request, Some(ttid.tenant_id))?;
 
-    let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
+    let global_timelines = get_global_timelines(&request);
+    let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
 
     let response = tli
         .backup_partial_reset()
@@ -415,7 +437,8 @@ async fn timeline_term_bump_handler(
 
     let request_data: TimelineTermBumpRequest = json_request(&mut request).await?;
 
-    let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
+    let global_timelines = get_global_timelines(&request);
+    let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
     let response = tli
         .term_bump(request_data.term)
         .await
@@ -452,7 +475,8 @@ async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<B
         standby_horizon: sk_info.standby_horizon.0,
     };
 
-    let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
+    let global_timelines = get_global_timelines(&request);
+    let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
     tli.record_safekeeper_info(proto_sk_info)
         .await
         .map_err(ApiError::InternalServerError)?;
@@ -506,6 +530,8 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
     let dump_term_history = dump_term_history.unwrap_or(true);
     let dump_wal_last_modified = dump_wal_last_modified.unwrap_or(dump_all);
 
+    let global_timelines = get_global_timelines(&request);
+
     let args = debug_dump::Args {
         dump_all,
         dump_control_file,
@@ -517,7 +543,7 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
         timeline_id,
     };
 
-    let resp = debug_dump::build(args)
+    let resp = debug_dump::build(args, global_timelines)
         .await
         .map_err(ApiError::InternalServerError)?;
 
@@ -570,7 +596,10 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
 }
 
 /// Safekeeper http router.
-pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError> {
+pub fn make_router(
+    conf: Arc<SafeKeeperConf>,
+    global_timelines: Arc<GlobalTimelines>,
+) -> RouterBuilder<hyper::Body, ApiError> {
     let mut router = endpoint::make_router();
     if conf.http_auth.is_some() {
         router = router.middleware(auth_middleware(|request| {
@@ -592,7 +621,8 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
     // located nearby (/safekeeper/src/http/openapi_spec.yaml).
     let auth = conf.http_auth.clone();
     router
-        .data(Arc::new(conf))
+        .data(conf)
+        .data(global_timelines)
         .data(auth)
         .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
         .get("/profile/cpu", |r| request_span(r, profile_cpu_handler))
diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs
index 0573ea81e7..dc4ad3706e 100644
--- a/safekeeper/src/json_ctrl.rs
+++ b/safekeeper/src/json_ctrl.rs
@@ -11,7 +11,6 @@ use postgres_backend::QueryError;
 use serde::{Deserialize, Serialize};
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::*;
-use utils::id::TenantTimelineId;
 
 use crate::handler::SafekeeperPostgresHandler;
 use crate::safekeeper::{AcceptorProposerMessage, AppendResponse, ServerInfo};
@@ -21,7 +20,6 @@ use crate::safekeeper::{
 use crate::safekeeper::{Term, TermHistory, TermLsn};
 use crate::state::TimelinePersistentState;
 use crate::timeline::WalResidentTimeline;
-use crate::GlobalTimelines;
 use postgres_backend::PostgresBackend;
 use postgres_ffi::encode_logical_message;
 use postgres_ffi::WAL_SEGMENT_SIZE;
@@ -70,7 +68,7 @@ pub async fn handle_json_ctrl<IO: AsyncRead + AsyncWrite + Unpin>(
     info!("JSON_CTRL request: {append_request:?}");
 
     // need to init safekeeper state before AppendRequest
-    let tli = prepare_safekeeper(spg.ttid, append_request.pg_version).await?;
+    let tli = prepare_safekeeper(spg, append_request.pg_version).await?;
 
     // if send_proposer_elected is true, we need to update local history
     if append_request.send_proposer_elected {
@@ -99,20 +97,22 @@ pub async fn handle_json_ctrl<IO: AsyncRead + AsyncWrite + Unpin>(
 /// Prepare safekeeper to process append requests without crashes,
 /// by sending ProposerGreeting with default server.wal_seg_size.
 async fn prepare_safekeeper(
-    ttid: TenantTimelineId,
+    spg: &SafekeeperPostgresHandler,
     pg_version: u32,
 ) -> anyhow::Result<WalResidentTimeline> {
-    let tli = GlobalTimelines::create(
-        ttid,
-        ServerInfo {
-            pg_version,
-            wal_seg_size: WAL_SEGMENT_SIZE as u32,
-            system_id: 0,
-        },
-        Lsn::INVALID,
-        Lsn::INVALID,
-    )
-    .await?;
+    let tli = spg
+        .global_timelines
+        .create(
+            spg.ttid,
+            ServerInfo {
+                pg_version,
+                wal_seg_size: WAL_SEGMENT_SIZE as u32,
+                system_id: 0,
+            },
+            Lsn::INVALID,
+            Lsn::INVALID,
+        )
+        .await?;
 
     tli.wal_residence_guard().await
 }
diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs
index bbd2f86898..5883f402c7 100644
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -455,6 +455,7 @@ pub struct FullTimelineInfo {
 
 /// Collects metrics for all active timelines.
 pub struct TimelineCollector {
+    global_timelines: Arc<GlobalTimelines>,
     descs: Vec<Desc>,
     commit_lsn: GenericGaugeVec<AtomicU64>,
     backup_lsn: GenericGaugeVec<AtomicU64>,
@@ -478,14 +479,8 @@ pub struct TimelineCollector {
     active_timelines_count: IntGauge,
 }
 
-impl Default for TimelineCollector {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
 impl TimelineCollector {
-    pub fn new() -> TimelineCollector {
+    pub fn new(global_timelines: Arc<GlobalTimelines>) -> TimelineCollector {
         let mut descs = Vec::new();
 
         let commit_lsn = GenericGaugeVec::new(
@@ -676,6 +671,7 @@ impl TimelineCollector {
         descs.extend(active_timelines_count.desc().into_iter().cloned());
 
         TimelineCollector {
+            global_timelines,
             descs,
             commit_lsn,
             backup_lsn,
@@ -728,17 +724,18 @@ impl Collector for TimelineCollector {
         self.written_wal_seconds.reset();
         self.flushed_wal_seconds.reset();
 
-        let timelines_count = GlobalTimelines::get_all().len();
+        let timelines_count = self.global_timelines.get_all().len();
         let mut active_timelines_count = 0;
 
         // Prometheus Collector is sync, and data is stored under async lock. To
         // bridge the gap with a crutch, collect data in spawned thread with
         // local tokio runtime.
+        let global_timelines = self.global_timelines.clone();
         let infos = std::thread::spawn(|| {
             let rt = tokio::runtime::Builder::new_current_thread()
                 .build()
                 .expect("failed to create rt");
-            rt.block_on(collect_timeline_metrics())
+            rt.block_on(collect_timeline_metrics(global_timelines))
         })
         .join()
         .expect("collect_timeline_metrics thread panicked");
@@ -857,9 +854,9 @@ impl Collector for TimelineCollector {
     }
 }
 
-async fn collect_timeline_metrics() -> Vec<FullTimelineInfo> {
+async fn collect_timeline_metrics(global_timelines: Arc<GlobalTimelines>) -> Vec<FullTimelineInfo> {
     let mut res = vec![];
-    let active_timelines = GlobalTimelines::get_global_broker_active_set().get_all();
+    let active_timelines = global_timelines.get_global_broker_active_set().get_all();
 
     for tli in active_timelines {
         if let Some(info) = tli.info_for_metrics().await {
diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs
index c700e18cc7..f58a9dca1d 100644
--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -409,8 +409,9 @@ pub struct DebugDumpResponse {
 pub async fn handle_request(
     request: Request,
     sk_auth_token: Option<SecretString>,
+    global_timelines: Arc<GlobalTimelines>,
 ) -> Result<Response> {
-    let existing_tli = GlobalTimelines::get(TenantTimelineId::new(
+    let existing_tli = global_timelines.get(TenantTimelineId::new(
         request.tenant_id,
         request.timeline_id,
     ));
@@ -453,13 +454,14 @@ pub async fn handle_request(
     assert!(status.tenant_id == request.tenant_id);
     assert!(status.timeline_id == request.timeline_id);
 
-    pull_timeline(status, safekeeper_host, sk_auth_token).await
+    pull_timeline(status, safekeeper_host, sk_auth_token, global_timelines).await
 }
 
 async fn pull_timeline(
     status: TimelineStatus,
     host: String,
     sk_auth_token: Option<SecretString>,
+    global_timelines: Arc<GlobalTimelines>,
 ) -> Result<Response> {
     let ttid = TenantTimelineId::new(status.tenant_id, status.timeline_id);
     info!(
@@ -472,7 +474,7 @@ async fn pull_timeline(
         status.acceptor_state.epoch
     );
 
-    let conf = &GlobalTimelines::get_global_config();
+    let conf = &global_timelines.get_global_config();
 
     let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?;
 
@@ -531,7 +533,9 @@ async fn pull_timeline(
     assert!(status.commit_lsn <= status.flush_lsn);
 
     // Finally, load the timeline.
-    let _tli = GlobalTimelines::load_temp_timeline(ttid, &tli_dir_path, false).await?;
+    let _tli = global_timelines
+        .load_temp_timeline(ttid, &tli_dir_path, false)
+        .await?;
 
     Ok(Response {
         safekeeper_host: host,
diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs
index bfa1764abf..2a49890d61 100644
--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -267,6 +267,7 @@ impl SafekeeperPostgresHandler {
             pgb_reader: &mut pgb_reader,
             peer_addr,
             acceptor_handle: &mut acceptor_handle,
+            global_timelines: self.global_timelines.clone(),
         };
 
         // Read first message and create timeline if needed.
@@ -331,6 +332,7 @@ struct NetworkReader<'a, IO> {
     // WalAcceptor is spawned when we learn server info from walproposer and
     // create timeline; handle is put here.
     acceptor_handle: &'a mut Option<JoinHandle<anyhow::Result<()>>>,
+    global_timelines: Arc<GlobalTimelines>,
 }
 
 impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
@@ -350,10 +352,11 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
                     system_id: greeting.system_id,
                     wal_seg_size: greeting.wal_seg_size,
                 };
-                let tli =
-                    GlobalTimelines::create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID)
-                        .await
-                        .context("create timeline")?;
+                let tli = self
+                    .global_timelines
+                    .create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID)
+                    .await
+                    .context("create timeline")?;
                 tli.wal_residence_guard().await?
             }
             _ => {
diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs
index 225b7f4c05..0887cf7264 100644
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -10,7 +10,6 @@ use crate::timeline::WalResidentTimeline;
 use crate::wal_reader_stream::WalReaderStreamBuilder;
 use crate::wal_service::ConnectionId;
 use crate::wal_storage::WalReader;
-use crate::GlobalTimelines;
 use anyhow::{bail, Context as AnyhowContext};
 use bytes::Bytes;
 use futures::future::Either;
@@ -400,7 +399,10 @@ impl SafekeeperPostgresHandler {
         start_pos: Lsn,
         term: Option<Term>,
     ) -> Result<(), QueryError> {
-        let tli = GlobalTimelines::get(self.ttid).map_err(|e| QueryError::Other(e.into()))?;
+        let tli = self
+            .global_timelines
+            .get(self.ttid)
+            .map_err(|e| QueryError::Other(e.into()))?;
         let residence_guard = tli.wal_residence_guard().await?;
 
         if let Err(end) = self
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index ef928f7633..94d6ef1061 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -44,8 +44,8 @@ use crate::wal_backup_partial::PartialRemoteSegment;
 
 use crate::metrics::{FullTimelineInfo, WalStorageMetrics, MISC_OPERATION_SECONDS};
 use crate::wal_storage::{Storage as wal_storage_iface, WalReader};
+use crate::SafeKeeperConf;
 use crate::{debug_dump, timeline_manager, wal_storage};
-use crate::{GlobalTimelines, SafeKeeperConf};
 
 /// Things safekeeper should know about timeline state on peers.
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -467,6 +467,7 @@ pub struct Timeline {
     walreceivers: Arc<WalReceivers>,
     timeline_dir: Utf8PathBuf,
     manager_ctl: ManagerCtl,
+    conf: Arc<SafeKeeperConf>,
 
     /// Hold this gate from code that depends on the Timeline's non-shut-down state.  While holding
     /// this gate, you must respect [`Timeline::cancel`]
@@ -489,6 +490,7 @@ impl Timeline {
         timeline_dir: &Utf8Path,
         remote_path: &RemotePath,
         shared_state: SharedState,
+        conf: Arc<SafeKeeperConf>,
     ) -> Arc<Self> {
         let (commit_lsn_watch_tx, commit_lsn_watch_rx) =
             watch::channel(shared_state.sk.state().commit_lsn);
@@ -516,6 +518,7 @@ impl Timeline {
             gate: Default::default(),
             cancel: CancellationToken::default(),
             manager_ctl: ManagerCtl::new(),
+            conf,
             broker_active: AtomicBool::new(false),
             wal_backup_active: AtomicBool::new(false),
             last_removed_segno: AtomicU64::new(0),
@@ -524,11 +527,14 @@ impl Timeline {
     }
 
     /// Load existing timeline from disk.
-    pub fn load_timeline(conf: &SafeKeeperConf, ttid: TenantTimelineId) -> Result<Arc<Timeline>> {
+    pub fn load_timeline(
+        conf: Arc<SafeKeeperConf>,
+        ttid: TenantTimelineId,
+    ) -> Result<Arc<Timeline>> {
         let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered();
 
-        let shared_state = SharedState::restore(conf, &ttid)?;
-        let timeline_dir = get_timeline_dir(conf, &ttid);
+        let shared_state = SharedState::restore(conf.as_ref(), &ttid)?;
+        let timeline_dir = get_timeline_dir(conf.as_ref(), &ttid);
         let remote_path = remote_timeline_path(&ttid)?;
 
         Ok(Timeline::new(
@@ -536,6 +542,7 @@ impl Timeline {
             &timeline_dir,
             &remote_path,
             shared_state,
+            conf,
         ))
     }
 
@@ -604,8 +611,7 @@ impl Timeline {
         // it is cancelled, so WAL storage won't be opened again.
         shared_state.sk.close_wal_store();
 
-        let conf = GlobalTimelines::get_global_config();
-        if !only_local && conf.is_wal_backup_enabled() {
+        if !only_local && self.conf.is_wal_backup_enabled() {
             // Note: we concurrently delete remote storage data from multiple
             // safekeepers. That's ok, s3 replies 200 if object doesn't exist and we
             // do some retries anyway.
@@ -951,7 +957,7 @@ impl WalResidentTimeline {
 
     pub async fn get_walreader(&self, start_lsn: Lsn) -> Result<WalReader> {
         let (_, persisted_state) = self.get_state().await;
-        let enable_remote_read = GlobalTimelines::get_global_config().is_wal_backup_enabled();
+        let enable_remote_read = self.conf.is_wal_backup_enabled();
 
         WalReader::new(
             &self.ttid,
@@ -1061,7 +1067,6 @@ impl ManagerTimeline {
 
     /// Try to switch state Offloaded->Present.
     pub(crate) async fn switch_to_present(&self) -> anyhow::Result<()> {
-        let conf = GlobalTimelines::get_global_config();
         let mut shared = self.write_shared_state().await;
 
         // trying to restore WAL storage
@@ -1069,7 +1074,7 @@ impl ManagerTimeline {
             &self.ttid,
             &self.timeline_dir,
             shared.sk.state(),
-            conf.no_sync,
+            self.conf.no_sync,
         )?;
 
         // updating control file
@@ -1096,7 +1101,7 @@ impl ManagerTimeline {
         // now we can switch shared.sk to Present, shouldn't fail
         let prev_sk = std::mem::replace(&mut shared.sk, StateSK::Empty);
         let cfile_state = prev_sk.take_state();
-        shared.sk = StateSK::Loaded(SafeKeeper::new(cfile_state, wal_store, conf.my_id)?);
+        shared.sk = StateSK::Loaded(SafeKeeper::new(cfile_state, wal_store, self.conf.my_id)?);
 
         Ok(())
     }
diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs
index 067945fd5f..e1241ceb9b 100644
--- a/safekeeper/src/timelines_global_map.rs
+++ b/safekeeper/src/timelines_global_map.rs
@@ -13,7 +13,6 @@ use crate::{control_file, wal_storage, SafeKeeperConf};
 use anyhow::{bail, Context, Result};
 use camino::Utf8PathBuf;
 use camino_tempfile::Utf8TempDir;
-use once_cell::sync::Lazy;
 use serde::Serialize;
 use std::collections::HashMap;
 use std::str::FromStr;
@@ -42,23 +41,16 @@ struct GlobalTimelinesState {
     // this map is dropped on restart.
     tombstones: HashMap<TenantTimelineId, Instant>,
 
-    conf: Option<SafeKeeperConf>,
+    conf: Arc<SafeKeeperConf>,
     broker_active_set: Arc<TimelinesSet>,
     global_rate_limiter: RateLimiter,
 }
 
 impl GlobalTimelinesState {
-    /// Get configuration, which must be set once during init.
-    fn get_conf(&self) -> &SafeKeeperConf {
-        self.conf
-            .as_ref()
-            .expect("GlobalTimelinesState conf is not initialized")
-    }
-
     /// Get dependencies for a timeline constructor.
-    fn get_dependencies(&self) -> (SafeKeeperConf, Arc<TimelinesSet>, RateLimiter) {
+    fn get_dependencies(&self) -> (Arc<SafeKeeperConf>, Arc<TimelinesSet>, RateLimiter) {
         (
-            self.get_conf().clone(),
+            self.conf.clone(),
             self.broker_active_set.clone(),
             self.global_rate_limiter.clone(),
         )
@@ -82,35 +74,39 @@ impl GlobalTimelinesState {
     }
 }
 
-static TIMELINES_STATE: Lazy<Mutex<GlobalTimelinesState>> = Lazy::new(|| {
-    Mutex::new(GlobalTimelinesState {
-        timelines: HashMap::new(),
-        tombstones: HashMap::new(),
-        conf: None,
-        broker_active_set: Arc::new(TimelinesSet::default()),
-        global_rate_limiter: RateLimiter::new(1, 1),
-    })
-});
-
-/// A zero-sized struct used to manage access to the global timelines map.
-pub struct GlobalTimelines;
+/// A struct used to manage access to the global timelines map.
+pub struct GlobalTimelines {
+    state: Mutex<GlobalTimelinesState>,
+}
 
 impl GlobalTimelines {
+    /// Create a new instance of the global timelines map.
+    pub fn new(conf: Arc<SafeKeeperConf>) -> Self {
+        Self {
+            state: Mutex::new(GlobalTimelinesState {
+                timelines: HashMap::new(),
+                tombstones: HashMap::new(),
+                conf,
+                broker_active_set: Arc::new(TimelinesSet::default()),
+                global_rate_limiter: RateLimiter::new(1, 1),
+            }),
+        }
+    }
+
     /// Inject dependencies needed for the timeline constructors and load all timelines to memory.
-    pub async fn init(conf: SafeKeeperConf) -> Result<()> {
+    pub async fn init(&self) -> Result<()> {
         // clippy isn't smart enough to understand that drop(state) releases the
         // lock, so use explicit block
         let tenants_dir = {
-            let mut state = TIMELINES_STATE.lock().unwrap();
+            let mut state = self.state.lock().unwrap();
             state.global_rate_limiter = RateLimiter::new(
-                conf.partial_backup_concurrency,
+                state.conf.partial_backup_concurrency,
                 DEFAULT_EVICTION_CONCURRENCY,
             );
-            state.conf = Some(conf);
 
             // Iterate through all directories and load tenants for all directories
             // named as a valid tenant_id.
-            state.get_conf().workdir.clone()
+            state.conf.workdir.clone()
         };
         let mut tenant_count = 0;
         for tenants_dir_entry in std::fs::read_dir(&tenants_dir)
@@ -122,7 +118,7 @@ impl GlobalTimelines {
                         TenantId::from_str(tenants_dir_entry.file_name().to_str().unwrap_or(""))
                     {
                         tenant_count += 1;
-                        GlobalTimelines::load_tenant_timelines(tenant_id).await?;
+                        self.load_tenant_timelines(tenant_id).await?;
                     }
                 }
                 Err(e) => error!(
@@ -135,7 +131,7 @@ impl GlobalTimelines {
         info!(
             "found {} tenants directories, successfully loaded {} timelines",
             tenant_count,
-            TIMELINES_STATE.lock().unwrap().timelines.len()
+            self.state.lock().unwrap().timelines.len()
         );
         Ok(())
     }
@@ -143,13 +139,13 @@ impl GlobalTimelines {
     /// Loads all timelines for the given tenant to memory. Returns fs::read_dir
     /// errors if any.
     ///
-    /// It is async, but TIMELINES_STATE lock is sync and there is no important
+    /// It is async, but self.state lock is sync and there is no important
     /// reason to make it async (it is always held for a short while), so we
     /// just lock and unlock it for each timeline -- this function is called
     /// during init when nothing else is running, so this is fine.
-    async fn load_tenant_timelines(tenant_id: TenantId) -> Result<()> {
+    async fn load_tenant_timelines(&self, tenant_id: TenantId) -> Result<()> {
         let (conf, broker_active_set, partial_backup_rate_limiter) = {
-            let state = TIMELINES_STATE.lock().unwrap();
+            let state = self.state.lock().unwrap();
             state.get_dependencies()
         };
 
@@ -163,10 +159,10 @@ impl GlobalTimelines {
                         TimelineId::from_str(timeline_dir_entry.file_name().to_str().unwrap_or(""))
                     {
                         let ttid = TenantTimelineId::new(tenant_id, timeline_id);
-                        match Timeline::load_timeline(&conf, ttid) {
+                        match Timeline::load_timeline(conf.clone(), ttid) {
                             Ok(tli) => {
                                 let mut shared_state = tli.write_shared_state().await;
-                                TIMELINES_STATE
+                                self.state
                                     .lock()
                                     .unwrap()
                                     .timelines
@@ -200,29 +196,30 @@ impl GlobalTimelines {
     }
 
     /// Get the number of timelines in the map.
-    pub fn timelines_count() -> usize {
-        TIMELINES_STATE.lock().unwrap().timelines.len()
+    pub fn timelines_count(&self) -> usize {
+        self.state.lock().unwrap().timelines.len()
     }
 
     /// Get the global safekeeper config.
-    pub fn get_global_config() -> SafeKeeperConf {
-        TIMELINES_STATE.lock().unwrap().get_conf().clone()
+    pub fn get_global_config(&self) -> Arc<SafeKeeperConf> {
+        self.state.lock().unwrap().conf.clone()
     }
 
-    pub fn get_global_broker_active_set() -> Arc<TimelinesSet> {
-        TIMELINES_STATE.lock().unwrap().broker_active_set.clone()
+    pub fn get_global_broker_active_set(&self) -> Arc<TimelinesSet> {
+        self.state.lock().unwrap().broker_active_set.clone()
     }
 
     /// Create a new timeline with the given id. If the timeline already exists, returns
     /// an existing timeline.
     pub(crate) async fn create(
+        &self,
         ttid: TenantTimelineId,
         server_info: ServerInfo,
         commit_lsn: Lsn,
         local_start_lsn: Lsn,
     ) -> Result<Arc<Timeline>> {
         let (conf, _, _) = {
-            let state = TIMELINES_STATE.lock().unwrap();
+            let state = self.state.lock().unwrap();
             if let Ok(timeline) = state.get(&ttid) {
                 // Timeline already exists, return it.
                 return Ok(timeline);
@@ -245,7 +242,7 @@ impl GlobalTimelines {
         let state =
             TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn)?;
         control_file::FileStorage::create_new(&tmp_dir_path, state, conf.no_sync).await?;
-        let timeline = GlobalTimelines::load_temp_timeline(ttid, &tmp_dir_path, true).await?;
+        let timeline = self.load_temp_timeline(ttid, &tmp_dir_path, true).await?;
         Ok(timeline)
     }
 
@@ -261,13 +258,14 @@ impl GlobalTimelines {
     /// 2) move the directory and load the timeline
     /// 3) take lock again and insert the timeline into the global map.
     pub async fn load_temp_timeline(
+        &self,
         ttid: TenantTimelineId,
         tmp_path: &Utf8PathBuf,
         check_tombstone: bool,
     ) -> Result<Arc<Timeline>> {
         // Check for existence and mark that we're creating it.
         let (conf, broker_active_set, partial_backup_rate_limiter) = {
-            let mut state = TIMELINES_STATE.lock().unwrap();
+            let mut state = self.state.lock().unwrap();
             match state.timelines.get(&ttid) {
                 Some(GlobalMapTimeline::CreationInProgress) => {
                     bail!(TimelineError::CreationInProgress(ttid));
@@ -295,10 +293,10 @@ impl GlobalTimelines {
         };
 
         // Do the actual move and reflect the result in the map.
-        match GlobalTimelines::install_temp_timeline(ttid, tmp_path, &conf).await {
+        match GlobalTimelines::install_temp_timeline(ttid, tmp_path, conf.clone()).await {
             Ok(timeline) => {
                 let mut timeline_shared_state = timeline.write_shared_state().await;
-                let mut state = TIMELINES_STATE.lock().unwrap();
+                let mut state = self.state.lock().unwrap();
                 assert!(matches!(
                     state.timelines.get(&ttid),
                     Some(GlobalMapTimeline::CreationInProgress)
@@ -319,7 +317,7 @@ impl GlobalTimelines {
             }
             Err(e) => {
                 // Init failed, remove the marker from the map
-                let mut state = TIMELINES_STATE.lock().unwrap();
+                let mut state = self.state.lock().unwrap();
                 assert!(matches!(
                     state.timelines.get(&ttid),
                     Some(GlobalMapTimeline::CreationInProgress)
@@ -334,10 +332,10 @@ impl GlobalTimelines {
     async fn install_temp_timeline(
         ttid: TenantTimelineId,
         tmp_path: &Utf8PathBuf,
-        conf: &SafeKeeperConf,
+        conf: Arc<SafeKeeperConf>,
     ) -> Result<Arc<Timeline>> {
-        let tenant_path = get_tenant_dir(conf, &ttid.tenant_id);
-        let timeline_path = get_timeline_dir(conf, &ttid);
+        let tenant_path = get_tenant_dir(conf.as_ref(), &ttid.tenant_id);
+        let timeline_path = get_timeline_dir(conf.as_ref(), &ttid);
 
         // We must have already checked that timeline doesn't exist in the map,
         // but there might be existing datadir: if timeline is corrupted it is
@@ -382,9 +380,9 @@ impl GlobalTimelines {
     /// Get a timeline from the global map. If it's not present, it doesn't exist on disk,
     /// or was corrupted and couldn't be loaded on startup. Returned timeline is always valid,
     /// i.e. loaded in memory and not cancelled.
-    pub(crate) fn get(ttid: TenantTimelineId) -> Result<Arc<Timeline>, TimelineError> {
+    pub(crate) fn get(&self, ttid: TenantTimelineId) -> Result<Arc<Timeline>, TimelineError> {
         let tli_res = {
-            let state = TIMELINES_STATE.lock().unwrap();
+            let state = self.state.lock().unwrap();
             state.get(&ttid)
         };
         match tli_res {
@@ -399,8 +397,8 @@ impl GlobalTimelines {
     }
 
     /// Returns all timelines. This is used for background timeline processes.
-    pub fn get_all() -> Vec<Arc<Timeline>> {
-        let global_lock = TIMELINES_STATE.lock().unwrap();
+    pub fn get_all(&self) -> Vec<Arc<Timeline>> {
+        let global_lock = self.state.lock().unwrap();
         global_lock
             .timelines
             .values()
@@ -419,8 +417,8 @@ impl GlobalTimelines {
 
     /// Returns all timelines belonging to a given tenant. Used for deleting all timelines of a tenant,
     /// and that's why it can return cancelled timelines, to retry deleting them.
-    fn get_all_for_tenant(tenant_id: TenantId) -> Vec<Arc<Timeline>> {
-        let global_lock = TIMELINES_STATE.lock().unwrap();
+    fn get_all_for_tenant(&self, tenant_id: TenantId) -> Vec<Arc<Timeline>> {
+        let global_lock = self.state.lock().unwrap();
         global_lock
             .timelines
             .values()
@@ -435,11 +433,12 @@ impl GlobalTimelines {
     /// Cancels timeline, then deletes the corresponding data directory.
     /// If only_local, doesn't remove WAL segments in remote storage.
     pub(crate) async fn delete(
+        &self,
         ttid: &TenantTimelineId,
         only_local: bool,
     ) -> Result<TimelineDeleteForceResult> {
         let tli_res = {
-            let state = TIMELINES_STATE.lock().unwrap();
+            let state = self.state.lock().unwrap();
 
             if state.tombstones.contains_key(ttid) {
                 // Presence of a tombstone guarantees that a previous deletion has completed and there is no work to do.
@@ -472,7 +471,7 @@ impl GlobalTimelines {
             }
             Err(_) => {
                 // Timeline is not memory, but it may still exist on disk in broken state.
-                let dir_path = get_timeline_dir(TIMELINES_STATE.lock().unwrap().get_conf(), ttid);
+                let dir_path = get_timeline_dir(self.state.lock().unwrap().conf.as_ref(), ttid);
                 let dir_existed = delete_dir(dir_path)?;
 
                 Ok(TimelineDeleteForceResult {
@@ -485,7 +484,7 @@ impl GlobalTimelines {
         // Finalize deletion, by dropping Timeline objects and storing smaller tombstones.  The tombstones
         // are used to prevent still-running computes from re-creating the same timeline when they send data,
         // and to speed up repeated deletion calls by avoiding re-listing objects.
-        TIMELINES_STATE.lock().unwrap().delete(*ttid);
+        self.state.lock().unwrap().delete(*ttid);
 
         result
     }
@@ -497,17 +496,18 @@ impl GlobalTimelines {
     ///
     /// If only_local, doesn't remove WAL segments in remote storage.
     pub async fn delete_force_all_for_tenant(
+        &self,
         tenant_id: &TenantId,
         only_local: bool,
     ) -> Result<HashMap<TenantTimelineId, TimelineDeleteForceResult>> {
         info!("deleting all timelines for tenant {}", tenant_id);
-        let to_delete = Self::get_all_for_tenant(*tenant_id);
+        let to_delete = self.get_all_for_tenant(*tenant_id);
 
         let mut err = None;
 
         let mut deleted = HashMap::new();
         for tli in &to_delete {
-            match Self::delete(&tli.ttid, only_local).await {
+            match self.delete(&tli.ttid, only_local).await {
                 Ok(result) => {
                     deleted.insert(tli.ttid, result);
                 }
@@ -529,15 +529,15 @@ impl GlobalTimelines {
         // so the directory may be not empty. In this case timelines will have bad state
         // and timeline background jobs can panic.
         delete_dir(get_tenant_dir(
-            TIMELINES_STATE.lock().unwrap().get_conf(),
+            self.state.lock().unwrap().conf.as_ref(),
             tenant_id,
         ))?;
 
         Ok(deleted)
     }
 
-    pub fn housekeeping(tombstone_ttl: &Duration) {
-        let mut state = TIMELINES_STATE.lock().unwrap();
+    pub fn housekeeping(&self, tombstone_ttl: &Duration) {
+        let mut state = self.state.lock().unwrap();
 
         // We keep tombstones long enough to have a good chance of preventing rogue computes from re-creating deleted
         // timelines.  If a compute kept running for longer than this TTL (or across a safekeeper restart) then they
diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs
index 5248d545db..1ff83918a7 100644
--- a/safekeeper/src/wal_service.rs
+++ b/safekeeper/src/wal_service.rs
@@ -4,6 +4,7 @@
 //!
 use anyhow::{Context, Result};
 use postgres_backend::QueryError;
+use std::sync::Arc;
 use std::time::Duration;
 use tokio::net::TcpStream;
 use tokio_io_timeout::TimeoutReader;
@@ -11,9 +12,9 @@ use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::{auth::Scope, measured_stream::MeasuredStream};
 
-use crate::handler::SafekeeperPostgresHandler;
 use crate::metrics::TrafficMetrics;
 use crate::SafeKeeperConf;
+use crate::{handler::SafekeeperPostgresHandler, GlobalTimelines};
 use postgres_backend::{AuthType, PostgresBackend};
 
 /// Accept incoming TCP connections and spawn them into a background thread.
@@ -22,9 +23,10 @@ use postgres_backend::{AuthType, PostgresBackend};
 /// to any tenant are allowed) or Tenant (only tokens giving access to specific
 /// tenant are allowed). Doesn't matter if auth is disabled in conf.
 pub async fn task_main(
-    conf: SafeKeeperConf,
+    conf: Arc<SafeKeeperConf>,
     pg_listener: std::net::TcpListener,
     allowed_auth_scope: Scope,
+    global_timelines: Arc<GlobalTimelines>,
 ) -> anyhow::Result<()> {
     // Tokio's from_std won't do this for us, per its comment.
     pg_listener.set_nonblocking(true)?;
@@ -37,10 +39,10 @@ pub async fn task_main(
         debug!("accepted connection from {}", peer_addr);
         let conf = conf.clone();
         let conn_id = issue_connection_id(&mut connection_count);
-
+        let global_timelines = global_timelines.clone();
         tokio::spawn(
             async move {
-                if let Err(err) = handle_socket(socket, conf, conn_id, allowed_auth_scope).await {
+                if let Err(err) = handle_socket(socket, conf, conn_id, allowed_auth_scope, global_timelines).await {
                     error!("connection handler exited: {}", err);
                 }
             }
@@ -53,9 +55,10 @@ pub async fn task_main(
 ///
 async fn handle_socket(
     socket: TcpStream,
-    conf: SafeKeeperConf,
+    conf: Arc<SafeKeeperConf>,
     conn_id: ConnectionId,
     allowed_auth_scope: Scope,
+    global_timelines: Arc<GlobalTimelines>,
 ) -> Result<(), QueryError> {
     socket.set_nodelay(true)?;
     let peer_addr = socket.peer_addr()?;
@@ -96,8 +99,13 @@ async fn handle_socket(
         Some(_) => AuthType::NeonJWT,
     };
     let auth_pair = auth_key.map(|key| (allowed_auth_scope, key));
-    let mut conn_handler =
-        SafekeeperPostgresHandler::new(conf, conn_id, Some(traffic_metrics.clone()), auth_pair);
+    let mut conn_handler = SafekeeperPostgresHandler::new(
+        conf,
+        conn_id,
+        Some(traffic_metrics.clone()),
+        auth_pair,
+        global_timelines,
+    );
     let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
     // libpq protocol between safekeeper and walproposer / pageserver
     // We don't use shutdown.

From 34c1295594c70bf124c0a6a85fa21d51dff7967b Mon Sep 17 00:00:00 2001
From: Ivan Efremov <ivan@neon.tech>
Date: Tue, 10 Dec 2024 12:14:28 +0200
Subject: [PATCH 092/117] [proxy] impr: Additional logging for cancellation
 queries (#10039)

## Problem
Since cancellation tasks spawned in the background sometimes logs
missing context.

https://neondb.slack.com/archives/C060N3SEF9D/p1733427801527419?thread_ts=1733419882.560159&cid=C060N3SEF9D

## Summary of changes
Add `session_id` and change loglevel for cancellation queries
---
 proxy/src/cancellation.rs           |  3 +-
 proxy/src/console_redirect_proxy.rs | 54 ++++++++++++++++-------------
 proxy/src/proxy/mod.rs              | 54 ++++++++++++++++-------------
 proxy/src/redis/notifications.rs    |  4 +++
 4 files changed, 64 insertions(+), 51 deletions(-)

diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs
index 7bc5587a25..ed717507ee 100644
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -115,7 +115,8 @@ impl<P: CancellationPublisher> CancellationHandler<P> {
                 IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new_assert(ip, 64).trunc()),
             };
             if !self.limiter.lock().unwrap().check(subnet_key, 1) {
-                tracing::debug!("Rate limit exceeded. Skipping cancellation message");
+                // log only the subnet part of the IP address to know which subnet is rate limited
+                tracing::warn!("Rate limit exceeded. Skipping cancellation message, {subnet_key}");
                 Metrics::get()
                     .proxy
                     .cancellation_requests_total
diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs
index 7db1179eea..65702e0e4c 100644
--- a/proxy/src/console_redirect_proxy.rs
+++ b/proxy/src/console_redirect_proxy.rs
@@ -163,32 +163,36 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
     let do_handshake = handshake(ctx, stream, tls, record_handshake_error);
 
-    let (mut stream, params) =
-        match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? {
-            HandshakeData::Startup(stream, params) => (stream, params),
-            HandshakeData::Cancel(cancel_key_data) => {
-                // spawn a task to cancel the session, but don't wait for it
-                cancellations.spawn({
-                    let cancellation_handler_clone = Arc::clone(&cancellation_handler);
-                    let session_id = ctx.session_id();
-                    let peer_ip = ctx.peer_addr();
-                    async move {
-                        drop(
-                            cancellation_handler_clone
-                                .cancel_session(
-                                    cancel_key_data,
-                                    session_id,
-                                    peer_ip,
-                                    config.authentication_config.ip_allowlist_check_enabled,
-                                )
-                                .await,
-                        );
-                    }
-                });
+    let (mut stream, params) = match tokio::time::timeout(config.handshake_timeout, do_handshake)
+        .await??
+    {
+        HandshakeData::Startup(stream, params) => (stream, params),
+        HandshakeData::Cancel(cancel_key_data) => {
+            // spawn a task to cancel the session, but don't wait for it
+            cancellations.spawn({
+                let cancellation_handler_clone = Arc::clone(&cancellation_handler);
+                let session_id = ctx.session_id();
+                let peer_ip = ctx.peer_addr();
+                let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?session_id);
+                cancel_span.follows_from(tracing::Span::current());
+                async move {
+                    drop(
+                        cancellation_handler_clone
+                            .cancel_session(
+                                cancel_key_data,
+                                session_id,
+                                peer_ip,
+                                config.authentication_config.ip_allowlist_check_enabled,
+                            )
+                            .instrument(cancel_span)
+                            .await,
+                    );
+                }
+            });
 
-                return Ok(None);
-            }
-        };
+            return Ok(None);
+        }
+    };
     drop(pause);
 
     ctx.set_db_options(params.clone());
diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs
index f74eb5940f..cc04bc5e5c 100644
--- a/proxy/src/proxy/mod.rs
+++ b/proxy/src/proxy/mod.rs
@@ -272,32 +272,36 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
     let do_handshake = handshake(ctx, stream, mode.handshake_tls(tls), record_handshake_error);
 
-    let (mut stream, params) =
-        match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? {
-            HandshakeData::Startup(stream, params) => (stream, params),
-            HandshakeData::Cancel(cancel_key_data) => {
-                // spawn a task to cancel the session, but don't wait for it
-                cancellations.spawn({
-                    let cancellation_handler_clone = Arc::clone(&cancellation_handler);
-                    let session_id = ctx.session_id();
-                    let peer_ip = ctx.peer_addr();
-                    async move {
-                        drop(
-                            cancellation_handler_clone
-                                .cancel_session(
-                                    cancel_key_data,
-                                    session_id,
-                                    peer_ip,
-                                    config.authentication_config.ip_allowlist_check_enabled,
-                                )
-                                .await,
-                        );
-                    }
-                });
+    let (mut stream, params) = match tokio::time::timeout(config.handshake_timeout, do_handshake)
+        .await??
+    {
+        HandshakeData::Startup(stream, params) => (stream, params),
+        HandshakeData::Cancel(cancel_key_data) => {
+            // spawn a task to cancel the session, but don't wait for it
+            cancellations.spawn({
+                let cancellation_handler_clone = Arc::clone(&cancellation_handler);
+                let session_id = ctx.session_id();
+                let peer_ip = ctx.peer_addr();
+                let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?session_id);
+                cancel_span.follows_from(tracing::Span::current());
+                async move {
+                    drop(
+                        cancellation_handler_clone
+                            .cancel_session(
+                                cancel_key_data,
+                                session_id,
+                                peer_ip,
+                                config.authentication_config.ip_allowlist_check_enabled,
+                            )
+                            .instrument(cancel_span)
+                            .await,
+                    );
+                }
+            });
 
-                return Ok(None);
-            }
-        };
+            return Ok(None);
+        }
+    };
     drop(pause);
 
     ctx.set_db_options(params.clone());
diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs
index 9ac07b7e90..f3aa97c032 100644
--- a/proxy/src/redis/notifications.rs
+++ b/proxy/src/redis/notifications.rs
@@ -13,6 +13,7 @@ use crate::cache::project_info::ProjectInfoCache;
 use crate::cancellation::{CancelMap, CancellationHandler};
 use crate::intern::{ProjectIdInt, RoleNameInt};
 use crate::metrics::{Metrics, RedisErrors, RedisEventsCount};
+use tracing::Instrument;
 
 const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates";
 pub(crate) const PROXY_CHANNEL_NAME: &str = "neondb-proxy-to-proxy-updates";
@@ -143,6 +144,8 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
                 let peer_addr = cancel_session
                     .peer_addr
                     .unwrap_or(std::net::IpAddr::V4(std::net::Ipv4Addr::UNSPECIFIED));
+                let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?cancel_session.session_id);
+                cancel_span.follows_from(tracing::Span::current());
                 // This instance of cancellation_handler doesn't have a RedisPublisherClient so it can't publish the message.
                 match self
                     .cancellation_handler
@@ -152,6 +155,7 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
                         peer_addr,
                         cancel_session.peer_addr.is_some(),
                     )
+                    .instrument(cancel_span)
                     .await
                 {
                     Ok(()) => {}

From c51db1db61c7c1513cb6fc69563e80c4110abd9d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Tue, 10 Dec 2024 12:29:38 +0100
Subject: [PATCH 093/117] Replace MAX_KEYS_PER_DELETE constant with function
 (#10061)

Azure has a different per-request limit of 256 items for bulk deletion
compared to the number of 1000 on AWS. Therefore, we need to support
multiple values. Due to `GenericRemoteStorage`, we can't add an
associated constant, but it has to be a function.

The PR replaces the `MAX_KEYS_PER_DELETE` constant with a function of
the same name, implemented on both the `RemoteStorage` trait as well as
on `GenericRemoteStorage`.

The value serves as hint of how many objects to pass to the
`delete_objects` function.

Reading:

* https://learn.microsoft.com/en-us/rest/api/storageservices/blob-batch
* https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObjects.html

Part of #7931
---
 libs/remote_storage/src/azure_blob.rs        |  4 +++
 libs/remote_storage/src/lib.rs               | 27 +++++++++++++++++++-
 libs/remote_storage/src/local_fs.rs          |  4 +++
 libs/remote_storage/src/s3_bucket.rs         |  8 ++++--
 libs/remote_storage/src/simulate_failures.rs |  4 +++
 pageserver/src/deletion_queue/deleter.rs     | 10 ++++----
 storage_scrubber/src/garbage.rs              | 11 ++++----
 7 files changed, 54 insertions(+), 14 deletions(-)

diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs
index a1d7569140..32c51bc2ad 100644
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -624,6 +624,10 @@ impl RemoteStorage for AzureBlobStorage {
         res
     }
 
+    fn max_keys_per_delete(&self) -> usize {
+        super::MAX_KEYS_PER_DELETE_AZURE
+    }
+
     async fn copy(
         &self,
         from: &RemotePath,
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index 0ece29d99e..2a3468f986 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -70,7 +70,14 @@ pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 100;
 pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
 
 /// As defined in S3 docs
-pub const MAX_KEYS_PER_DELETE: usize = 1000;
+///
+/// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObjects.html>
+pub const MAX_KEYS_PER_DELETE_S3: usize = 1000;
+
+/// As defined in Azure docs
+///
+/// <https://learn.microsoft.com/en-us/rest/api/storageservices/blob-batch>
+pub const MAX_KEYS_PER_DELETE_AZURE: usize = 256;
 
 const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
 
@@ -340,6 +347,14 @@ pub trait RemoteStorage: Send + Sync + 'static {
         cancel: &CancellationToken,
     ) -> anyhow::Result<()>;
 
+    /// Returns the maximum number of keys that a call to [`Self::delete_objects`] can delete without chunking
+    ///
+    /// The value returned is only an optimization hint, One can pass larger number of objects to
+    /// `delete_objects` as well.
+    ///
+    /// The value is guaranteed to be >= 1.
+    fn max_keys_per_delete(&self) -> usize;
+
     /// Deletes all objects matching the given prefix.
     ///
     /// NB: this uses NoDelimiter and will match partial prefixes. For example, the prefix /a/b will
@@ -533,6 +548,16 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
         }
     }
 
+    /// [`RemoteStorage::max_keys_per_delete`]
+    pub fn max_keys_per_delete(&self) -> usize {
+        match self {
+            Self::LocalFs(s) => s.max_keys_per_delete(),
+            Self::AwsS3(s) => s.max_keys_per_delete(),
+            Self::AzureBlob(s) => s.max_keys_per_delete(),
+            Self::Unreliable(s) => s.max_keys_per_delete(),
+        }
+    }
+
     /// See [`RemoteStorage::delete_prefix`]
     pub async fn delete_prefix(
         &self,
diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs
index ee2fc9d6e2..1a2d421c66 100644
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -573,6 +573,10 @@ impl RemoteStorage for LocalFs {
         Ok(())
     }
 
+    fn max_keys_per_delete(&self) -> usize {
+        super::MAX_KEYS_PER_DELETE_S3
+    }
+
     async fn copy(
         &self,
         from: &RemotePath,
diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index cde32df402..2891f92d07 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -48,7 +48,7 @@ use crate::{
     metrics::{start_counting_cancelled_wait, start_measuring_requests},
     support::PermitCarrying,
     ConcurrencyLimiter, Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject,
-    RemotePath, RemoteStorage, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE,
+    RemotePath, RemoteStorage, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE_S3,
     REMOTE_STORAGE_PREFIX_SEPARATOR,
 };
 
@@ -355,7 +355,7 @@ impl S3Bucket {
         let kind = RequestKind::Delete;
         let mut cancel = std::pin::pin!(cancel.cancelled());
 
-        for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
+        for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE_S3) {
             let started_at = start_measuring_requests(kind);
 
             let req = self
@@ -832,6 +832,10 @@ impl RemoteStorage for S3Bucket {
         self.delete_oids(&permit, &delete_objects, cancel).await
     }
 
+    fn max_keys_per_delete(&self) -> usize {
+        MAX_KEYS_PER_DELETE_S3
+    }
+
     async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> {
         let paths = std::array::from_ref(path);
         self.delete_objects(paths, cancel).await
diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs
index 10db53971c..51833c1fe6 100644
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -203,6 +203,10 @@ impl RemoteStorage for UnreliableWrapper {
         Ok(())
     }
 
+    fn max_keys_per_delete(&self) -> usize {
+        self.inner.max_keys_per_delete()
+    }
+
     async fn copy(
         &self,
         from: &RemotePath,
diff --git a/pageserver/src/deletion_queue/deleter.rs b/pageserver/src/deletion_queue/deleter.rs
index 3d02387c98..ef1dfbac19 100644
--- a/pageserver/src/deletion_queue/deleter.rs
+++ b/pageserver/src/deletion_queue/deleter.rs
@@ -9,7 +9,6 @@
 use remote_storage::GenericRemoteStorage;
 use remote_storage::RemotePath;
 use remote_storage::TimeoutOrCancel;
-use remote_storage::MAX_KEYS_PER_DELETE;
 use std::time::Duration;
 use tokio_util::sync::CancellationToken;
 use tracing::info;
@@ -131,7 +130,8 @@ impl Deleter {
     }
 
     pub(super) async fn background(&mut self) -> Result<(), DeletionQueueError> {
-        self.accumulator.reserve(MAX_KEYS_PER_DELETE);
+        let max_keys_per_delete = self.remote_storage.max_keys_per_delete();
+        self.accumulator.reserve(max_keys_per_delete);
 
         loop {
             if self.cancel.is_cancelled() {
@@ -156,14 +156,14 @@ impl Deleter {
 
             match msg {
                 DeleterMessage::Delete(mut list) => {
-                    while !list.is_empty() || self.accumulator.len() == MAX_KEYS_PER_DELETE {
-                        if self.accumulator.len() == MAX_KEYS_PER_DELETE {
+                    while !list.is_empty() || self.accumulator.len() == max_keys_per_delete {
+                        if self.accumulator.len() == max_keys_per_delete {
                             self.flush().await?;
                             // If we have received this number of keys, proceed with attempting to execute
                             assert_eq!(self.accumulator.len(), 0);
                         }
 
-                        let available_slots = MAX_KEYS_PER_DELETE - self.accumulator.len();
+                        let available_slots = max_keys_per_delete - self.accumulator.len();
                         let take_count = std::cmp::min(available_slots, list.len());
                         for path in list.drain(list.len() - take_count..) {
                             self.accumulator.push(path);
diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs
index b026efbc3b..a4e5107e3d 100644
--- a/storage_scrubber/src/garbage.rs
+++ b/storage_scrubber/src/garbage.rs
@@ -459,12 +459,10 @@ pub async fn get_timeline_objects(
     Ok(list.keys)
 }
 
-const MAX_KEYS_PER_DELETE: usize = 1000;
-
 /// Drain a buffer of keys into DeleteObjects requests
 ///
 /// If `drain` is true, drains keys completely; otherwise stops when <
-/// MAX_KEYS_PER_DELETE keys are left.
+/// `max_keys_per_delete`` keys are left.
 /// `num_deleted` returns number of deleted keys.
 async fn do_delete(
     remote_client: &GenericRemoteStorage,
@@ -474,9 +472,10 @@ async fn do_delete(
     progress_tracker: &mut DeletionProgressTracker,
 ) -> anyhow::Result<()> {
     let cancel = CancellationToken::new();
-    while (!keys.is_empty() && drain) || (keys.len() >= MAX_KEYS_PER_DELETE) {
+    let max_keys_per_delete = remote_client.max_keys_per_delete();
+    while (!keys.is_empty() && drain) || (keys.len() >= max_keys_per_delete) {
         let request_keys =
-            keys.split_off(keys.len() - (std::cmp::min(MAX_KEYS_PER_DELETE, keys.len())));
+            keys.split_off(keys.len() - (std::cmp::min(max_keys_per_delete, keys.len())));
 
         let request_keys: Vec<RemotePath> = request_keys.into_iter().map(|o| o.key).collect();
 
@@ -617,7 +616,7 @@ pub async fn purge_garbage(
         }
 
         objects_to_delete.append(&mut object_list);
-        if objects_to_delete.len() >= MAX_KEYS_PER_DELETE {
+        if objects_to_delete.len() >= remote_client.max_keys_per_delete() {
             do_delete(
                 &remote_client,
                 &mut objects_to_delete,

From ad472bd4a18a6fb03310b631fc565cfbc28c0a82 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Tue, 10 Dec 2024 13:07:00 +0100
Subject: [PATCH 094/117] test_runner: add visibility map test (#9940)

Verifies that visibility map pages are correctly maintained across
shards.

Touches #9914.
---
 test_runner/regress/test_vm_bits.py | 75 ++++++++++++++++++++++++++++-
 1 file changed, 74 insertions(+), 1 deletion(-)

diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py
index f93fc6bd8b..46e90852a6 100644
--- a/test_runner/regress/test_vm_bits.py
+++ b/test_runner/regress/test_vm_bits.py
@@ -4,7 +4,7 @@ import time
 from contextlib import closing
 
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, fork_at_current_lsn
+from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin, fork_at_current_lsn
 from fixtures.utils import query_scalar
 
 
@@ -292,3 +292,76 @@ def test_vm_bit_clear_on_heap_lock_blackbox(neon_env_builder: NeonEnvBuilder):
     tup = cur.fetchall()
     log.info(f"tuple = {tup}")
     cur.execute("commit transaction")
+
+
+def test_check_visibility_map(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
+    """
+    Runs pgbench across a few databases on a sharded tenant, then performs a visibility map
+    consistency check. Regression test for https://github.com/neondatabase/neon/issues/9914.
+    """
+
+    # Use a large number of shards with small stripe sizes, to ensure the visibility
+    # map will end up on non-zero shards.
+    SHARD_COUNT = 8
+    STRIPE_SIZE = 32  # in 8KB pages
+    PGBENCH_RUNS = 4
+
+    env = neon_env_builder.init_start(
+        initial_tenant_shard_count=SHARD_COUNT, initial_tenant_shard_stripe_size=STRIPE_SIZE
+    )
+    endpoint = env.endpoints.create_start(
+        "main",
+        config_lines=[
+            "shared_buffers = 64MB",
+        ],
+    )
+
+    # Run pgbench in 4 different databases, to exercise different shards.
+    dbnames = [f"pgbench{i}" for i in range(PGBENCH_RUNS)]
+    for i, dbname in enumerate(dbnames):
+        log.info(f"pgbench run {i+1}/{PGBENCH_RUNS}")
+        endpoint.safe_psql(f"create database {dbname}")
+        connstr = endpoint.connstr(dbname=dbname)
+        # pgbench -i will automatically vacuum the tables. This creates the visibility map.
+        pg_bin.run(["pgbench", "-i", "-s", "10", connstr])
+        # Freeze the tuples to set the initial frozen bit.
+        endpoint.safe_psql("vacuum freeze", dbname=dbname)
+        # Run pgbench.
+        pg_bin.run(["pgbench", "-c", "32", "-j", "8", "-T", "10", connstr])
+
+    # Restart the endpoint to flush the compute page cache. We want to make sure we read VM pages
+    # from storage, not cache.
+    endpoint.stop()
+    endpoint.start()
+
+    # Check that the visibility map matches the heap contents for pg_accounts (the main table).
+    for dbname in dbnames:
+        log.info(f"Checking visibility map for {dbname}")
+        with endpoint.cursor(dbname=dbname) as cur:
+            cur.execute("create extension pg_visibility")
+
+            cur.execute("select count(*) from pg_check_visible('pgbench_accounts')")
+            row = cur.fetchone()
+            assert row is not None
+            assert row[0] == 0, f"{row[0]} inconsistent VM pages (visible)"
+
+            cur.execute("select count(*) from pg_check_frozen('pgbench_accounts')")
+            row = cur.fetchone()
+            assert row is not None
+            assert row[0] == 0, f"{row[0]} inconsistent VM pages (frozen)"
+
+    # Vacuum and freeze the tables, and check that the visibility map is still accurate.
+    for dbname in dbnames:
+        log.info(f"Vacuuming and checking visibility map for {dbname}")
+        with endpoint.cursor(dbname=dbname) as cur:
+            cur.execute("vacuum freeze")
+
+            cur.execute("select count(*) from pg_check_visible('pgbench_accounts')")
+            row = cur.fetchone()
+            assert row is not None
+            assert row[0] == 0, f"{row[0]} inconsistent VM pages (visible)"
+
+            cur.execute("select count(*) from pg_check_frozen('pgbench_accounts')")
+            row = cur.fetchone()
+            assert row is not None
+            assert row[0] == 0, f"{row[0]} inconsistent VM pages (frozen)"

From 311ee793b972e6dfc856f560a273734dd57a0c77 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Tue, 10 Dec 2024 17:01:40 +0200
Subject: [PATCH 095/117] Fix handling in-flight requersts in prefetch buffer
 resize (#9968)

## Problem

See https://github.com/neondatabase/neon/issues/9961
Current implementation of prefetch buffer resize doesn't correctly
handle in-flight requests

## Summary of changes

1. Fix index of entry we should wait for if new prefetch buffer size is
smaller than number of in-flight requests.
2. Correctly set flush position

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 pgxn/neon/pagestore_smgr.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 880c0de64e..385905d9ce 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -423,7 +423,11 @@ readahead_buffer_resize(int newsize, void *extra)
 	 * ensuring we have received all but the last n requests (n = newsize).
 	 */
 	if (MyPState->n_requests_inflight > newsize)
-		prefetch_wait_for(MyPState->ring_unused - newsize);
+	{
+		Assert(MyPState->ring_unused >= MyPState->n_requests_inflight - newsize);
+		prefetch_wait_for(MyPState->ring_unused - (MyPState->n_requests_inflight - newsize));
+		Assert(MyPState->n_requests_inflight <= newsize);
+	}
 
 	/* construct the new PrefetchState, and copy over the memory contexts */
 	newPState = MemoryContextAllocZero(TopMemoryContext, newprfs_size);
@@ -438,7 +442,6 @@ readahead_buffer_resize(int newsize, void *extra)
 	newPState->ring_last = newsize;
 	newPState->ring_unused = newsize;
 	newPState->ring_receive = newsize;
-	newPState->ring_flush = newsize;
 	newPState->max_shard_no = MyPState->max_shard_no;
 	memcpy(newPState->shard_bitmap, MyPState->shard_bitmap, sizeof(MyPState->shard_bitmap));
 
@@ -489,6 +492,7 @@ readahead_buffer_resize(int newsize, void *extra)
 		}
 		newPState->n_unused -= 1;
 	}
+	newPState->ring_flush = newPState->ring_receive;
 
 	MyNeonCounters->getpage_prefetches_buffered =
 		MyPState->n_responses_buffered;
@@ -498,6 +502,7 @@ readahead_buffer_resize(int newsize, void *extra)
 	for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1)
 	{
 		PrefetchRequest *slot = GetPrfSlot(end);
+		Assert(slot->status != PRFS_REQUESTED);
 		if (slot->status == PRFS_RECEIVED)
 		{
 			pfree(slot->response);

From 6ad99826c1d175e12dea50120041bc0822830be6 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Tue, 10 Dec 2024 10:23:26 -0500
Subject: [PATCH 096/117] fix(pageserver): refresh_gc_info should always
 increase cutoff (#9862)

## Problem

close https://github.com/neondatabase/cloud/issues/19671

```
Timeline -----------------------------
         ^ last GC happened LSN
              ^ original retention period setting = 24hr
> refresh-gc-info updates the gc_info
              ^ planned cutoff (gc_info)
         ^ customer set retention to 48hr, and it's still within the last GC LSN
         ^1   ^2 we have two choices: (1) update the planned cutoff to
                 move backwards, or (2) keep the current one
```

In this patch, we decided to keep the current cutoff instead of moving
back the gc_info to avoid races. In the future, we could allow the
planned gc cutoff to go back once cplane sends a retention_history
tenant config update, but this requires a careful revisit of the code.

## Summary of changes

Ensure that GC cutoffs never go back if retention settings get changed.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/tenant.rs | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index e71a56ed40..54fa95fc47 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -4506,7 +4506,12 @@ impl Tenant {
                 // - this timeline was created while we were finding cutoffs
                 // - lsn for timestamp search fails for this timeline repeatedly
                 if let Some(cutoffs) = gc_cutoffs.get(&timeline.timeline_id) {
-                    target.cutoffs = cutoffs.clone();
+                    let original_cutoffs = target.cutoffs.clone();
+                    // GC cutoffs should never go back
+                    target.cutoffs = GcCutoffs {
+                        space: Lsn(cutoffs.space.0.max(original_cutoffs.space.0)),
+                        time: Lsn(cutoffs.time.0.max(original_cutoffs.time.0)),
+                    }
                 }
             }
 

From b853f7813606a02af226d2ef80a92d10dafca527 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 10 Dec 2024 18:26:56 +0200
Subject: [PATCH 097/117] Print a log message if GetPage response takes too
 long (#10046)

We have metrics for GetPage request latencies, but this is an extra
measure to capture requests that take way too long in the logs. The log
message is printed every 10 s, until the response is received:

```
PG:2024-12-09 16:02:07.715 GMT [1782845] LOG:  [NEON_SMGR] [shard 0] no response received from pageserver for 10.000 s, still waiting (sent 10613 requests, received 10612 responses)
PG:2024-12-09 16:02:17.723 GMT [1782845] LOG:  [NEON_SMGR] [shard 0] no response received from pageserver for 20.008 s, still waiting (sent 10613 requests, received 10612 responses)
PG:2024-12-09 16:02:19.719 GMT [1782845] LOG:  [NEON_SMGR] [shard 0] received response from pageserver after 22.006 s
```
---
 pgxn/neon/libpagestore.c | 67 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 66 insertions(+), 1 deletion(-)

diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index b60ae41af3..6513ba4dd6 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -22,6 +22,7 @@
 #include "libpq/pqformat.h"
 #include "miscadmin.h"
 #include "pgstat.h"
+#include "portability/instr_time.h"
 #include "postmaster/interrupt.h"
 #include "storage/buf_internals.h"
 #include "storage/ipc.h"
@@ -118,6 +119,11 @@ typedef struct
 	 */
 	PSConnectionState state;
 	PGconn		   *conn;
+
+	/* request / response counters for debugging */
+	uint64			nrequests_sent;
+	uint64			nresponses_received;
+
 	/*---
 	 * WaitEventSet containing:
 	 *	- WL_SOCKET_READABLE on 'conn'
@@ -628,6 +634,8 @@ pageserver_connect(shardno_t shard_no, int elevel)
 		}
 
 		shard->state = PS_Connected;
+		shard->nrequests_sent = 0;
+		shard->nresponses_received = 0;
 	}
 	/* FALLTHROUGH */
 	case PS_Connected:
@@ -656,6 +664,27 @@ call_PQgetCopyData(shardno_t shard_no, char **buffer)
 	int			ret;
 	PageServer *shard = &page_servers[shard_no];
 	PGconn	   *pageserver_conn = shard->conn;
+	instr_time	now,
+				start_ts,
+				since_start,
+				last_log_ts,
+				since_last_log;
+	bool		logged = false;
+
+	/*
+	 * As a debugging aid, if we don't get a response for a long time, print a
+	 * log message.
+	 *
+	 * 10 s is a very generous threshold, normally we expect a response in a
+	 * few milliseconds. We have metrics to track latencies in normal ranges,
+	 * but in the cases that take exceptionally long, it's useful to log the
+	 * exact timestamps.
+	 */
+#define LOG_INTERVAL_US		UINT64CONST(10 * 1000000)
+
+	INSTR_TIME_SET_CURRENT(now);
+	start_ts = last_log_ts = now;
+	INSTR_TIME_SET_ZERO(since_last_log);
 
 retry:
 	ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ );
@@ -663,9 +692,12 @@ retry:
 	if (ret == 0)
 	{
 		WaitEvent	event;
+		long		timeout;
+
+		timeout = Min(0, LOG_INTERVAL_US - INSTR_TIME_GET_MICROSEC(since_last_log));
 
 		/* Sleep until there's something to do */
-		(void) WaitEventSetWait(shard->wes_read, -1L, &event, 1,
+		(void) WaitEventSetWait(shard->wes_read, timeout, &event, 1,
 								WAIT_EVENT_NEON_PS_READ);
 		ResetLatch(MyLatch);
 
@@ -684,9 +716,40 @@ retry:
 			}
 		}
 
+		/*
+		 * Print a message to the log if a long time has passed with no
+		 * response.
+		 */
+		INSTR_TIME_SET_CURRENT(now);
+		since_last_log = now;
+		INSTR_TIME_SUBTRACT(since_last_log, last_log_ts);
+		if (INSTR_TIME_GET_MICROSEC(since_last_log) >= LOG_INTERVAL_US)
+		{
+			since_start = now;
+			INSTR_TIME_SUBTRACT(since_start, start_ts);
+			neon_shard_log(shard_no, LOG, "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses)",
+						   INSTR_TIME_GET_DOUBLE(since_start),
+						   shard->nrequests_sent, shard->nresponses_received);
+			last_log_ts = now;
+			logged = true;
+		}
+
 		goto retry;
 	}
 
+	/*
+	 * If we logged earlier that the response is taking a long time, log
+	 * another message when the response is finally received.
+	 */
+	if (logged)
+	{
+		INSTR_TIME_SET_CURRENT(now);
+		since_start = now;
+		INSTR_TIME_SUBTRACT(since_start, start_ts);
+		neon_shard_log(shard_no, LOG, "received response from pageserver after %0.3f s",
+					   INSTR_TIME_GET_DOUBLE(since_start));
+	}
+
 	return ret;
 }
 
@@ -786,6 +849,7 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
 	 * PGRES_POLLING_WRITING state. It's kinda dirty to disconnect at this
 	 * point, but on the grand scheme of things it's only a small issue.
 	 */
+	shard->nrequests_sent++;
 	if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
 	{
 		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
@@ -878,6 +942,7 @@ pageserver_receive(shardno_t shard_no)
 		neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: unexpected PQgetCopyData return value: %d", rc);
 	}
 
+	shard->nresponses_received++;
 	return (NeonResponse *) resp;
 }
 

From aa0554fd1ef003ddd6a51e5c40944351caa130ac Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Tue, 10 Dec 2024 12:00:47 -0500
Subject: [PATCH 098/117] feat(test_runner): allowed_errors in storage scrubber
 (#10062)

## Problem

resolve
https://github.com/neondatabase/neon/issues/9988#issuecomment-2528239437

## Summary of changes

* New verbose mode for storage scrubber scan metadata (pageserver) that
contains the error messages.
* Filter allowed_error list from the JSON output to determine the
healthy flag status.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 storage_scrubber/src/main.rs                  |  8 ++-
 .../src/scan_pageserver_metadata.rs           | 39 +++++++++----
 test_runner/fixtures/neon_fixtures.py         | 58 ++++++++++++++++++-
 test_runner/regress/test_storage_scrubber.py  |  8 ++-
 4 files changed, 98 insertions(+), 15 deletions(-)

diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs
index 92979d609e..fa6ee90b66 100644
--- a/storage_scrubber/src/main.rs
+++ b/storage_scrubber/src/main.rs
@@ -86,6 +86,8 @@ enum Command {
         /// For safekeeper node_kind only, json list of timelines and their lsn info
         #[arg(long, default_value = None)]
         timeline_lsns: Option<String>,
+        #[arg(long, default_value_t = false)]
+        verbose: bool,
     },
     TenantSnapshot {
         #[arg(long = "tenant-id")]
@@ -166,6 +168,7 @@ async fn main() -> anyhow::Result<()> {
             dump_db_connstr,
             dump_db_table,
             timeline_lsns,
+            verbose,
         } => {
             if let NodeKind::Safekeeper = node_kind {
                 let db_or_list = match (timeline_lsns, dump_db_connstr) {
@@ -203,6 +206,7 @@ async fn main() -> anyhow::Result<()> {
                     tenant_ids,
                     json,
                     post_to_storcon,
+                    verbose,
                     cli.exit_code,
                 )
                 .await
@@ -313,6 +317,7 @@ pub async fn run_cron_job(
         Vec::new(),
         true,
         post_to_storcon,
+        false, // default to non-verbose mode
         exit_code,
     )
     .await?;
@@ -362,12 +367,13 @@ pub async fn scan_pageserver_metadata_cmd(
     tenant_shard_ids: Vec<TenantShardId>,
     json: bool,
     post_to_storcon: bool,
+    verbose: bool,
     exit_code: bool,
 ) -> anyhow::Result<()> {
     if controller_client.is_none() && post_to_storcon {
         return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run"));
     }
-    match scan_pageserver_metadata(bucket_config.clone(), tenant_shard_ids).await {
+    match scan_pageserver_metadata(bucket_config.clone(), tenant_shard_ids, verbose).await {
         Err(e) => {
             tracing::error!("Failed: {e}");
             Err(e)
diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs
index cb3299d413..c8de6e46b3 100644
--- a/storage_scrubber/src/scan_pageserver_metadata.rs
+++ b/storage_scrubber/src/scan_pageserver_metadata.rs
@@ -21,8 +21,12 @@ pub struct MetadataSummary {
     tenant_count: usize,
     timeline_count: usize,
     timeline_shard_count: usize,
-    with_errors: HashSet<TenantShardTimelineId>,
-    with_warnings: HashSet<TenantShardTimelineId>,
+    /// Tenant-shard timeline (key) mapping to errors. The key has to be a string because it will be serialized to a JSON.
+    /// The key is generated using `TenantShardTimelineId::to_string()`.
+    with_errors: HashMap<String, Vec<String>>,
+    /// Tenant-shard timeline (key) mapping to warnings. The key has to be a string because it will be serialized to a JSON.
+    /// The key is generated using `TenantShardTimelineId::to_string()`.
+    with_warnings: HashMap<String, Vec<String>>,
     with_orphans: HashSet<TenantShardTimelineId>,
     indices_by_version: HashMap<usize, usize>,
 
@@ -52,7 +56,12 @@ impl MetadataSummary {
         }
     }
 
-    fn update_analysis(&mut self, id: &TenantShardTimelineId, analysis: &TimelineAnalysis) {
+    fn update_analysis(
+        &mut self,
+        id: &TenantShardTimelineId,
+        analysis: &TimelineAnalysis,
+        verbose: bool,
+    ) {
         if analysis.is_healthy() {
             self.healthy_tenant_shards.insert(id.tenant_shard_id);
         } else {
@@ -61,11 +70,17 @@ impl MetadataSummary {
         }
 
         if !analysis.errors.is_empty() {
-            self.with_errors.insert(*id);
+            let entry = self.with_errors.entry(id.to_string()).or_default();
+            if verbose {
+                entry.extend(analysis.errors.iter().cloned());
+            }
         }
 
         if !analysis.warnings.is_empty() {
-            self.with_warnings.insert(*id);
+            let entry = self.with_warnings.entry(id.to_string()).or_default();
+            if verbose {
+                entry.extend(analysis.warnings.iter().cloned());
+            }
         }
     }
 
@@ -120,6 +135,7 @@ Index versions: {version_summary}
 pub async fn scan_pageserver_metadata(
     bucket_config: BucketConfig,
     tenant_ids: Vec<TenantShardId>,
+    verbose: bool,
 ) -> anyhow::Result<MetadataSummary> {
     let (remote_client, target) = init_remote(bucket_config, NodeKind::Pageserver).await?;
 
@@ -164,6 +180,7 @@ pub async fn scan_pageserver_metadata(
         mut tenant_objects: TenantObjectListing,
         timelines: Vec<(TenantShardTimelineId, RemoteTimelineBlobData)>,
         highest_shard_count: ShardCount,
+        verbose: bool,
     ) {
         summary.tenant_count += 1;
 
@@ -203,7 +220,7 @@ pub async fn scan_pageserver_metadata(
                         Some(data),
                     )
                     .await;
-                    summary.update_analysis(&ttid, &analysis);
+                    summary.update_analysis(&ttid, &analysis, verbose);
 
                     timeline_ids.insert(ttid.timeline_id);
                 } else {
@@ -271,10 +288,6 @@ pub async fn scan_pageserver_metadata(
         summary.update_data(&data);
 
         match tenant_id {
-            None => {
-                tenant_id = Some(ttid.tenant_shard_id.tenant_id);
-                highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count);
-            }
             Some(prev_tenant_id) => {
                 if prev_tenant_id != ttid.tenant_shard_id.tenant_id {
                     // New tenant: analyze this tenant's timelines, clear accumulated tenant_timeline_results
@@ -287,6 +300,7 @@ pub async fn scan_pageserver_metadata(
                         tenant_objects,
                         timelines,
                         highest_shard_count,
+                        verbose,
                     )
                     .instrument(info_span!("analyze-tenant", tenant = %prev_tenant_id))
                     .await;
@@ -296,6 +310,10 @@ pub async fn scan_pageserver_metadata(
                     highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count);
                 }
             }
+            None => {
+                tenant_id = Some(ttid.tenant_shard_id.tenant_id);
+                highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count);
+            }
         }
 
         match &data.blob_data {
@@ -326,6 +344,7 @@ pub async fn scan_pageserver_metadata(
             tenant_objects,
             tenant_timeline_results,
             highest_shard_count,
+            verbose,
         )
         .instrument(info_span!("analyze-tenant", tenant = %tenant_id))
         .await;
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 60c4a23936..8354432c0c 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -4556,6 +4556,7 @@ class StorageScrubber:
     def __init__(self, env: NeonEnv, log_dir: Path):
         self.env = env
         self.log_dir = log_dir
+        self.allowed_errors: list[str] = []
 
     def scrubber_cli(
         self, args: list[str], timeout, extra_env: dict[str, str] | None = None
@@ -4633,19 +4634,70 @@ class StorageScrubber:
         if timeline_lsns is not None:
             args.append("--timeline-lsns")
             args.append(json.dumps(timeline_lsns))
+        if node_kind == NodeKind.PAGESERVER:
+            args.append("--verbose")
         stdout = self.scrubber_cli(args, timeout=30, extra_env=extra_env)
 
         try:
             summary = json.loads(stdout)
-            # summary does not contain "with_warnings" if node_kind is the safekeeper
-            no_warnings = "with_warnings" not in summary or not summary["with_warnings"]
-            healthy = not summary["with_errors"] and no_warnings
+            healthy = self._check_run_healthy(summary)
             return healthy, summary
         except:
             log.error("Failed to decode JSON output from `scan-metadata`.  Dumping stdout:")
             log.error(stdout)
             raise
 
+    def _check_line_allowed(self, line: str) -> bool:
+        for a in self.allowed_errors:
+            try:
+                if re.match(a, line):
+                    return True
+            except re.error:
+                log.error(f"Invalid regex: '{a}'")
+                raise
+        return False
+
+    def _check_line_list_allowed(self, lines: list[str]) -> bool:
+        for line in lines:
+            if not self._check_line_allowed(line):
+                return False
+        return True
+
+    def _check_run_healthy(self, summary: dict[str, Any]) -> bool:
+        # summary does not contain "with_warnings" if node_kind is the safekeeper
+        healthy = True
+        with_warnings = summary.get("with_warnings", None)
+        if with_warnings is not None:
+            if isinstance(with_warnings, list):
+                if len(with_warnings) > 0:
+                    # safekeeper scan_metadata output is a list of tenants
+                    healthy = False
+            else:
+                for _, warnings in with_warnings.items():
+                    assert (
+                        len(warnings) > 0
+                    ), "with_warnings value should not be empty, running without verbose mode?"
+                    if not self._check_line_list_allowed(warnings):
+                        healthy = False
+                        break
+        if not healthy:
+            return healthy
+        with_errors = summary.get("with_errors", None)
+        if with_errors is not None:
+            if isinstance(with_errors, list):
+                if len(with_errors) > 0:
+                    # safekeeper scan_metadata output is a list of tenants
+                    healthy = False
+            else:
+                for _, errors in with_errors.items():
+                    assert (
+                        len(errors) > 0
+                    ), "with_errors value should not be empty, running without verbose mode?"
+                    if not self._check_line_list_allowed(errors):
+                        healthy = False
+                        break
+        return healthy
+
     def tenant_snapshot(self, tenant_id: TenantId, output_path: Path):
         stdout = self.scrubber_cli(
             ["tenant-snapshot", "--tenant-id", str(tenant_id), "--output-path", str(output_path)],
diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py
index b16dc54c24..198e4f0460 100644
--- a/test_runner/regress/test_storage_scrubber.py
+++ b/test_runner/regress/test_storage_scrubber.py
@@ -572,4 +572,10 @@ def test_scrubber_scan_pageserver_metadata(
     unhealthy = env.storage_controller.metadata_health_list_unhealthy()["unhealthy_tenant_shards"]
     assert len(unhealthy) == 1 and unhealthy[0] == str(tenant_shard_id)
 
-    neon_env_builder.disable_scrub_on_exit()
+    healthy, _ = env.storage_scrubber.scan_metadata()
+    assert not healthy
+    env.storage_scrubber.allowed_errors.append(".*not present in remote storage.*")
+    healthy, _ = env.storage_scrubber.scan_metadata()
+    assert healthy
+
+    neon_env_builder.disable_scrub_on_exit()  # We already ran scrubber, no need to do an extra run

From e71d20d3928dd03e1159ec5a08cfa902cf85cd31 Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <matthias@neon.tech>
Date: Tue, 10 Dec 2024 20:42:52 +0100
Subject: [PATCH 099/117] Emit nbtree vacuum cycle id in nbtree xlog through
 forced FPIs (#9932)

This fixes neondatabase/neon#9929.

## Postgres repo PRS:
- PG17: https://github.com/neondatabase/postgres/pull/538
- PG16: https://github.com/neondatabase/postgres/pull/539
- PG15: https://github.com/neondatabase/postgres/pull/540
- PG14: https://github.com/neondatabase/postgres/pull/541

## Problem
see #9929

## Summary of changes

We update the split code to force the code to emit an FPI whenever the
cycle ID might be interesting for concurrent btree vacuum.
---
 .../regress/test_nbtree_pagesplit_cycleid.py  | 124 ++++++++++++++++++
 vendor/postgres-v14                           |   2 +-
 vendor/postgres-v15                           |   2 +-
 vendor/postgres-v16                           |   2 +-
 vendor/postgres-v17                           |   2 +-
 vendor/revisions.json                         |   8 +-
 6 files changed, 132 insertions(+), 8 deletions(-)
 create mode 100644 test_runner/regress/test_nbtree_pagesplit_cycleid.py

diff --git a/test_runner/regress/test_nbtree_pagesplit_cycleid.py b/test_runner/regress/test_nbtree_pagesplit_cycleid.py
new file mode 100644
index 0000000000..558557aeba
--- /dev/null
+++ b/test_runner/regress/test_nbtree_pagesplit_cycleid.py
@@ -0,0 +1,124 @@
+import threading
+import time
+
+from fixtures.neon_fixtures import NeonEnv
+
+BTREE_NUM_CYCLEID_PAGES = """
+    WITH raw_pages AS (
+        SELECT blkno, get_raw_page_at_lsn('t_uidx', 'main', blkno, NULL, NULL) page
+        FROM generate_series(1, pg_relation_size('t_uidx'::regclass) / 8192) blkno
+    ),
+    parsed_pages AS (
+        /* cycle ID is the last 2 bytes of the btree page */
+        SELECT blkno, SUBSTRING(page FROM 8191 FOR 2) as cycle_id
+        FROM raw_pages
+    )
+    SELECT count(*),
+           encode(cycle_id, 'hex')
+     FROM parsed_pages
+    WHERE encode(cycle_id, 'hex') != '0000'
+    GROUP BY encode(cycle_id, 'hex');
+    """
+
+
+def test_nbtree_pagesplit_cycleid(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+    endpoint = env.endpoints.create_start("main")
+
+    ses1 = endpoint.connect().cursor()
+    ses1.execute("ALTER SYSTEM SET autovacuum = off;")
+    ses1.execute("ALTER SYSTEM SET enable_seqscan = off;")
+    ses1.execute("ALTER SYSTEM SET full_page_writes = off;")
+    ses1.execute("SELECT pg_reload_conf();")
+    ses1.execute("CREATE EXTENSION neon_test_utils;")
+    # prepare a large index
+    ses1.execute("CREATE TABLE t(id integer GENERATED ALWAYS AS IDENTITY, txt text);")
+    ses1.execute("CREATE UNIQUE INDEX t_uidx ON t(id);")
+    ses1.execute("INSERT INTO t (txt) SELECT i::text FROM generate_series(1, 2035) i;")
+
+    ses1.execute("SELECT neon_xlogflush();")
+    ses1.execute(BTREE_NUM_CYCLEID_PAGES)
+    pages = ses1.fetchall()
+    assert (
+        len(pages) == 0
+    ), f"0 back splits with cycle ID expected, real {len(pages)} first {pages[0]}"
+    # Delete enough tuples to clear the first index page.
+    # (there are up to 407 rows per 8KiB page; 406 for non-rightmost leafs.
+    ses1.execute("DELETE FROM t WHERE id <= 406;")
+    # Make sure the page is cleaned up
+    ses1.execute("VACUUM (FREEZE, INDEX_CLEANUP ON) t;")
+
+    # Do another delete-then-indexcleanup cycle, to move the pages from
+    # "dead" to "reusable"
+    ses1.execute("DELETE FROM t WHERE id <= 446;")
+    ses1.execute("VACUUM (FREEZE, INDEX_CLEANUP ON) t;")
+
+    # Make sure the vacuum we're about to trigger in s3 has cleanup work to do
+    ses1.execute("DELETE FROM t WHERE id <= 610;")
+
+    # Flush wal, for checking purposes
+    ses1.execute("SELECT neon_xlogflush();")
+    ses1.execute(BTREE_NUM_CYCLEID_PAGES)
+    pages = ses1.fetchall()
+    assert len(pages) == 0, f"No back splits with cycle ID expected, got batches of {pages} instead"
+
+    ses2 = endpoint.connect().cursor()
+    ses3 = endpoint.connect().cursor()
+
+    # Session 2 pins a btree page, which prevents vacuum from processing that
+    # page, thus allowing us to reliably split pages while a concurrent vacuum
+    # is running.
+    ses2.execute("BEGIN;")
+    ses2.execute(
+        "DECLARE foo NO SCROLL CURSOR FOR SELECT row_number() over () FROM t ORDER BY id ASC"
+    )
+    ses2.execute("FETCH FROM foo;")  # pins the leaf page with id 611
+    wait_evt = threading.Event()
+
+    # Session 3 runs the VACUUM command. Note that this will block, and
+    # therefore must run on another thread.
+    # We rely on this running quickly enough to hit the pinned page from
+    # session 2 by the time we start other work again in session 1, but
+    # technically there is a race where the thread (and/or PostgreSQL process)
+    # don't get to that pinned page with vacuum until >2s after evt.set() was
+    # called, and session 1 thus might already have split pages.
+    def vacuum_freeze_t(ses3, evt: threading.Event):
+        # Begin parallel vacuum that should hit the index
+        evt.set()
+        # this'll hang until s2 fetches enough new data from its cursor.
+        # this is technically a race with the time.sleep(2) below, but if this
+        # command doesn't hit
+        ses3.execute("VACUUM (FREEZE, INDEX_CLEANUP on, DISABLE_PAGE_SKIPPING on) t;")
+
+    ses3t = threading.Thread(target=vacuum_freeze_t, args=(ses3, wait_evt))
+    ses3t.start()
+    wait_evt.wait()
+    # Make extra sure we got the thread started and vacuum is stuck, by waiting
+    # some time even after wait_evt got set. This isn't truly reliable (it is
+    # possible
+    time.sleep(2)
+
+    # Insert 2 pages worth of new data.
+    # This should reuse the one empty page, plus another page at the end of
+    # the index relation; with split ordering
+    #    old_blk -> blkno=1 -> old_blk + 1.
+    # As this is run while vacuum in session 3 is happening, these splits
+    # should receive cycle IDs where applicable.
+    ses1.execute("INSERT INTO t (txt) SELECT i::text FROM generate_series(1, 812) i;")
+    # unpin the btree page, allowing s3's vacuum to complete
+    ses2.execute("FETCH ALL FROM foo;")
+    ses2.execute("ROLLBACK;")
+    # flush WAL to make sure PS is up-to-date
+    ses1.execute("SELECT neon_xlogflush();")
+    # check that our expectations are correct
+    ses1.execute(BTREE_NUM_CYCLEID_PAGES)
+    pages = ses1.fetchall()
+    assert (
+        len(pages) == 1 and pages[0][0] == 3
+    ), f"3 page splits with cycle ID expected; actual {pages}"
+
+    # final cleanup
+    ses3t.join()
+    ses1.close()
+    ses2.close()
+    ses3.close()
diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index 373f9decad..13ff324150 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit 373f9decad933d2d46f321231032ae8b0da81acd
+Subproject commit 13ff324150fceaac72920e01742addc053db9462
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 972e325e62..8736b10c1d 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 972e325e62b455957adbbdd8580e31275bb5b8c9
+Subproject commit 8736b10c1d93d11b9c0489872dd529c4c0f5338f
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index dff6615a8e..81428621f7 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit dff6615a8e48a10bb17a03fa3c00635f1ace7a92
+Subproject commit 81428621f7c04aed03671cf80a928e0a36d92505
diff --git a/vendor/postgres-v17 b/vendor/postgres-v17
index a10d95be67..471c449ab8 160000
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
@@ -1 +1 @@
-Subproject commit a10d95be67265e0f10a422ba0457f5a7af01de71
+Subproject commit 471c449ab8f8ff5988b6bfb9eafa0a79772ad562
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 8a73e14dcf..ba0f34e23e 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,18 +1,18 @@
 {
   "v17": [
     "17.2",
-    "a10d95be67265e0f10a422ba0457f5a7af01de71"
+    "471c449ab8f8ff5988b6bfb9eafa0a79772ad562"
   ],
   "v16": [
     "16.6",
-    "dff6615a8e48a10bb17a03fa3c00635f1ace7a92"
+    "81428621f7c04aed03671cf80a928e0a36d92505"
   ],
   "v15": [
     "15.10",
-    "972e325e62b455957adbbdd8580e31275bb5b8c9"
+    "8736b10c1d93d11b9c0489872dd529c4c0f5338f"
   ],
   "v14": [
     "14.15",
-    "373f9decad933d2d46f321231032ae8b0da81acd"
+    "13ff324150fceaac72920e01742addc053db9462"
   ]
 }

From 597125e124b3d92ef3a0ca243722aa0f99238037 Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <matthias@neon.tech>
Date: Wed, 11 Dec 2024 01:51:05 +0100
Subject: [PATCH 100/117] Disable readstream's reliance on seqscan readahead
 (#9860)

Neon doesn't have seqscan detection of its own, so stop read_stream from
trying to utilize that readahead, and instead make it issue readahead of
its own.

## Problem

@knizhnik noticed that we didn't issue smgrprefetch[v] calls for
seqscans in PG17 due to the move to the read_stream API, which assumes
that the underlying IO facilities do seqscan detection for readahead.
That is a wrong assumption when Neon is involved, so let's remove the
code that applies that assumption.

## Summary of changes
Remove the cases where seqscans are detected and prefetch is disabled as
a consequence, and instead don't do that detection.

PG PR: https://github.com/neondatabase/postgres/pull/532
---
 vendor/postgres-v17   | 2 +-
 vendor/revisions.json | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vendor/postgres-v17 b/vendor/postgres-v17
index 471c449ab8..01fa3c4866 160000
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
@@ -1 +1 @@
-Subproject commit 471c449ab8f8ff5988b6bfb9eafa0a79772ad562
+Subproject commit 01fa3c48664ca030cfb69bb4a350aa9df4691d88
diff --git a/vendor/revisions.json b/vendor/revisions.json
index ba0f34e23e..7329aa437f 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,7 +1,7 @@
 {
   "v17": [
     "17.2",
-    "471c449ab8f8ff5988b6bfb9eafa0a79772ad562"
+    "01fa3c48664ca030cfb69bb4a350aa9df4691d88"
   ],
   "v16": [
     "16.6",

From 38415a9816dab1ccd1fadea77857aafb369abdf6 Mon Sep 17 00:00:00 2001
From: John Spray <john@neon.tech>
Date: Wed, 11 Dec 2024 09:16:11 +0000
Subject: [PATCH 101/117] pageserver: fix ingest handling of CLog truncate
 (#10080)

## Problem

In #9786 we stop storing SLRUs on non-zero shards.

However, there was one code path during ingest that still tries to
enumerate SLRU relations on all shards. This fails if it sees a tenant
who has never seen any write to an SLRU, or who has done such thorough
compaction+GC that it has dropped its SLRU directory key.

## Summary of changes

- Avoid trying to list SLRU relations on nonzero shards
---
 pageserver/src/walingest.rs | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 30c8965d51..b7712cfac7 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -877,22 +877,24 @@ impl WalIngest {
         // will block waiting for the last valid LSN to advance up to
         // it. So we use the previous record's LSN in the get calls
         // instead.
-        for segno in modification
-            .tline
-            .list_slru_segments(SlruKind::Clog, Version::Modified(modification), ctx)
-            .await?
-        {
-            let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
+        if modification.tline.get_shard_identity().is_shard_zero() {
+            for segno in modification
+                .tline
+                .list_slru_segments(SlruKind::Clog, Version::Modified(modification), ctx)
+                .await?
+            {
+                let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
 
-            let may_delete = dispatch_pgversion!(modification.tline.pg_version, {
-                pgv::nonrelfile_utils::slru_may_delete_clogsegment(segpage, pageno)
-            });
+                let may_delete = dispatch_pgversion!(modification.tline.pg_version, {
+                    pgv::nonrelfile_utils::slru_may_delete_clogsegment(segpage, pageno)
+                });
 
-            if may_delete {
-                modification
-                    .drop_slru_segment(SlruKind::Clog, segno, ctx)
-                    .await?;
-                trace!("Drop CLOG segment {:>04X}", segno);
+                if may_delete {
+                    modification
+                        .drop_slru_segment(SlruKind::Clog, segno, ctx)
+                        .await?;
+                    trace!("Drop CLOG segment {:>04X}", segno);
+                }
             }
         }
 

From d7aeca2f343675f17f2adee05082f89644715469 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= <jc@neon.tech>
Date: Wed, 11 Dec 2024 10:41:34 +0100
Subject: [PATCH 102/117] CI(deploy): create git tags/releases before
 triggering deploy workflows (#10022)

## Problem

When dev deployments are disabled (or fail), the tags for releases
aren't created. It makes more sense to have tag and release creation
before the deployment to prevent situations like
[this](https://github.com/neondatabase/neon/pull/9959).

It is not enough to move the tag creation before the deployment. If the
deployment fails, re-running the job isn't possible because the API call
to create the tag will fail.

## Summary of changes

- Tag/Release creation now happens before the deployment
- The two steps for tag and release have been merged into a bigger one
- There's new checks to ensure the that if the tags/releases already
exist as expected, things will continue just fine.
---
 .github/workflows/build_and_test.yml | 93 +++++++++++++++++++---------
 1 file changed, 64 insertions(+), 29 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index cb966f292e..6023d1bb6f 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -1066,6 +1066,70 @@ jobs:
     steps:
       - uses: actions/checkout@v4
 
+      - name: Create git tag and GitHub release
+        if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
+        uses: actions/github-script@v7
+        with:
+          retries: 5
+          script: |
+            const tag = "${{ needs.tag.outputs.build-tag }}";
+
+            try {
+              const existingRef = await github.rest.git.getRef({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                ref: `tags/${tag}`,
+              });
+
+              if (existingRef.data.object.sha !== context.sha) {
+                throw new Error(`Tag ${tag} already exists but points to a different commit (expected: ${context.sha}, actual: ${existingRef.data.object.sha}).`);
+              }
+
+              console.log(`Tag ${tag} already exists and points to ${context.sha} as expected.`);
+            } catch (error) {
+              if (error.status !== 404) {
+                throw error;
+              }
+
+              console.log(`Tag ${tag} does not exist. Creating it...`);
+              await github.rest.git.createRef({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                ref: `refs/tags/${tag}`,
+                sha: context.sha,
+              });
+              console.log(`Tag ${tag} created successfully.`);
+            }
+
+            # TODO: check how GitHub releases looks for proxy/compute releases and enable them if they're ok
+            if (context.ref !== 'refs/heads/release') {
+              console.log(`GitHub release skipped for ${context.ref}.`);
+              return;
+            }
+
+            try {
+              const existingRelease = await github.rest.repos.getReleaseByTag({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                tag: tag,
+              });
+
+              console.log(`Release for tag ${tag} already exists (ID: ${existingRelease.data.id}).`);
+            } catch (error) {
+              if (error.status !== 404) {
+                throw error;
+              }
+
+              console.log(`Release for tag ${tag} does not exist. Creating it...`);
+              await github.rest.repos.createRelease({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                tag_name: tag,
+                generate_release_notes: true,
+              });
+              console.log(`Release for tag ${tag} created successfully.`);
+            }
+
       - name: Trigger deploy workflow
         env:
           GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
@@ -1115,35 +1179,6 @@ jobs:
             exit 1
           fi
 
-      - name: Create git tag
-        if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
-        uses: actions/github-script@v7
-        with:
-          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
-          retries: 5
-          script: |
-            await github.rest.git.createRef({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              ref: "refs/tags/${{ needs.tag.outputs.build-tag }}",
-              sha: context.sha,
-            })
-
-      # TODO: check how GitHub releases looks for proxy releases and enable it if it's ok
-      - name: Create GitHub release
-        if: github.ref_name == 'release'
-        uses: actions/github-script@v7
-        with:
-          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
-          retries: 5
-          script: |
-            await github.rest.repos.createRelease({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              tag_name: "${{ needs.tag.outputs.build-tag }}",
-              generate_release_notes: true,
-            })
-
   # The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory
   promote-compatibility-data:
     needs: [ deploy ]

From 665369c439d6ec1d107dea7ccc80ce64b080297a Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Wed, 11 Dec 2024 12:35:02 +0000
Subject: [PATCH 103/117] wal_decoder: fix compact key protobuf encoding
 (#10074)

## Problem

Protobuf doesn't support 128 bit integers, so we encode the keys as two
64 bit integers. Issue is that when we split the 128 bit compact key we
use signed 64 bit integers to represent the two halves. This may result
in a negative lower half when relnode is larger than `0x00800000`. When
we convert the lower half to an i128 we get a negative `CompactKey`.

## Summary of Changes

Use unsigned integers when encoding into Protobuf.

## Deployment

* Prod: We disabled the interpreted proto, so no compat concerns.
* Staging: Disable the interpreted proto, do one release, and then
release the fixed version.
We do this because a negative int32 will convert to a large uint32 value
and could give
a key in the actual pageserver space. In production we would around this
by adding new
fields to the proto and deprecating the old ones, but we can make our
lives easy here.
* Pre-prod: Same as staging
---
 libs/pageserver_api/src/key.rs               |  2 +-
 libs/wal_decoder/proto/interpreted_wal.proto |  4 +-
 libs/wal_decoder/src/wire_format.rs          | 65 +++++++++++++++++++-
 3 files changed, 66 insertions(+), 5 deletions(-)

diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs
index 37dff6fe46..373329c9b4 100644
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -24,7 +24,7 @@ pub struct Key {
 
 /// When working with large numbers of Keys in-memory, it is more efficient to handle them as i128 than as
 /// a struct of fields.
-#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)]
+#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize, Debug)]
 pub struct CompactKey(i128);
 
 /// The storage key size.
diff --git a/libs/wal_decoder/proto/interpreted_wal.proto b/libs/wal_decoder/proto/interpreted_wal.proto
index 0393392c1a..d68484d30f 100644
--- a/libs/wal_decoder/proto/interpreted_wal.proto
+++ b/libs/wal_decoder/proto/interpreted_wal.proto
@@ -37,7 +37,7 @@ message ValueMeta {
 }
 
 message CompactKey {
-  int64 high = 1;
-  int64 low = 2;
+  uint64 high = 1;
+  uint64 low = 2;
 }
 
diff --git a/libs/wal_decoder/src/wire_format.rs b/libs/wal_decoder/src/wire_format.rs
index 5a343054c3..944ee5c919 100644
--- a/libs/wal_decoder/src/wire_format.rs
+++ b/libs/wal_decoder/src/wire_format.rs
@@ -236,8 +236,8 @@ impl From<ValueMeta> for proto::ValueMeta {
 impl From<CompactKey> for proto::CompactKey {
     fn from(value: CompactKey) -> Self {
         proto::CompactKey {
-            high: (value.raw() >> 64) as i64,
-            low: value.raw() as i64,
+            high: (value.raw() >> 64) as u64,
+            low: value.raw() as u64,
         }
     }
 }
@@ -354,3 +354,64 @@ impl From<proto::CompactKey> for CompactKey {
         (((value.high as i128) << 64) | (value.low as i128)).into()
     }
 }
+
+#[test]
+fn test_compact_key_with_large_relnode() {
+    use pageserver_api::key::Key;
+
+    let inputs = vec![
+        Key {
+            field1: 0,
+            field2: 0x100,
+            field3: 0x200,
+            field4: 0,
+            field5: 0x10,
+            field6: 0x5,
+        },
+        Key {
+            field1: 0,
+            field2: 0x100,
+            field3: 0x200,
+            field4: 0x007FFFFF,
+            field5: 0x10,
+            field6: 0x5,
+        },
+        Key {
+            field1: 0,
+            field2: 0x100,
+            field3: 0x200,
+            field4: 0x00800000,
+            field5: 0x10,
+            field6: 0x5,
+        },
+        Key {
+            field1: 0,
+            field2: 0x100,
+            field3: 0x200,
+            field4: 0x00800001,
+            field5: 0x10,
+            field6: 0x5,
+        },
+        Key {
+            field1: 0,
+            field2: 0xFFFFFFFF,
+            field3: 0xFFFFFFFF,
+            field4: 0xFFFFFFFF,
+            field5: 0x0,
+            field6: 0x0,
+        },
+    ];
+
+    for input in inputs {
+        assert!(input.is_valid_key_on_write_path());
+        let compact = input.to_compact();
+        let proto: proto::CompactKey = compact.into();
+        let from_proto: CompactKey = proto.into();
+
+        assert_eq!(
+            compact, from_proto,
+            "Round trip failed for key with relnode={:#x}",
+            input.field4
+        );
+    }
+}

From 9ae980bf4f320c20dc48adce90e92e74fd2ea45c Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 11 Dec 2024 14:37:08 +0100
Subject: [PATCH 104/117] page_service: don't count time spent in Batcher
 towards smgr latency metrics (#10075)

## Problem

With pipelining enabled, the time a request spends in the batcher stage
counts towards the smgr op latency.

If pipelining is disabled, that time is not accounted for.

In practice, this results in a jump in smgr getpage latencies in various
dashboards and degrades the internal SLO.

## Solution

In a similar vein to #10042 and with a similar rationale, this PR stops
counting the time spent in batcher stage towards smgr op latency.

The smgr op latency metric is reduced to the actual execution time.

Time spent in batcher stage is tracked in a separate histogram.
I expect to remove that histogram after batching rollout is complete,
but it will be helpful in the meantime to reason about the rollout.
---
 pageserver/src/metrics.rs         | 168 +++++++++++++++++++++---------
 pageserver/src/page_service.rs    |  13 ++-
 pageserver/src/tenant/throttle.rs |  19 ++--
 test_runner/fixtures/metrics.py   |   1 +
 4 files changed, 143 insertions(+), 58 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 96ee157856..b4e20cb8b9 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -16,7 +16,6 @@ use postgres_backend::{is_expected_io_error, QueryError};
 use pq_proto::framed::ConnectionError;
 use strum::{EnumCount, VariantNames};
 use strum_macros::{IntoStaticStr, VariantNames};
-use tracing::warn;
 use utils::id::TimelineId;
 
 /// Prometheus histogram buckets (in seconds) for operations in the critical
@@ -1225,32 +1224,58 @@ pub(crate) mod virtual_file_io_engine {
 
 pub(crate) struct SmgrOpTimer(Option<SmgrOpTimerInner>);
 pub(crate) struct SmgrOpTimerInner {
-    global_latency_histo: Histogram,
+    global_execution_latency_histo: Histogram,
+    per_timeline_execution_latency_histo: Option<Histogram>,
 
-    // Optional because not all op types are tracked per-timeline
-    per_timeline_latency_histo: Option<Histogram>,
+    global_batch_wait_time: Histogram,
+    per_timeline_batch_wait_time: Histogram,
 
     global_flush_in_progress_micros: IntCounter,
     per_timeline_flush_in_progress_micros: IntCounter,
 
-    start: Instant,
-    throttled: Duration,
-    op: SmgrQueryType,
+    timings: SmgrOpTimerState,
+}
+
+#[derive(Debug)]
+enum SmgrOpTimerState {
+    Received {
+        received_at: Instant,
+    },
+    ThrottleDoneExecutionStarting {
+        received_at: Instant,
+        throttle_started_at: Instant,
+        started_execution_at: Instant,
+    },
 }
 
 pub(crate) struct SmgrOpFlushInProgress {
-    base: Instant,
+    flush_started_at: Instant,
     global_micros: IntCounter,
     per_timeline_micros: IntCounter,
 }
 
 impl SmgrOpTimer {
-    pub(crate) fn deduct_throttle(&mut self, throttle: &Option<Duration>) {
-        let Some(throttle) = throttle else {
-            return;
-        };
+    pub(crate) fn observe_throttle_done_execution_starting(&mut self, throttle: &ThrottleResult) {
         let inner = self.0.as_mut().expect("other public methods consume self");
-        inner.throttled += *throttle;
+        match (&mut inner.timings, throttle) {
+            (SmgrOpTimerState::Received { received_at }, throttle) => match throttle {
+                ThrottleResult::NotThrottled { start } => {
+                    inner.timings = SmgrOpTimerState::ThrottleDoneExecutionStarting {
+                        received_at: *received_at,
+                        throttle_started_at: *start,
+                        started_execution_at: *start,
+                    };
+                }
+                ThrottleResult::Throttled { start, end } => {
+                    inner.timings = SmgrOpTimerState::ThrottleDoneExecutionStarting {
+                        received_at: *start,
+                        throttle_started_at: *start,
+                        started_execution_at: *end,
+                    };
+                }
+            },
+            (x, _) => panic!("called in unexpected state: {x:?}"),
+        }
     }
 
     pub(crate) fn observe_smgr_op_completion_and_start_flushing(mut self) -> SmgrOpFlushInProgress {
@@ -1263,7 +1288,7 @@ impl SmgrOpTimer {
             ..
         } = inner;
         SmgrOpFlushInProgress {
-            base: flush_start,
+            flush_started_at: flush_start,
             global_micros: global_flush_in_progress_micros,
             per_timeline_micros: per_timeline_flush_in_progress_micros,
         }
@@ -1274,32 +1299,42 @@ impl SmgrOpTimer {
         let inner = self.0.take()?;
 
         let now = Instant::now();
-        let elapsed = now - inner.start;
 
-        let elapsed = match elapsed.checked_sub(inner.throttled) {
-            Some(elapsed) => elapsed,
-            None => {
-                use utils::rate_limit::RateLimit;
-                static LOGGED: Lazy<Mutex<enum_map::EnumMap<SmgrQueryType, RateLimit>>> =
-                    Lazy::new(|| {
-                        Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| {
-                            RateLimit::new(Duration::from_secs(10))
-                        })))
-                    });
-                let mut guard = LOGGED.lock().unwrap();
-                let rate_limit = &mut guard[inner.op];
-                rate_limit.call(|| {
-                    warn!(op=?inner.op, ?elapsed, ?inner.throttled, "implementation error: time spent throttled exceeds total request wall clock time");
-                });
-                elapsed // un-throttled time, more info than just saturating to 0
+        let batch;
+        let execution;
+        let throttle;
+        match inner.timings {
+            SmgrOpTimerState::Received { received_at } => {
+                batch = (now - received_at).as_secs_f64();
+                // TODO: use label for dropped requests.
+                // This is quite rare in practice, only during tenant/pageservers shutdown.
+                throttle = Duration::ZERO;
+                execution = Duration::ZERO.as_secs_f64();
             }
-        };
+            SmgrOpTimerState::ThrottleDoneExecutionStarting {
+                received_at,
+                throttle_started_at,
+                started_execution_at,
+            } => {
+                batch = (throttle_started_at - received_at).as_secs_f64();
+                throttle = started_execution_at - throttle_started_at;
+                execution = (now - started_execution_at).as_secs_f64();
+            }
+        }
 
-        let elapsed = elapsed.as_secs_f64();
+        // update time spent in batching
+        inner.global_batch_wait_time.observe(batch);
+        inner.per_timeline_batch_wait_time.observe(batch);
 
-        inner.global_latency_histo.observe(elapsed);
-        if let Some(per_timeline_getpage_histo) = &inner.per_timeline_latency_histo {
-            per_timeline_getpage_histo.observe(elapsed);
+        // time spent in throttle metric is updated by throttle impl
+        let _ = throttle;
+
+        // update metrics for execution latency
+        inner.global_execution_latency_histo.observe(execution);
+        if let Some(per_timeline_execution_latency_histo) =
+            &inner.per_timeline_execution_latency_histo
+        {
+            per_timeline_execution_latency_histo.observe(execution);
         }
 
         Some((now, inner))
@@ -1325,12 +1360,12 @@ impl SmgrOpFlushInProgress {
         // Last call is tracked in `now`.
         let mut observe_guard = scopeguard::guard(
             || {
-                let elapsed = now - self.base;
+                let elapsed = now - self.flush_started_at;
                 self.global_micros
                     .inc_by(u64::try_from(elapsed.as_micros()).unwrap());
                 self.per_timeline_micros
                     .inc_by(u64::try_from(elapsed.as_micros()).unwrap());
-                self.base = now;
+                self.flush_started_at = now;
             },
             |mut observe| {
                 observe();
@@ -1377,6 +1412,8 @@ pub(crate) struct SmgrQueryTimePerTimeline {
     per_timeline_batch_size: Histogram,
     global_flush_in_progress_micros: IntCounter,
     per_timeline_flush_in_progress_micros: IntCounter,
+    global_batch_wait_time: Histogram,
+    per_timeline_batch_wait_time: Histogram,
 }
 
 static SMGR_QUERY_STARTED_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
@@ -1399,12 +1436,15 @@ static SMGR_QUERY_STARTED_PER_TENANT_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|
     .expect("failed to define a metric")
 });
 
+// Alias so all histograms recording per-timeline smgr timings use the same buckets.
+static SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS: &[f64] = CRITICAL_OP_BUCKETS;
+
 static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
     register_histogram_vec!(
         "pageserver_smgr_query_seconds",
-        "Time spent on smgr query handling, aggegated by query type and tenant/timeline.",
+        "Time spent _executing_ smgr query handling, excluding batch and throttle delays.",
         &["smgr_query_type", "tenant_id", "shard_id", "timeline_id"],
-        CRITICAL_OP_BUCKETS.into(),
+        SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS.into(),
     )
     .expect("failed to define a metric")
 });
@@ -1462,7 +1502,7 @@ static SMGR_QUERY_TIME_GLOBAL_BUCKETS: Lazy<Vec<f64>> = Lazy::new(|| {
 static SMGR_QUERY_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
     register_histogram_vec!(
         "pageserver_smgr_query_seconds_global",
-        "Time spent on smgr query handling, aggregated by query type.",
+        "Like pageserver_smgr_query_seconds, but aggregated to instance level.",
         &["smgr_query_type"],
         SMGR_QUERY_TIME_GLOBAL_BUCKETS.clone(),
     )
@@ -1559,6 +1599,25 @@ static PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL: Lazy<IntCounter> = Lazy
     .expect("failed to define a metric")
 });
 
+static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "pageserver_page_service_pagestream_batch_wait_time_seconds",
+        "Time a request spent waiting in its batch until the batch moved to throttle&execution.",
+        &["tenant_id", "shard_id", "timeline_id"],
+        SMGR_QUERY_TIME_PER_TENANT_TIMELINE_BUCKETS.into(),
+    )
+    .expect("failed to define a metric")
+});
+
+static PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "pageserver_page_service_pagestream_batch_wait_time_seconds_global",
+        "Like pageserver_page_service_pagestream_batch_wait_time_seconds, but aggregated to instance level.",
+        SMGR_QUERY_TIME_GLOBAL_BUCKETS.to_vec(),
+    )
+    .expect("failed to define a metric")
+});
+
 impl SmgrQueryTimePerTimeline {
     pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
         let tenant_id = tenant_shard_id.tenant_id.to_string();
@@ -1599,6 +1658,11 @@ impl SmgrQueryTimePerTimeline {
             .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
             .unwrap();
 
+        let global_batch_wait_time = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL.clone();
+        let per_timeline_batch_wait_time = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME
+            .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
+            .unwrap();
+
         let global_flush_in_progress_micros =
             PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL.clone();
         let per_timeline_flush_in_progress_micros = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS
@@ -1614,9 +1678,11 @@ impl SmgrQueryTimePerTimeline {
             per_timeline_batch_size,
             global_flush_in_progress_micros,
             per_timeline_flush_in_progress_micros,
+            global_batch_wait_time,
+            per_timeline_batch_wait_time,
         }
     }
-    pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, started_at: Instant) -> SmgrOpTimer {
+    pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, received_at: Instant) -> SmgrOpTimer {
         self.global_started[op as usize].inc();
 
         let per_timeline_latency_histo = if matches!(op, SmgrQueryType::GetPageAtLsn) {
@@ -1627,15 +1693,15 @@ impl SmgrQueryTimePerTimeline {
         };
 
         SmgrOpTimer(Some(SmgrOpTimerInner {
-            global_latency_histo: self.global_latency[op as usize].clone(),
-            per_timeline_latency_histo,
-            start: started_at,
-            op,
-            throttled: Duration::ZERO,
+            global_execution_latency_histo: self.global_latency[op as usize].clone(),
+            per_timeline_execution_latency_histo: per_timeline_latency_histo,
+            timings: SmgrOpTimerState::Received { received_at },
             global_flush_in_progress_micros: self.global_flush_in_progress_micros.clone(),
             per_timeline_flush_in_progress_micros: self
                 .per_timeline_flush_in_progress_micros
                 .clone(),
+            global_batch_wait_time: self.global_batch_wait_time.clone(),
+            per_timeline_batch_wait_time: self.per_timeline_batch_wait_time.clone(),
         }))
     }
 
@@ -2889,6 +2955,11 @@ impl TimelineMetrics {
             shard_id,
             timeline_id,
         ]);
+        let _ = PAGE_SERVICE_SMGR_BATCH_WAIT_TIME.remove_label_values(&[
+            tenant_id,
+            shard_id,
+            timeline_id,
+        ]);
     }
 }
 
@@ -2919,6 +2990,7 @@ use crate::context::{PageContentKind, RequestContext};
 use crate::task_mgr::TaskKind;
 use crate::tenant::mgr::TenantSlot;
 use crate::tenant::tasks::BackgroundLoopKind;
+use crate::tenant::throttle::ThrottleResult;
 use crate::tenant::Timeline;
 
 /// Maintain a per timeline gauge in addition to the global gauge.
@@ -3773,6 +3845,7 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) {
         &REMOTE_ONDEMAND_DOWNLOADED_BYTES,
         &CIRCUIT_BREAKERS_BROKEN,
         &CIRCUIT_BREAKERS_UNBROKEN,
+        &PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL,
     ]
     .into_iter()
     .for_each(|c| {
@@ -3820,6 +3893,7 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) {
         &WAL_REDO_BYTES_HISTOGRAM,
         &WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM,
         &PAGE_SERVICE_BATCH_SIZE_GLOBAL,
+        &PAGE_SERVICE_SMGR_BATCH_WAIT_TIME_GLOBAL,
     ]
     .into_iter()
     .for_each(|h| {
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 97d94bbe7f..d00ec11a76 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -575,7 +575,10 @@ enum BatchedFeMessage {
 }
 
 impl BatchedFeMessage {
-    async fn throttle(&mut self, cancel: &CancellationToken) -> Result<(), QueryError> {
+    async fn throttle_and_record_start_processing(
+        &mut self,
+        cancel: &CancellationToken,
+    ) -> Result<(), QueryError> {
         let (shard, tokens, timers) = match self {
             BatchedFeMessage::Exists { shard, timer, .. }
             | BatchedFeMessage::Nblocks { shard, timer, .. }
@@ -603,7 +606,7 @@ impl BatchedFeMessage {
             }
         };
         for timer in timers {
-            timer.deduct_throttle(&throttled);
+            timer.observe_throttle_done_execution_starting(&throttled);
         }
         Ok(())
     }
@@ -1230,7 +1233,7 @@ impl PageServerHandler {
                 }
             };
 
-            if let Err(cancelled) = msg.throttle(&self.cancel).await {
+            if let Err(cancelled) = msg.throttle_and_record_start_processing(&self.cancel).await {
                 break cancelled;
             }
 
@@ -1397,7 +1400,9 @@ impl PageServerHandler {
                             return Err(e);
                         }
                     };
-                    batch.throttle(&self.cancel).await?;
+                    batch
+                        .throttle_and_record_start_processing(&self.cancel)
+                        .await?;
                     self.pagesteam_handle_batched_message(pgb_writer, batch, &cancel, &ctx)
                         .await?;
                 }
diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs
index 54c0e59daa..8ab6a0e060 100644
--- a/pageserver/src/tenant/throttle.rs
+++ b/pageserver/src/tenant/throttle.rs
@@ -58,6 +58,11 @@ pub struct Stats {
     pub sum_throttled_usecs: u64,
 }
 
+pub enum ThrottleResult {
+    NotThrottled { start: Instant },
+    Throttled { start: Instant, end: Instant },
+}
+
 impl<M> Throttle<M>
 where
     M: Metric,
@@ -122,15 +127,15 @@ where
         self.inner.load().rate_limiter.steady_rps()
     }
 
-    pub async fn throttle(&self, key_count: usize) -> Option<Duration> {
+    pub async fn throttle(&self, key_count: usize) -> ThrottleResult {
         let inner = self.inner.load_full(); // clones the `Inner` Arc
 
-        if !inner.enabled {
-            return None;
-        }
-
         let start = std::time::Instant::now();
 
+        if !inner.enabled {
+            return ThrottleResult::NotThrottled { start };
+        }
+
         self.metric.accounting_start();
         self.count_accounted_start.fetch_add(1, Ordering::Relaxed);
         let did_throttle = inner.rate_limiter.acquire(key_count).await;
@@ -145,9 +150,9 @@ where
                 .fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed);
             let observation = Observation { wait_time };
             self.metric.observe_throttling(&observation);
-            Some(wait_time)
+            ThrottleResult::Throttled { start, end: now }
         } else {
-            None
+            ThrottleResult::NotThrottled { start }
         }
     }
 }
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index a591e088ef..c5295360c3 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -178,6 +178,7 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
     counter("pageserver_timeline_wal_records_received"),
     counter("pageserver_page_service_pagestream_flush_in_progress_micros"),
     *histogram("pageserver_page_service_batch_size"),
+    *histogram("pageserver_page_service_pagestream_batch_wait_time_seconds"),
     *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
     # "pageserver_directory_entries_count", -- only used if above a certain threshold
     # "pageserver_broken_tenants_count" -- used only for broken

From a53db7385151d3bc9b18d7e70b110cee36d5d32c Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Wed, 11 Dec 2024 14:28:18 +0000
Subject: [PATCH 105/117] pageserver: don't drop multixact slrus on non zero
 shards (#10086)

## Problem

We get slru truncation commands on non-zero shards.
Compaction will drop the slru dir keys and ingest will fail when
receiving such records.
https://github.com/neondatabase/neon/pull/10080 fixed it for clog, but
not for multixact.

## Summary of changes

Only truncate multixact slrus on shard zero. I audited the rest of the
ingest code and it looks
fine from this pov.
---
 pageserver/src/walingest.rs | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index b7712cfac7..e5b23fed51 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -1049,16 +1049,18 @@ impl WalIngest {
 
         // Delete all the segments except the last one. The last segment can still
         // contain, possibly partially, valid data.
-        while segment != endsegment {
-            modification
-                .drop_slru_segment(SlruKind::MultiXactMembers, segment as u32, ctx)
-                .await?;
+        if modification.tline.get_shard_identity().is_shard_zero() {
+            while segment != endsegment {
+                modification
+                    .drop_slru_segment(SlruKind::MultiXactMembers, segment as u32, ctx)
+                    .await?;
 
-            /* move to next segment, handling wraparound correctly */
-            if segment == maxsegment {
-                segment = 0;
-            } else {
-                segment += 1;
+                /* move to next segment, handling wraparound correctly */
+                if segment == maxsegment {
+                    segment = 0;
+                } else {
+                    segment += 1;
+                }
             }
         }
 

From c79c1dd8e90a8d813c0e02d8dcbfaaae99779b8e Mon Sep 17 00:00:00 2001
From: Mikhail Kot <mikhail@neon.tech>
Date: Wed, 11 Dec 2024 15:03:11 +0000
Subject: [PATCH 106/117] compute_ctl: don't panic if control plane can't be
 reached (#10078)

## Problem

If the control plane cannot be reached for some reason, compute_ctl
panics

## Summary of changes

panic is removed in favour of returning an error.
Code is reformatted a bit for more flat control flow

Resolves: #5391
---
 compute_tools/src/bin/compute_ctl.rs | 79 ++++++++++++++--------------
 1 file changed, 40 insertions(+), 39 deletions(-)

diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs
index e73ccd908e..bb248734a8 100644
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -246,47 +246,48 @@ fn try_spec_from_cli(
     let compute_id = matches.get_one::<String>("compute-id");
     let control_plane_uri = matches.get_one::<String>("control-plane-uri");
 
-    let spec;
-    let mut live_config_allowed = false;
-    match spec_json {
-        // First, try to get cluster spec from the cli argument
-        Some(json) => {
-            info!("got spec from cli argument {}", json);
-            spec = Some(serde_json::from_str(json)?);
-        }
-        None => {
-            // Second, try to read it from the file if path is provided
-            if let Some(sp) = spec_path {
-                let path = Path::new(sp);
-                let file = File::open(path)?;
-                spec = Some(serde_json::from_reader(file)?);
-                live_config_allowed = true;
-            } else if let Some(id) = compute_id {
-                if let Some(cp_base) = control_plane_uri {
-                    live_config_allowed = true;
-                    spec = match get_spec_from_control_plane(cp_base, id) {
-                        Ok(s) => s,
-                        Err(e) => {
-                            error!("cannot get response from control plane: {}", e);
-                            panic!("neither spec nor confirmation that compute is in the Empty state was received");
-                        }
-                    };
-                } else {
-                    panic!("must specify both --control-plane-uri and --compute-id or none");
-                }
-            } else {
-                panic!(
-                    "compute spec should be provided by one of the following ways: \
-                    --spec OR --spec-path OR --control-plane-uri and --compute-id"
-                );
-            }
-        }
+    // First, try to get cluster spec from the cli argument
+    if let Some(spec_json) = spec_json {
+        info!("got spec from cli argument {}", spec_json);
+        return Ok(CliSpecParams {
+            spec: Some(serde_json::from_str(spec_json)?),
+            live_config_allowed: false,
+        });
+    }
+
+    // Second, try to read it from the file if path is provided
+    if let Some(spec_path) = spec_path {
+        let file = File::open(Path::new(spec_path))?;
+        return Ok(CliSpecParams {
+            spec: Some(serde_json::from_reader(file)?),
+            live_config_allowed: true,
+        });
+    }
+
+    let Some(compute_id) = compute_id else {
+        panic!(
+            "compute spec should be provided by one of the following ways: \
+                --spec OR --spec-path OR --control-plane-uri and --compute-id"
+        );
+    };
+    let Some(control_plane_uri) = control_plane_uri else {
+        panic!("must specify both --control-plane-uri and --compute-id or none");
     };
 
-    Ok(CliSpecParams {
-        spec,
-        live_config_allowed,
-    })
+    match get_spec_from_control_plane(control_plane_uri, compute_id) {
+        Ok(spec) => Ok(CliSpecParams {
+            spec,
+            live_config_allowed: true,
+        }),
+        Err(e) => {
+            error!(
+                "cannot get response from control plane: {}\n\
+                neither spec nor confirmation that compute is in the Empty state was received",
+                e
+            );
+            Err(e)
+        }
+    }
 }
 
 struct CliSpecParams {

From b987648e713b7842be1a41ab2eca898a9c90eefb Mon Sep 17 00:00:00 2001
From: a-masterov <72613290+a-masterov@users.noreply.github.com>
Date: Wed, 11 Dec 2024 16:28:10 +0100
Subject: [PATCH 107/117] Enable LFC for all the PG versions. (#10068)

## Problem
We added support for LFC for tests but are still using it only for the
PG17 release.

## Summary of changes
LFC is enabled for all PG versions. Errors in tests with LFC enabled now
block merging as usual. We keep tests with disabled LFC for PG17
release. Tests on debug builds with LFC enabled still don't affect
permission to merge.
---
 .github/workflows/_build-and-test-locally.yml |  2 +-
 .github/workflows/build_and_test.yml          | 18 ++++++++++--------
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml
index 42c32a23e3..7d47f78d6b 100644
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -283,7 +283,7 @@ jobs:
           submodules: true
 
       - name: Pytest regression tests
-        continue-on-error: ${{ matrix.lfc_state == 'with-lfc' }}
+        continue-on-error: ${{ matrix.lfc_state == 'with-lfc' && inputs.build-type == 'debug' }}
         uses: ./.github/actions/run-python-test-set
         timeout-minutes: 60
         with:
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 6023d1bb6f..ee22f2ff54 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -255,15 +255,17 @@ jobs:
       build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
       build-tag: ${{ needs.tag.outputs.build-tag }}
       build-type: ${{ matrix.build-type }}
-      # Run tests on all Postgres versions in release builds and only on the latest version in debug builds
-      # run without LFC on v17 release only
+      # Run tests on all Postgres versions in release builds and only on the latest version in debug builds.
+      # Run without LFC on v17 release and debug builds only. For all the other cases LFC is enabled. Failure on the
+      # debug build with LFC enabled doesn't block merging.
       test-cfg: |
-        ${{ matrix.build-type == 'release' && '[{"pg_version":"v14", "lfc_state": "without-lfc"},
-                                                {"pg_version":"v15", "lfc_state": "without-lfc"},
-                                                {"pg_version":"v16", "lfc_state": "without-lfc"},
-                                                {"pg_version":"v17", "lfc_state": "without-lfc"},
-                                                {"pg_version":"v17", "lfc_state": "with-lfc"}]'
-                                           || '[{"pg_version":"v17", "lfc_state": "without-lfc"}]' }}
+        ${{ matrix.build-type == 'release' && '[{"pg_version":"v14", "lfc_state": "with-lfc"},
+                                                {"pg_version":"v15", "lfc_state": "with-lfc"},
+                                                {"pg_version":"v16", "lfc_state": "with-lfc"},
+                                                {"pg_version":"v17", "lfc_state": "with-lfc"},
+                                                {"pg_version":"v17", "lfc_state": "without-lfc"}]'
+                                           || '[{"pg_version":"v17", "lfc_state": "without-lfc"},
+                                                {"pg_version":"v17", "lfc_state": "with-lfc" }]' }}
     secrets: inherit
 
   # Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking

From e4bb1ca7d82ed1d6663459fcfea3f6be78be00ea Mon Sep 17 00:00:00 2001
From: Arseny Sher <ars@neon.tech>
Date: Wed, 11 Dec 2024 18:46:50 +0300
Subject: [PATCH 108/117] Increase neon_local http client to compute timeout in
 reconfigure. (#10088)

Seems like 30s sometimes not enough when CI runners are overloaded,
causing pull_timeline flakiness.

ref
https://github.com/neondatabase/neon/issues/9731#issuecomment-2535946443
---
 control_plane/src/endpoint.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index 35067c95b6..1fdf326051 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -810,7 +810,7 @@ impl Endpoint {
         }
 
         let client = reqwest::Client::builder()
-            .timeout(Duration::from_secs(30))
+            .timeout(Duration::from_secs(120))
             .build()
             .unwrap();
         let response = client

From dee2041cd3a7bd74b732933890345cd354fd1c80 Mon Sep 17 00:00:00 2001
From: Mikhail Kot <mikhail@neon.tech>
Date: Wed, 11 Dec 2024 16:23:59 +0000
Subject: [PATCH 109/117] walproposer: fix link error on debian 12 / ubuntu 22
 (#10090)

## Problem

Linking walproposer library (e.g. `cargo t`) produces linker errors:
/home/myrrc/neon/pgxn/neon/walproposer_compat.c:169: undefined reference
to `pg_snprintf'

The library with these symbols (libpgcommon.a) is present

## Summary of changes

Changed order of libraries resolution for linker
---
 libs/walproposer/build.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libs/walproposer/build.rs b/libs/walproposer/build.rs
index 3f549889b8..8d5b1ade35 100644
--- a/libs/walproposer/build.rs
+++ b/libs/walproposer/build.rs
@@ -30,9 +30,9 @@ fn main() -> anyhow::Result<()> {
     let pgxn_neon = std::fs::canonicalize(pgxn_neon)?;
     let pgxn_neon = pgxn_neon.to_str().ok_or(anyhow!("Bad non-UTF path"))?;
 
+    println!("cargo:rustc-link-lib=static=walproposer");
     println!("cargo:rustc-link-lib=static=pgport");
     println!("cargo:rustc-link-lib=static=pgcommon");
-    println!("cargo:rustc-link-lib=static=walproposer");
     println!("cargo:rustc-link-search={walproposer_lib_search_str}");
 
     // Rebuild crate when libwalproposer.a changes

From ef233e91ef7446cbf70898aafdc98b8335c53c59 Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova <anastasia@neon.tech>
Date: Wed, 11 Dec 2024 16:43:26 +0000
Subject: [PATCH 110/117] Update compute_installed_extensions metric: (#9891)

add owned_by_superuser field to filter out system extensions.

While on it, also correct related code:
- fix the metric setting: use set() instead of inc() in a loop.
inc() is not idempotent and can lead to incorrect results
if the function called multiple times. Currently it is only called at
compute start, but this will change soon.
- fix the return type of the installed_extensions endpoint
to match the metric. Currently it is only used in the test.
---
 compute_tools/src/http/openapi_spec.yaml      |  6 +-
 compute_tools/src/installed_extensions.rs     | 55 ++++++++++++-------
 libs/compute_api/src/responses.rs             |  4 +-
 .../regress/test_installed_extensions.py      | 28 +++++-----
 4 files changed, 56 insertions(+), 37 deletions(-)

diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml
index 7b9a62c545..24a67cac71 100644
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -537,12 +537,14 @@ components:
             properties:
               extname:
                 type: string
-              versions:
-                type: array
+              version:
+                type: string
                 items:
                   type: string
               n_databases:
                 type: integer
+              owned_by_superuser:
+                type: integer
 
     SetRoleGrantsRequest:
       type: object
diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs
index 5f62f08858..0ab259ddf1 100644
--- a/compute_tools/src/installed_extensions.rs
+++ b/compute_tools/src/installed_extensions.rs
@@ -1,7 +1,6 @@
 use compute_api::responses::{InstalledExtension, InstalledExtensions};
 use metrics::proto::MetricFamily;
 use std::collections::HashMap;
-use std::collections::HashSet;
 
 use anyhow::Result;
 use postgres::{Client, NoTls};
@@ -38,61 +37,77 @@ fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
 /// Connect to every database (see list_dbs above) and get the list of installed extensions.
 ///
 /// Same extension can be installed in multiple databases with different versions,
-/// we only keep the highest and lowest version across all databases.
+/// so we report a separate metric (number of databases where it is installed)
+/// for each extension version.
 pub fn get_installed_extensions(mut conf: postgres::config::Config) -> Result<InstalledExtensions> {
     conf.application_name("compute_ctl:get_installed_extensions");
     let mut client = conf.connect(NoTls)?;
-
     let databases: Vec<String> = list_dbs(&mut client)?;
 
-    let mut extensions_map: HashMap<String, InstalledExtension> = HashMap::new();
+    let mut extensions_map: HashMap<(String, String, String), InstalledExtension> = HashMap::new();
     for db in databases.iter() {
         conf.dbname(db);
         let mut db_client = conf.connect(NoTls)?;
-        let extensions: Vec<(String, String)> = db_client
+        let extensions: Vec<(String, String, i32)> = db_client
             .query(
-                "SELECT extname, extversion FROM pg_catalog.pg_extension;",
+                "SELECT extname, extversion, extowner::integer FROM pg_catalog.pg_extension",
                 &[],
             )?
             .iter()
-            .map(|row| (row.get("extname"), row.get("extversion")))
+            .map(|row| {
+                (
+                    row.get("extname"),
+                    row.get("extversion"),
+                    row.get("extowner"),
+                )
+            })
             .collect();
 
-        for (extname, v) in extensions.iter() {
+        for (extname, v, extowner) in extensions.iter() {
             let version = v.to_string();
 
-            // increment the number of databases where the version of extension is installed
-            INSTALLED_EXTENSIONS
-                .with_label_values(&[extname, &version])
-                .inc();
+            // check if the extension is owned by superuser
+            // 10 is the oid of superuser
+            let owned_by_superuser = if *extowner == 10 { "1" } else { "0" };
 
             extensions_map
-                .entry(extname.to_string())
+                .entry((
+                    extname.to_string(),
+                    version.clone(),
+                    owned_by_superuser.to_string(),
+                ))
                 .and_modify(|e| {
-                    e.versions.insert(version.clone());
                     // count the number of databases where the extension is installed
                     e.n_databases += 1;
                 })
                 .or_insert(InstalledExtension {
                     extname: extname.to_string(),
-                    versions: HashSet::from([version.clone()]),
+                    version: version.clone(),
                     n_databases: 1,
+                    owned_by_superuser: owned_by_superuser.to_string(),
                 });
         }
     }
 
-    let res = InstalledExtensions {
-        extensions: extensions_map.into_values().collect(),
-    };
+    for (key, ext) in extensions_map.iter() {
+        let (extname, version, owned_by_superuser) = key;
+        let n_databases = ext.n_databases as u64;
 
-    Ok(res)
+        INSTALLED_EXTENSIONS
+            .with_label_values(&[extname, version, owned_by_superuser])
+            .set(n_databases);
+    }
+
+    Ok(InstalledExtensions {
+        extensions: extensions_map.into_values().collect(),
+    })
 }
 
 static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
     register_uint_gauge_vec!(
         "compute_installed_extensions",
         "Number of databases where the version of extension is installed",
-        &["extension_name", "version"]
+        &["extension_name", "version", "owned_by_superuser"]
     )
     .expect("failed to define a metric")
 });
diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs
index 79234be720..0d65f6a38d 100644
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -1,6 +1,5 @@
 //! Structs representing the JSON formats used in the compute_ctl's HTTP API.
 
-use std::collections::HashSet;
 use std::fmt::Display;
 
 use chrono::{DateTime, Utc};
@@ -163,8 +162,9 @@ pub enum ControlPlaneComputeStatus {
 #[derive(Clone, Debug, Default, Serialize)]
 pub struct InstalledExtension {
     pub extname: String,
-    pub versions: HashSet<String>,
+    pub version: String,
     pub n_databases: u32, // Number of databases using this extension
+    pub owned_by_superuser: String,
 }
 
 #[derive(Clone, Debug, Default, Serialize)]
diff --git a/test_runner/regress/test_installed_extensions.py b/test_runner/regress/test_installed_extensions.py
index 04ccec5875..4e51e7e10c 100644
--- a/test_runner/regress/test_installed_extensions.py
+++ b/test_runner/regress/test_installed_extensions.py
@@ -30,7 +30,7 @@ def test_installed_extensions(neon_simple_env: NeonEnv):
     info("Extensions: %s", res["extensions"])
     # 'plpgsql' is a default extension that is always installed.
     assert any(
-        ext["extname"] == "plpgsql" and ext["versions"] == ["1.0"] for ext in res["extensions"]
+        ext["extname"] == "plpgsql" and ext["version"] == "1.0" for ext in res["extensions"]
     ), "The 'plpgsql' extension is missing"
 
     # check that the neon_test_utils extension is not installed
@@ -63,7 +63,7 @@ def test_installed_extensions(neon_simple_env: NeonEnv):
     # and has the expected version
     assert any(
         ext["extname"] == "neon_test_utils"
-        and ext["versions"] == [neon_test_utils_version]
+        and ext["version"] == neon_test_utils_version
         and ext["n_databases"] == 1
         for ext in res["extensions"]
     )
@@ -75,9 +75,8 @@ def test_installed_extensions(neon_simple_env: NeonEnv):
     # check that the neon extension is installed and has expected versions
     for ext in res["extensions"]:
         if ext["extname"] == "neon":
-            assert ext["n_databases"] == 2
-            ext["versions"].sort()
-            assert ext["versions"] == ["1.1", "1.2"]
+            assert ext["version"] in ["1.1", "1.2"]
+            assert ext["n_databases"] == 1
 
     with pg_conn.cursor() as cur:
         cur.execute("ALTER EXTENSION neon UPDATE TO '1.3'")
@@ -90,9 +89,8 @@ def test_installed_extensions(neon_simple_env: NeonEnv):
     # check that the neon_test_utils extension is updated
     for ext in res["extensions"]:
         if ext["extname"] == "neon":
-            assert ext["n_databases"] == 2
-            ext["versions"].sort()
-            assert ext["versions"] == ["1.2", "1.3"]
+            assert ext["version"] in ["1.2", "1.3"]
+            assert ext["n_databases"] == 1
 
     # check that /metrics endpoint is available
     # ensure that we see the metric before and after restart
@@ -100,13 +98,15 @@ def test_installed_extensions(neon_simple_env: NeonEnv):
     info("Metrics: %s", res)
     m = parse_metrics(res)
     neon_m = m.query_all(
-        "compute_installed_extensions", {"extension_name": "neon", "version": "1.2"}
+        "compute_installed_extensions",
+        {"extension_name": "neon", "version": "1.2", "owned_by_superuser": "1"},
     )
     assert len(neon_m) == 1
     for sample in neon_m:
-        assert sample.value == 2
+        assert sample.value == 1
     neon_m = m.query_all(
-        "compute_installed_extensions", {"extension_name": "neon", "version": "1.3"}
+        "compute_installed_extensions",
+        {"extension_name": "neon", "version": "1.3", "owned_by_superuser": "1"},
     )
     assert len(neon_m) == 1
     for sample in neon_m:
@@ -138,14 +138,16 @@ def test_installed_extensions(neon_simple_env: NeonEnv):
         info("After restart metrics: %s", res)
         m = parse_metrics(res)
         neon_m = m.query_all(
-            "compute_installed_extensions", {"extension_name": "neon", "version": "1.2"}
+            "compute_installed_extensions",
+            {"extension_name": "neon", "version": "1.2", "owned_by_superuser": "1"},
         )
         assert len(neon_m) == 1
         for sample in neon_m:
             assert sample.value == 1
 
         neon_m = m.query_all(
-            "compute_installed_extensions", {"extension_name": "neon", "version": "1.3"}
+            "compute_installed_extensions",
+            {"extension_name": "neon", "version": "1.3", "owned_by_superuser": "1"},
         )
         assert len(neon_m) == 1
         for sample in neon_m:

From a3e80448e8af32f13bcfa3a605e7b9a22d74a01e Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Wed, 11 Dec 2024 19:16:33 +0000
Subject: [PATCH 111/117] pageserver/storcon: add patch endpoints for tenant
 config metrics  (#10020)

## Problem

Cplane and storage controller tenant config changes are not additive.
Any change overrides all existing tenant configs. This would be fine if
both did client side patching, but that's not the case.

Once this merges, we must update cplane to use the PATCH endpoint.

## Summary of changes

### High Level

Allow for patching of tenant configuration with a `PATCH
/v1/tenant/config` endpoint.
It takes the same data as it's PUT counterpart. For example the payload
below will update `gc_period` and unset `compaction_period`. All other
fields are left in their original state.
```
{
  "tenant_id": "1234",
  "gc_period": "10s",
  "compaction_period": null
}
```

### Low Level
* PS and storcon gain `PATCH /v1/tenant/config` endpoints. PS endpoint
is only used for cplane managed instances.
* `storcon_cli` is updated to have separate commands for
`set-tenant-config` and `patch-tenant-config`

Related https://github.com/neondatabase/cloud/issues/21043
---
 control_plane/src/pageserver.rs               |   2 +-
 control_plane/storcon_cli/src/main.rs         |  34 ++-
 libs/pageserver_api/src/models.rs             | 261 +++++++++++++++++-
 libs/utils/src/lib.rs                         |   2 +
 libs/utils/src/try_rcu.rs                     |  77 ++++++
 pageserver/client/src/mgmt_api.rs             |   8 +-
 pageserver/pagebench/src/cmd/aux_files.rs     |   2 +-
 pageserver/src/http/openapi_spec.yml          |  22 +-
 pageserver/src/http/routes.rs                 |  46 ++-
 pageserver/src/tenant.rs                      |  28 +-
 pageserver/src/tenant/config.rs               | 125 ++++++++-
 storage_controller/src/http.rs                |  33 ++-
 storage_controller/src/service.rs             |  80 +++++-
 test_runner/fixtures/pageserver/http.py       |  29 +-
 .../regress/test_disk_usage_eviction.py       |   4 +-
 .../regress/test_ingestion_layer_size.py      |   2 +-
 .../regress/test_layers_from_future.py        |   2 +-
 .../test_pageserver_crash_consistency.py      |   2 +-
 .../regress/test_storage_controller.py        |   4 +-
 test_runner/regress/test_tenant_conf.py       |  83 +++++-
 .../regress/test_threshold_based_eviction.py  |   2 +-
 .../regress/test_timeline_detach_ancestor.py  |   2 +-
 22 files changed, 800 insertions(+), 50 deletions(-)
 create mode 100644 libs/utils/src/try_rcu.rs

diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 1d1455b95b..9d3f018345 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -435,7 +435,7 @@ impl PageServerNode {
     ) -> anyhow::Result<()> {
         let config = Self::parse_config(settings)?;
         self.http_client
-            .tenant_config(&models::TenantConfigRequest { tenant_id, config })
+            .set_tenant_config(&models::TenantConfigRequest { tenant_id, config })
             .await?;
 
         Ok(())
diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs
index e879424532..df07216fde 100644
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -9,8 +9,8 @@ use pageserver_api::{
     },
     models::{
         EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
-        ShardParameters, TenantConfig, TenantConfigRequest, TenantShardSplitRequest,
-        TenantShardSplitResponse,
+        ShardParameters, TenantConfig, TenantConfigPatchRequest, TenantConfigRequest,
+        TenantShardSplitRequest, TenantShardSplitResponse,
     },
     shard::{ShardStripeSize, TenantShardId},
 };
@@ -116,9 +116,19 @@ enum Command {
         #[arg(long)]
         tenant_shard_id: TenantShardId,
     },
-    /// Modify the pageserver tenant configuration of a tenant: this is the configuration structure
+    /// Set the pageserver tenant configuration of a tenant: this is the configuration structure
     /// that is passed through to pageservers, and does not affect storage controller behavior.
-    TenantConfig {
+    /// Any previous tenant configs are overwritten.
+    SetTenantConfig {
+        #[arg(long)]
+        tenant_id: TenantId,
+        #[arg(long)]
+        config: String,
+    },
+    /// Patch the pageserver tenant configuration of a tenant. Any fields with null values in the
+    /// provided JSON are unset from the tenant config and all fields with non-null values are set.
+    /// Unspecified fields are not changed.
+    PatchTenantConfig {
         #[arg(long)]
         tenant_id: TenantId,
         #[arg(long)]
@@ -549,11 +559,21 @@ async fn main() -> anyhow::Result<()> {
                 )
                 .await?;
         }
-        Command::TenantConfig { tenant_id, config } => {
+        Command::SetTenantConfig { tenant_id, config } => {
             let tenant_conf = serde_json::from_str(&config)?;
 
             vps_client
-                .tenant_config(&TenantConfigRequest {
+                .set_tenant_config(&TenantConfigRequest {
+                    tenant_id,
+                    config: tenant_conf,
+                })
+                .await?;
+        }
+        Command::PatchTenantConfig { tenant_id, config } => {
+            let tenant_conf = serde_json::from_str(&config)?;
+
+            vps_client
+                .patch_tenant_config(&TenantConfigPatchRequest {
                     tenant_id,
                     config: tenant_conf,
                 })
@@ -736,7 +756,7 @@ async fn main() -> anyhow::Result<()> {
             threshold,
         } => {
             vps_client
-                .tenant_config(&TenantConfigRequest {
+                .set_tenant_config(&TenantConfigRequest {
                     tenant_id,
                     config: TenantConfig {
                         eviction_policy: Some(EvictionPolicy::LayerAccessThreshold(
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 5488f7b2c2..5690b643f0 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -17,7 +17,7 @@ use std::{
 
 use byteorder::{BigEndian, ReadBytesExt};
 use postgres_ffi::BLCKSZ;
-use serde::{Deserialize, Serialize};
+use serde::{Deserialize, Deserializer, Serialize, Serializer};
 use serde_with::serde_as;
 use utils::{
     completion,
@@ -325,6 +325,115 @@ impl Default for ShardParameters {
     }
 }
 
+#[derive(Debug, Default, Clone, Eq, PartialEq)]
+pub enum FieldPatch<T> {
+    Upsert(T),
+    Remove,
+    #[default]
+    Noop,
+}
+
+impl<T> FieldPatch<T> {
+    fn is_noop(&self) -> bool {
+        matches!(self, FieldPatch::Noop)
+    }
+
+    pub fn apply(self, target: &mut Option<T>) {
+        match self {
+            Self::Upsert(v) => *target = Some(v),
+            Self::Remove => *target = None,
+            Self::Noop => {}
+        }
+    }
+
+    pub fn map<U, E, F: FnOnce(T) -> Result<U, E>>(self, map: F) -> Result<FieldPatch<U>, E> {
+        match self {
+            Self::Upsert(v) => Ok(FieldPatch::<U>::Upsert(map(v)?)),
+            Self::Remove => Ok(FieldPatch::<U>::Remove),
+            Self::Noop => Ok(FieldPatch::<U>::Noop),
+        }
+    }
+}
+
+impl<'de, T: Deserialize<'de>> Deserialize<'de> for FieldPatch<T> {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: Deserializer<'de>,
+    {
+        Option::deserialize(deserializer).map(|opt| match opt {
+            None => FieldPatch::Remove,
+            Some(val) => FieldPatch::Upsert(val),
+        })
+    }
+}
+
+impl<T: Serialize> Serialize for FieldPatch<T> {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: Serializer,
+    {
+        match self {
+            FieldPatch::Upsert(val) => serializer.serialize_some(val),
+            FieldPatch::Remove => serializer.serialize_none(),
+            FieldPatch::Noop => unreachable!(),
+        }
+    }
+}
+
+#[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
+#[serde(default)]
+pub struct TenantConfigPatch {
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub checkpoint_distance: FieldPatch<u64>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub checkpoint_timeout: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub compaction_target_size: FieldPatch<u64>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub compaction_period: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub compaction_threshold: FieldPatch<usize>,
+    // defer parsing compaction_algorithm, like eviction_policy
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub compaction_algorithm: FieldPatch<CompactionAlgorithmSettings>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub gc_horizon: FieldPatch<u64>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub gc_period: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub image_creation_threshold: FieldPatch<usize>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub pitr_interval: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub walreceiver_connect_timeout: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub lagging_wal_timeout: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub max_lsn_wal_lag: FieldPatch<NonZeroU64>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub eviction_policy: FieldPatch<EvictionPolicy>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub min_resident_size_override: FieldPatch<u64>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub evictions_low_residence_duration_metric_threshold: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub heatmap_period: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub lazy_slru_download: FieldPatch<bool>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub timeline_get_throttle: FieldPatch<ThrottleConfig>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub image_layer_creation_check_threshold: FieldPatch<u8>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub lsn_lease_length: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub lsn_lease_length_for_ts: FieldPatch<String>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub timeline_offloading: FieldPatch<bool>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub wal_receiver_protocol_override: FieldPatch<PostgresClientProtocol>,
+}
+
 /// An alternative representation of `pageserver::tenant::TenantConf` with
 /// simpler types.
 #[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
@@ -356,6 +465,107 @@ pub struct TenantConfig {
     pub wal_receiver_protocol_override: Option<PostgresClientProtocol>,
 }
 
+impl TenantConfig {
+    pub fn apply_patch(self, patch: TenantConfigPatch) -> TenantConfig {
+        let Self {
+            mut checkpoint_distance,
+            mut checkpoint_timeout,
+            mut compaction_target_size,
+            mut compaction_period,
+            mut compaction_threshold,
+            mut compaction_algorithm,
+            mut gc_horizon,
+            mut gc_period,
+            mut image_creation_threshold,
+            mut pitr_interval,
+            mut walreceiver_connect_timeout,
+            mut lagging_wal_timeout,
+            mut max_lsn_wal_lag,
+            mut eviction_policy,
+            mut min_resident_size_override,
+            mut evictions_low_residence_duration_metric_threshold,
+            mut heatmap_period,
+            mut lazy_slru_download,
+            mut timeline_get_throttle,
+            mut image_layer_creation_check_threshold,
+            mut lsn_lease_length,
+            mut lsn_lease_length_for_ts,
+            mut timeline_offloading,
+            mut wal_receiver_protocol_override,
+        } = self;
+
+        patch.checkpoint_distance.apply(&mut checkpoint_distance);
+        patch.checkpoint_timeout.apply(&mut checkpoint_timeout);
+        patch
+            .compaction_target_size
+            .apply(&mut compaction_target_size);
+        patch.compaction_period.apply(&mut compaction_period);
+        patch.compaction_threshold.apply(&mut compaction_threshold);
+        patch.compaction_algorithm.apply(&mut compaction_algorithm);
+        patch.gc_horizon.apply(&mut gc_horizon);
+        patch.gc_period.apply(&mut gc_period);
+        patch
+            .image_creation_threshold
+            .apply(&mut image_creation_threshold);
+        patch.pitr_interval.apply(&mut pitr_interval);
+        patch
+            .walreceiver_connect_timeout
+            .apply(&mut walreceiver_connect_timeout);
+        patch.lagging_wal_timeout.apply(&mut lagging_wal_timeout);
+        patch.max_lsn_wal_lag.apply(&mut max_lsn_wal_lag);
+        patch.eviction_policy.apply(&mut eviction_policy);
+        patch
+            .min_resident_size_override
+            .apply(&mut min_resident_size_override);
+        patch
+            .evictions_low_residence_duration_metric_threshold
+            .apply(&mut evictions_low_residence_duration_metric_threshold);
+        patch.heatmap_period.apply(&mut heatmap_period);
+        patch.lazy_slru_download.apply(&mut lazy_slru_download);
+        patch
+            .timeline_get_throttle
+            .apply(&mut timeline_get_throttle);
+        patch
+            .image_layer_creation_check_threshold
+            .apply(&mut image_layer_creation_check_threshold);
+        patch.lsn_lease_length.apply(&mut lsn_lease_length);
+        patch
+            .lsn_lease_length_for_ts
+            .apply(&mut lsn_lease_length_for_ts);
+        patch.timeline_offloading.apply(&mut timeline_offloading);
+        patch
+            .wal_receiver_protocol_override
+            .apply(&mut wal_receiver_protocol_override);
+
+        Self {
+            checkpoint_distance,
+            checkpoint_timeout,
+            compaction_target_size,
+            compaction_period,
+            compaction_threshold,
+            compaction_algorithm,
+            gc_horizon,
+            gc_period,
+            image_creation_threshold,
+            pitr_interval,
+            walreceiver_connect_timeout,
+            lagging_wal_timeout,
+            max_lsn_wal_lag,
+            eviction_policy,
+            min_resident_size_override,
+            evictions_low_residence_duration_metric_threshold,
+            heatmap_period,
+            lazy_slru_download,
+            timeline_get_throttle,
+            image_layer_creation_check_threshold,
+            lsn_lease_length,
+            lsn_lease_length_for_ts,
+            timeline_offloading,
+            wal_receiver_protocol_override,
+        }
+    }
+}
+
 /// The policy for the aux file storage.
 ///
 /// It can be switched through `switch_aux_file_policy` tenant config.
@@ -686,6 +896,14 @@ impl TenantConfigRequest {
     }
 }
 
+#[derive(Serialize, Deserialize, Debug)]
+#[serde(deny_unknown_fields)]
+pub struct TenantConfigPatchRequest {
+    pub tenant_id: TenantId,
+    #[serde(flatten)]
+    pub config: TenantConfigPatch, // as we have a flattened field, we should reject all unknown fields in it
+}
+
 /// See [`TenantState::attachment_status`] and the OpenAPI docs for context.
 #[derive(Serialize, Deserialize, Clone)]
 #[serde(tag = "slug", content = "data", rename_all = "snake_case")]
@@ -1699,4 +1917,45 @@ mod tests {
             );
         }
     }
+
+    #[test]
+    fn test_tenant_config_patch_request_serde() {
+        let patch_request = TenantConfigPatchRequest {
+            tenant_id: TenantId::from_str("17c6d121946a61e5ab0fe5a2fd4d8215").unwrap(),
+            config: TenantConfigPatch {
+                checkpoint_distance: FieldPatch::Upsert(42),
+                gc_horizon: FieldPatch::Remove,
+                compaction_threshold: FieldPatch::Noop,
+                ..TenantConfigPatch::default()
+            },
+        };
+
+        let json = serde_json::to_string(&patch_request).unwrap();
+
+        let expected = r#"{"tenant_id":"17c6d121946a61e5ab0fe5a2fd4d8215","checkpoint_distance":42,"gc_horizon":null}"#;
+        assert_eq!(json, expected);
+
+        let decoded: TenantConfigPatchRequest = serde_json::from_str(&json).unwrap();
+        assert_eq!(decoded.tenant_id, patch_request.tenant_id);
+        assert_eq!(decoded.config, patch_request.config);
+
+        // Now apply the patch to a config to demonstrate semantics
+
+        let base = TenantConfig {
+            checkpoint_distance: Some(28),
+            gc_horizon: Some(100),
+            compaction_target_size: Some(1024),
+            ..Default::default()
+        };
+
+        let expected = TenantConfig {
+            checkpoint_distance: Some(42),
+            gc_horizon: None,
+            ..base.clone()
+        };
+
+        let patched = base.apply_patch(decoded.config);
+
+        assert_eq!(patched, expected);
+    }
 }
diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs
index d9b82b20da..bccd0e0488 100644
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -94,6 +94,8 @@ pub mod toml_edit_ext;
 
 pub mod circuit_breaker;
 
+pub mod try_rcu;
+
 // Re-export used in macro. Avoids adding git-version as dep in target crates.
 #[doc(hidden)]
 pub use git_version;
diff --git a/libs/utils/src/try_rcu.rs b/libs/utils/src/try_rcu.rs
new file mode 100644
index 0000000000..6b53ab1316
--- /dev/null
+++ b/libs/utils/src/try_rcu.rs
@@ -0,0 +1,77 @@
+//! Try RCU extension lifted from <https://github.com/vorner/arc-swap/issues/94#issuecomment-1987154023>
+
+pub trait ArcSwapExt<T> {
+    /// [`ArcSwap::rcu`](arc_swap::ArcSwap::rcu), but with Result that short-circuits on error.
+    fn try_rcu<R, F, E>(&self, f: F) -> Result<T, E>
+    where
+        F: FnMut(&T) -> Result<R, E>,
+        R: Into<T>;
+}
+
+impl<T, S> ArcSwapExt<T> for arc_swap::ArcSwapAny<T, S>
+where
+    T: arc_swap::RefCnt,
+    S: arc_swap::strategy::CaS<T>,
+{
+    fn try_rcu<R, F, E>(&self, mut f: F) -> Result<T, E>
+    where
+        F: FnMut(&T) -> Result<R, E>,
+        R: Into<T>,
+    {
+        fn ptr_eq<Base, A, B>(a: A, b: B) -> bool
+        where
+            A: arc_swap::AsRaw<Base>,
+            B: arc_swap::AsRaw<Base>,
+        {
+            let a = a.as_raw();
+            let b = b.as_raw();
+            std::ptr::eq(a, b)
+        }
+
+        let mut cur = self.load();
+        loop {
+            let new = f(&cur)?.into();
+            let prev = self.compare_and_swap(&*cur, new);
+            let swapped = ptr_eq(&*cur, &*prev);
+            if swapped {
+                return Ok(arc_swap::Guard::into_inner(prev));
+            } else {
+                cur = prev;
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arc_swap::ArcSwap;
+    use std::sync::Arc;
+
+    #[test]
+    fn test_try_rcu_success() {
+        let swap = ArcSwap::from(Arc::new(42));
+
+        let result = swap.try_rcu(|value| -> Result<_, String> { Ok(**value + 1) });
+
+        assert!(result.is_ok());
+        assert_eq!(**swap.load(), 43);
+    }
+
+    #[test]
+    fn test_try_rcu_error() {
+        let swap = ArcSwap::from(Arc::new(42));
+
+        let result = swap.try_rcu(|value| -> Result<i32, _> {
+            if **value == 42 {
+                Err("err")
+            } else {
+                Ok(**value + 1)
+            }
+        });
+
+        assert!(result.is_err());
+        assert_eq!(result.unwrap_err(), "err");
+        assert_eq!(**swap.load(), 42);
+    }
+}
diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index c3a1ef8140..4e9b11879d 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -270,12 +270,18 @@ impl Client {
         Ok(body)
     }
 
-    pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
+    pub async fn set_tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
         let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
         self.request(Method::PUT, &uri, req).await?;
         Ok(())
     }
 
+    pub async fn patch_tenant_config(&self, req: &TenantConfigPatchRequest) -> Result<()> {
+        let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
+        self.request(Method::PATCH, &uri, req).await?;
+        Ok(())
+    }
+
     pub async fn tenant_secondary_download(
         &self,
         tenant_id: TenantShardId,
diff --git a/pageserver/pagebench/src/cmd/aux_files.rs b/pageserver/pagebench/src/cmd/aux_files.rs
index 923a7f1f18..b869a0c6c7 100644
--- a/pageserver/pagebench/src/cmd/aux_files.rs
+++ b/pageserver/pagebench/src/cmd/aux_files.rs
@@ -64,7 +64,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
     println!("operating on timeline {}", timeline);
 
     mgmt_api_client
-        .tenant_config(&TenantConfigRequest {
+        .set_tenant_config(&TenantConfigRequest {
             tenant_id: timeline.tenant_id,
             config: TenantConfig::default(),
         })
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index 7fb9247feb..ee43440534 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -767,7 +767,27 @@ paths:
   /v1/tenant/config:
     put:
       description: |
-        Update tenant's config.
+        Update tenant's config by setting it to the provided value
+
+        Invalid fields in the tenant config will cause the request to be rejected with status 400.
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/TenantConfigRequest"
+      responses:
+        "200":
+          description: OK
+          content:
+            application/json:
+              schema:
+                type: array
+                items:
+                  $ref: "#/components/schemas/TenantInfo"
+    patch:
+      description: |
+        Update tenant's config additively by patching the updated fields provided.
+        Null values unset the field and non-null values upsert it.
 
         Invalid fields in the tenant config will cause the request to be rejected with status 400.
       requestBody:
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 75d25d0a6a..6e9ee976f4 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -28,6 +28,7 @@ use pageserver_api::models::LsnLease;
 use pageserver_api::models::LsnLeaseRequest;
 use pageserver_api::models::OffloadedTimelineInfo;
 use pageserver_api::models::ShardParameters;
+use pageserver_api::models::TenantConfigPatchRequest;
 use pageserver_api::models::TenantDetails;
 use pageserver_api::models::TenantLocationConfigRequest;
 use pageserver_api::models::TenantLocationConfigResponse;
@@ -1695,7 +1696,47 @@ async fn update_tenant_config_handler(
     crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
         .await
         .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
-    tenant.set_new_tenant_config(new_tenant_conf);
+
+    let _ = tenant
+        .update_tenant_config(|_crnt| Ok(new_tenant_conf.clone()))
+        .expect("Closure returns Ok()");
+
+    json_response(StatusCode::OK, ())
+}
+
+async fn patch_tenant_config_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let request_data: TenantConfigPatchRequest = json_request(&mut request).await?;
+    let tenant_id = request_data.tenant_id;
+    check_permission(&request, Some(tenant_id))?;
+
+    let state = get_state(&request);
+
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+
+    let tenant = state
+        .tenant_manager
+        .get_attached_tenant_shard(tenant_shard_id)?;
+    tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
+
+    let updated = tenant
+        .update_tenant_config(|crnt| crnt.apply_patch(request_data.config.clone()))
+        .map_err(ApiError::BadRequest)?;
+
+    // This is a legacy API that only operates on attached tenants: the preferred
+    // API to use is the location_config/ endpoint, which lets the caller provide
+    // the full LocationConf.
+    let location_conf = LocationConf::attached_single(
+        updated,
+        tenant.get_generation(),
+        &ShardParameters::default(),
+    );
+
+    crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
+        .await
+        .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
 
     json_response(StatusCode::OK, ())
 }
@@ -3288,6 +3329,9 @@ pub fn make_router(
         .get("/v1/tenant/:tenant_shard_id/synthetic_size", |r| {
             api_handler(r, tenant_size_handler)
         })
+        .patch("/v1/tenant/config", |r| {
+            api_handler(r, patch_tenant_config_handler)
+        })
         .put("/v1/tenant/config", |r| {
             api_handler(r, update_tenant_config_handler)
         })
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 54fa95fc47..92078e4b08 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -68,6 +68,7 @@ use utils::sync::gate::Gate;
 use utils::sync::gate::GateGuard;
 use utils::timeout::timeout_cancellable;
 use utils::timeout::TimeoutCancellableError;
+use utils::try_rcu::ArcSwapExt;
 use utils::zstd::create_zst_tarball;
 use utils::zstd::extract_zst_tarball;
 
@@ -3921,25 +3922,28 @@ impl Tenant {
         }
     }
 
-    pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
+    pub fn update_tenant_config<F: Fn(TenantConfOpt) -> anyhow::Result<TenantConfOpt>>(
+        &self,
+        update: F,
+    ) -> anyhow::Result<TenantConfOpt> {
         // Use read-copy-update in order to avoid overwriting the location config
         // state if this races with [`Tenant::set_new_location_config`]. Note that
         // this race is not possible if both request types come from the storage
         // controller (as they should!) because an exclusive op lock is required
         // on the storage controller side.
 
-        self.tenant_conf.rcu(|inner| {
-            Arc::new(AttachedTenantConf {
-                tenant_conf: new_tenant_conf.clone(),
-                location: inner.location,
-                // Attached location is not changed, no need to update lsn lease deadline.
-                lsn_lease_deadline: inner.lsn_lease_deadline,
-            })
-        });
+        self.tenant_conf
+            .try_rcu(|attached_conf| -> Result<_, anyhow::Error> {
+                Ok(Arc::new(AttachedTenantConf {
+                    tenant_conf: update(attached_conf.tenant_conf.clone())?,
+                    location: attached_conf.location,
+                    lsn_lease_deadline: attached_conf.lsn_lease_deadline,
+                }))
+            })?;
 
-        let updated = self.tenant_conf.load().clone();
+        let updated = self.tenant_conf.load();
 
-        self.tenant_conf_updated(&new_tenant_conf);
+        self.tenant_conf_updated(&updated.tenant_conf);
         // Don't hold self.timelines.lock() during the notifies.
         // There's no risk of deadlock right now, but there could be if we consolidate
         // mutexes in struct Timeline in the future.
@@ -3947,6 +3951,8 @@ impl Tenant {
         for timeline in timelines {
             timeline.tenant_conf_updated(&updated);
         }
+
+        Ok(updated.tenant_conf.clone())
     }
 
     pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) {
diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs
index 5d3ac5a8e3..d54dded778 100644
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -11,7 +11,7 @@
 pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf;
 use pageserver_api::models::CompactionAlgorithmSettings;
 use pageserver_api::models::EvictionPolicy;
-use pageserver_api::models::{self, ThrottleConfig};
+use pageserver_api::models::{self, TenantConfigPatch, ThrottleConfig};
 use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
 use serde::de::IntoDeserializer;
 use serde::{Deserialize, Serialize};
@@ -427,6 +427,129 @@ impl TenantConfOpt {
                 .or(global_conf.wal_receiver_protocol_override),
         }
     }
+
+    pub fn apply_patch(self, patch: TenantConfigPatch) -> anyhow::Result<TenantConfOpt> {
+        let Self {
+            mut checkpoint_distance,
+            mut checkpoint_timeout,
+            mut compaction_target_size,
+            mut compaction_period,
+            mut compaction_threshold,
+            mut compaction_algorithm,
+            mut gc_horizon,
+            mut gc_period,
+            mut image_creation_threshold,
+            mut pitr_interval,
+            mut walreceiver_connect_timeout,
+            mut lagging_wal_timeout,
+            mut max_lsn_wal_lag,
+            mut eviction_policy,
+            mut min_resident_size_override,
+            mut evictions_low_residence_duration_metric_threshold,
+            mut heatmap_period,
+            mut lazy_slru_download,
+            mut timeline_get_throttle,
+            mut image_layer_creation_check_threshold,
+            mut lsn_lease_length,
+            mut lsn_lease_length_for_ts,
+            mut timeline_offloading,
+            mut wal_receiver_protocol_override,
+        } = self;
+
+        patch.checkpoint_distance.apply(&mut checkpoint_distance);
+        patch
+            .checkpoint_timeout
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut checkpoint_timeout);
+        patch
+            .compaction_target_size
+            .apply(&mut compaction_target_size);
+        patch
+            .compaction_period
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut compaction_period);
+        patch.compaction_threshold.apply(&mut compaction_threshold);
+        patch.compaction_algorithm.apply(&mut compaction_algorithm);
+        patch.gc_horizon.apply(&mut gc_horizon);
+        patch
+            .gc_period
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut gc_period);
+        patch
+            .image_creation_threshold
+            .apply(&mut image_creation_threshold);
+        patch
+            .pitr_interval
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut pitr_interval);
+        patch
+            .walreceiver_connect_timeout
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut walreceiver_connect_timeout);
+        patch
+            .lagging_wal_timeout
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut lagging_wal_timeout);
+        patch.max_lsn_wal_lag.apply(&mut max_lsn_wal_lag);
+        patch.eviction_policy.apply(&mut eviction_policy);
+        patch
+            .min_resident_size_override
+            .apply(&mut min_resident_size_override);
+        patch
+            .evictions_low_residence_duration_metric_threshold
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut evictions_low_residence_duration_metric_threshold);
+        patch
+            .heatmap_period
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut heatmap_period);
+        patch.lazy_slru_download.apply(&mut lazy_slru_download);
+        patch
+            .timeline_get_throttle
+            .apply(&mut timeline_get_throttle);
+        patch
+            .image_layer_creation_check_threshold
+            .apply(&mut image_layer_creation_check_threshold);
+        patch
+            .lsn_lease_length
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut lsn_lease_length);
+        patch
+            .lsn_lease_length_for_ts
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut lsn_lease_length_for_ts);
+        patch.timeline_offloading.apply(&mut timeline_offloading);
+        patch
+            .wal_receiver_protocol_override
+            .apply(&mut wal_receiver_protocol_override);
+
+        Ok(Self {
+            checkpoint_distance,
+            checkpoint_timeout,
+            compaction_target_size,
+            compaction_period,
+            compaction_threshold,
+            compaction_algorithm,
+            gc_horizon,
+            gc_period,
+            image_creation_threshold,
+            pitr_interval,
+            walreceiver_connect_timeout,
+            lagging_wal_timeout,
+            max_lsn_wal_lag,
+            eviction_policy,
+            min_resident_size_override,
+            evictions_low_residence_duration_metric_threshold,
+            heatmap_period,
+            lazy_slru_download,
+            timeline_get_throttle,
+            image_layer_creation_check_threshold,
+            lsn_lease_length,
+            lsn_lease_length_for_ts,
+            timeline_offloading,
+            wal_receiver_protocol_override,
+        })
+    }
 }
 
 impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index 39e078ba7c..dce5380aa0 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -18,8 +18,9 @@ use pageserver_api::controller_api::{
     ShardsPreferredAzsRequest, TenantCreateRequest,
 };
 use pageserver_api::models::{
-    TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
-    TenantTimeTravelRequest, TimelineArchivalConfigRequest, TimelineCreateRequest,
+    TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest,
+    TenantShardSplitRequest, TenantTimeTravelRequest, TimelineArchivalConfigRequest,
+    TimelineCreateRequest,
 };
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::{mgmt_api, BlockUnblock};
@@ -208,6 +209,27 @@ async fn handle_tenant_location_config(
     )
 }
 
+async fn handle_tenant_config_patch(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::PageServerApi)?;
+
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
+    let config_req = json_request::<TenantConfigPatchRequest>(&mut req).await?;
+
+    json_response(
+        StatusCode::OK,
+        service.tenant_config_patch(config_req).await?,
+    )
+}
+
 async fn handle_tenant_config_set(
     service: Arc<Service>,
     req: Request<Body>,
@@ -1863,6 +1885,13 @@ pub fn make_router(
         .delete("/v1/tenant/:tenant_id", |r| {
             tenant_service_handler(r, handle_tenant_delete, RequestName("v1_tenant"))
         })
+        .patch("/v1/tenant/config", |r| {
+            tenant_service_handler(
+                r,
+                handle_tenant_config_patch,
+                RequestName("v1_tenant_config"),
+            )
+        })
         .put("/v1/tenant/config", |r| {
             tenant_service_handler(r, handle_tenant_config_set, RequestName("v1_tenant_config"))
         })
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 7e4ee53b4c..e82e84fe89 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -52,8 +52,8 @@ use pageserver_api::{
         TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse,
     },
     models::{
-        SecondaryProgress, TenantConfigRequest, TimelineArchivalConfigRequest,
-        TopTenantShardsRequest,
+        SecondaryProgress, TenantConfigPatchRequest, TenantConfigRequest,
+        TimelineArchivalConfigRequest, TopTenantShardsRequest,
     },
 };
 use reqwest::StatusCode;
@@ -139,6 +139,7 @@ enum TenantOperations {
     Create,
     LocationConfig,
     ConfigSet,
+    ConfigPatch,
     TimeTravelRemoteStorage,
     Delete,
     UpdatePolicy,
@@ -2602,6 +2603,55 @@ impl Service {
         Ok(result)
     }
 
+    pub(crate) async fn tenant_config_patch(
+        &self,
+        req: TenantConfigPatchRequest,
+    ) -> Result<(), ApiError> {
+        let _tenant_lock = trace_exclusive_lock(
+            &self.tenant_op_locks,
+            req.tenant_id,
+            TenantOperations::ConfigPatch,
+        )
+        .await;
+
+        let tenant_id = req.tenant_id;
+        let patch = req.config;
+
+        let base = {
+            let locked = self.inner.read().unwrap();
+            let shards = locked
+                .tenants
+                .range(TenantShardId::tenant_range(req.tenant_id));
+
+            let mut configs = shards.map(|(_sid, shard)| &shard.config).peekable();
+
+            let first = match configs.peek() {
+                Some(first) => (*first).clone(),
+                None => {
+                    return Err(ApiError::NotFound(
+                        anyhow::anyhow!("Tenant {} not found", req.tenant_id).into(),
+                    ));
+                }
+            };
+
+            if !configs.all_equal() {
+                tracing::error!("Tenant configs for {} are mismatched. ", req.tenant_id);
+                // This can't happen because we atomically update the database records
+                // of all shards to the new value in [`Self::set_tenant_config_and_reconcile`].
+                return Err(ApiError::InternalServerError(anyhow::anyhow!(
+                    "Tenant configs for {} are mismatched",
+                    req.tenant_id
+                )));
+            }
+
+            first
+        };
+
+        let updated_config = base.apply_patch(patch);
+        self.set_tenant_config_and_reconcile(tenant_id, updated_config)
+            .await
+    }
+
     pub(crate) async fn tenant_config_set(&self, req: TenantConfigRequest) -> Result<(), ApiError> {
         // We require an exclusive lock, because we are updating persistent and in-memory state
         let _tenant_lock = trace_exclusive_lock(
@@ -2611,12 +2661,32 @@ impl Service {
         )
         .await;
 
-        let tenant_id = req.tenant_id;
-        let config = req.config;
+        let tenant_exists = {
+            let locked = self.inner.read().unwrap();
+            let mut r = locked
+                .tenants
+                .range(TenantShardId::tenant_range(req.tenant_id));
+            r.next().is_some()
+        };
 
+        if !tenant_exists {
+            return Err(ApiError::NotFound(
+                anyhow::anyhow!("Tenant {} not found", req.tenant_id).into(),
+            ));
+        }
+
+        self.set_tenant_config_and_reconcile(req.tenant_id, req.config)
+            .await
+    }
+
+    async fn set_tenant_config_and_reconcile(
+        &self,
+        tenant_id: TenantId,
+        config: TenantConfig,
+    ) -> Result<(), ApiError> {
         self.persistence
             .update_tenant_shard(
-                TenantFilter::Tenant(req.tenant_id),
+                TenantFilter::Tenant(tenant_id),
                 None,
                 Some(config.clone()),
                 None,
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 0832eac22f..eabdeb1053 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -488,7 +488,20 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         )
         self.verbose_error(res)
 
-    def patch_tenant_config_client_side(
+    def patch_tenant_config(self, tenant_id: TenantId | TenantShardId, updates: dict[str, Any]):
+        """
+        Only use this via storage_controller.pageserver_api().
+
+        See `set_tenant_config` for more information.
+        """
+        assert "tenant_id" not in updates.keys()
+        res = self.patch(
+            f"http://localhost:{self.port}/v1/tenant/config",
+            json={**updates, "tenant_id": str(tenant_id)},
+        )
+        self.verbose_error(res)
+
+    def update_tenant_config(
         self,
         tenant_id: TenantId,
         inserts: dict[str, Any] | None = None,
@@ -499,13 +512,13 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
 
         See `set_tenant_config` for more information.
         """
-        current = self.tenant_config(tenant_id).tenant_specific_overrides
-        if inserts is not None:
-            current.update(inserts)
-        if removes is not None:
-            for key in removes:
-                del current[key]
-        self.set_tenant_config(tenant_id, current)
+        if inserts is None:
+            inserts = {}
+        if removes is None:
+            removes = []
+
+        patch = inserts | {remove: None for remove in removes}
+        self.patch_tenant_config(tenant_id, patch)
 
     def tenant_size(self, tenant_id: TenantId | TenantShardId) -> int:
         return self.tenant_size_and_modelinputs(tenant_id)[0]
diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py
index 954db914b9..7abcdb3838 100644
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -460,10 +460,10 @@ def test_pageserver_respects_overridden_resident_size(
     assert (
         du_by_timeline[large_tenant] > min_resident_size
     ), "ensure the larger tenant will get a haircut"
-    env.neon_env.storage_controller.pageserver_api().patch_tenant_config_client_side(
+    env.neon_env.storage_controller.pageserver_api().update_tenant_config(
         small_tenant[0], {"min_resident_size_override": min_resident_size}
     )
-    env.neon_env.storage_controller.pageserver_api().patch_tenant_config_client_side(
+    env.neon_env.storage_controller.pageserver_api().update_tenant_config(
         large_tenant[0], {"min_resident_size_override": min_resident_size}
     )
 
diff --git a/test_runner/regress/test_ingestion_layer_size.py b/test_runner/regress/test_ingestion_layer_size.py
index 9c9bc5b519..7e99d4b2f2 100644
--- a/test_runner/regress/test_ingestion_layer_size.py
+++ b/test_runner/regress/test_ingestion_layer_size.py
@@ -74,7 +74,7 @@ def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder):
     print_layer_size_histogram(post_ingest)
 
     # since all we have are L0s, we should be getting nice L1s and images out of them now
-    env.storage_controller.pageserver_api().patch_tenant_config_client_side(
+    env.storage_controller.pageserver_api().update_tenant_config(
         env.initial_tenant,
         {
             "compaction_threshold": 1,
diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py
index 8818b40712..5e06a1d47f 100644
--- a/test_runner/regress/test_layers_from_future.py
+++ b/test_runner/regress/test_layers_from_future.py
@@ -132,7 +132,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder, attach_mode: str):
     ), "sanity check for what above loop is supposed to do"
 
     # create the image layer from the future
-    env.storage_controller.pageserver_api().patch_tenant_config_client_side(
+    env.storage_controller.pageserver_api().update_tenant_config(
         tenant_id, {"image_creation_threshold": image_creation_threshold}, None
     )
     assert ps_http.tenant_config(tenant_id).effective_config["image_creation_threshold"] == 1
diff --git a/test_runner/regress/test_pageserver_crash_consistency.py b/test_runner/regress/test_pageserver_crash_consistency.py
index fcae7983f4..e9eee2760e 100644
--- a/test_runner/regress/test_pageserver_crash_consistency.py
+++ b/test_runner/regress/test_pageserver_crash_consistency.py
@@ -46,7 +46,7 @@ def test_local_only_layers_after_crash(neon_env_builder: NeonEnvBuilder, pg_bin:
     for sk in env.safekeepers:
         sk.stop()
 
-    env.storage_controller.pageserver_api().patch_tenant_config_client_side(
+    env.storage_controller.pageserver_api().update_tenant_config(
         tenant_id, {"compaction_threshold": 3}
     )
     # hit the exit failpoint
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 9f74dcccb9..4d1784d45a 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -1768,7 +1768,7 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder):
     # Modify a tenant's config
     storcon_cli(
         [
-            "tenant-config",
+            "patch-tenant-config",
             "--tenant-id",
             str(env.initial_tenant),
             "--config",
@@ -2403,7 +2403,7 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder):
 
     # Make a change to the tenant config to trigger a slow reconcile
     virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
-    virtual_ps_http.patch_tenant_config_client_side(tid, {"compaction_threshold": 5}, None)
+    virtual_ps_http.update_tenant_config(tid, {"compaction_threshold": 5}, None)
     env.storage_controller.allowed_errors.extend(
         [
             ".*Accepted configuration update but reconciliation failed.*",
diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py
index f8f240cfdc..0c2d535af4 100644
--- a/test_runner/regress/test_tenant_conf.py
+++ b/test_runner/regress/test_tenant_conf.py
@@ -3,13 +3,14 @@ from __future__ import annotations
 import json
 from typing import TYPE_CHECKING
 
+import pytest
 from fixtures.common_types import Lsn
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
 )
 from fixtures.pageserver.utils import assert_tenant_state, wait_for_upload
 from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
-from fixtures.utils import wait_until
+from fixtures.utils import run_only_on_default_postgres, wait_until
 from fixtures.workload import Workload
 
 if TYPE_CHECKING:
@@ -330,3 +331,83 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold(
     metric = get_metric()
     assert int(metric.labels["low_threshold_secs"]) == 24 * 60 * 60, "label resets to default"
     assert int(metric.value) == 0, "value resets to default"
+
+
+@run_only_on_default_postgres("Test does not start a compute")
+@pytest.mark.parametrize("ps_managed_by", ["storcon", "cplane"])
+def test_tenant_config_patch(neon_env_builder: NeonEnvBuilder, ps_managed_by: str):
+    """
+    Test tenant config patching (i.e. additive updates)
+
+    The flow is different for storage controller and cplane managed pageserver.
+    1. Storcon managed: /v1/tenant/config request lands on storcon, which generates
+    location_config calls containing the update to the pageserver
+    2. Cplane managed: /v1/tenant/config is called directly on the pageserver
+    """
+
+    def assert_tenant_conf_semantically_equal(lhs, rhs):
+        """
+        Storcon returns None for fields that are not set while the pageserver does not.
+        Compare two tenant's config overrides semantically, by dropping the None values.
+        """
+        lhs = {k: v for k, v in lhs.items() if v is not None}
+        rhs = {k: v for k, v in rhs.items() if v is not None}
+
+        assert lhs == rhs
+
+    env = neon_env_builder.init_start()
+
+    if ps_managed_by == "storcon":
+        api = env.storage_controller.pageserver_api()
+    elif ps_managed_by == "cplane":
+        # Disallow storcon from sending location_configs to the pageserver.
+        # These would overwrite the manually set tenant configs.
+        env.storage_controller.reconcile_until_idle()
+        env.storage_controller.tenant_policy_update(env.initial_tenant, {"scheduling": "Stop"})
+        env.storage_controller.allowed_errors.append(".*Scheduling is disabled by policy Stop.*")
+
+        api = env.pageserver.http_client()
+    else:
+        raise Exception(f"Unexpected value of ps_managed_by param: {ps_managed_by}")
+
+    crnt_tenant_conf = api.tenant_config(env.initial_tenant).tenant_specific_overrides
+
+    patch: dict[str, Any | None] = {
+        "gc_period": "3h",
+        "wal_receiver_protocol_override": {
+            "type": "interpreted",
+            "args": {"format": "bincode", "compression": {"zstd": {"level": 1}}},
+        },
+    }
+    api.patch_tenant_config(env.initial_tenant, patch)
+    tenant_conf_after_patch = api.tenant_config(env.initial_tenant).tenant_specific_overrides
+    if ps_managed_by == "storcon":
+        # Check that the config was propagated to the PS.
+        overrides_on_ps = (
+            env.pageserver.http_client().tenant_config(env.initial_tenant).tenant_specific_overrides
+        )
+        assert_tenant_conf_semantically_equal(overrides_on_ps, tenant_conf_after_patch)
+    assert_tenant_conf_semantically_equal(tenant_conf_after_patch, crnt_tenant_conf | patch)
+    crnt_tenant_conf = tenant_conf_after_patch
+
+    patch = {"gc_period": "5h", "wal_receiver_protocol_override": None}
+    api.patch_tenant_config(env.initial_tenant, patch)
+    tenant_conf_after_patch = api.tenant_config(env.initial_tenant).tenant_specific_overrides
+    if ps_managed_by == "storcon":
+        overrides_on_ps = (
+            env.pageserver.http_client().tenant_config(env.initial_tenant).tenant_specific_overrides
+        )
+        assert_tenant_conf_semantically_equal(overrides_on_ps, tenant_conf_after_patch)
+    assert_tenant_conf_semantically_equal(tenant_conf_after_patch, crnt_tenant_conf | patch)
+    crnt_tenant_conf = tenant_conf_after_patch
+
+    put = {"pitr_interval": "1m 1s"}
+    api.set_tenant_config(env.initial_tenant, put)
+    tenant_conf_after_put = api.tenant_config(env.initial_tenant).tenant_specific_overrides
+    if ps_managed_by == "storcon":
+        overrides_on_ps = (
+            env.pageserver.http_client().tenant_config(env.initial_tenant).tenant_specific_overrides
+        )
+        assert_tenant_conf_semantically_equal(overrides_on_ps, tenant_conf_after_put)
+    assert_tenant_conf_semantically_equal(tenant_conf_after_put, put)
+    crnt_tenant_conf = tenant_conf_after_put
diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py
index 68e9385035..bedbd84aee 100644
--- a/test_runner/regress/test_threshold_based_eviction.py
+++ b/test_runner/regress/test_threshold_based_eviction.py
@@ -81,7 +81,7 @@ def test_threshold_based_eviction(
 
     # create a bunch of L1s, only the least of which will need to be resident
     compaction_threshold = 3  # create L1 layers quickly
-    vps_http.patch_tenant_config_client_side(
+    vps_http.update_tenant_config(
         tenant_id,
         inserts={
             # Disable gc and compaction to avoid on-demand downloads from their side.
diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py
index 2c3ee38bae..5234d8278f 100644
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -514,7 +514,7 @@ def test_compaction_induced_by_detaches_in_history(
 
         assert len(delta_layers(branch_timeline_id)) == 5
 
-        env.storage_controller.pageserver_api().patch_tenant_config_client_side(
+        env.storage_controller.pageserver_api().update_tenant_config(
             env.initial_tenant, {"compaction_threshold": 5}, None
         )
 

From e8395807a59478d007180a0e3943b1db937c4899 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Wed, 11 Dec 2024 19:43:40 +0000
Subject: [PATCH 112/117] storcon: allow for more concurrency in drain/fill
 operations (#10093)

## Problem

We saw the drain/fill operations not drain fast enough in ap-southeast.

## Summary of changes

These are some quick changes to speed it up:
* double reconcile concurrency - this is now half of the available
reconcile bandwidth
* reduce the waiter polling timeout - this way we can spawn new
reconciliations faster
---
 storage_controller/src/background_node_operations.rs | 2 +-
 storage_controller/src/service.rs                    | 6 ++++--
 test_runner/regress/test_storage_controller.py       | 2 +-
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/storage_controller/src/background_node_operations.rs b/storage_controller/src/background_node_operations.rs
index 6f1355eb68..226d4942e7 100644
--- a/storage_controller/src/background_node_operations.rs
+++ b/storage_controller/src/background_node_operations.rs
@@ -3,7 +3,7 @@ use std::{borrow::Cow, fmt::Debug, fmt::Display};
 use tokio_util::sync::CancellationToken;
 use utils::id::NodeId;
 
-pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 32;
+pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 64;
 
 #[derive(Copy, Clone)]
 pub(crate) struct Drain {
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index e82e84fe89..2600500a53 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -100,6 +100,8 @@ use crate::{
 
 use context_iterator::TenantShardContextIterator;
 
+const WAITER_FILL_DRAIN_POLL_TIMEOUT: Duration = Duration::from_millis(500);
+
 // For operations that should be quick, like attaching a new tenant
 const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5);
 
@@ -6798,7 +6800,7 @@ impl Service {
             }
 
             waiters = self
-                .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
+                .await_waiters_remainder(waiters, WAITER_FILL_DRAIN_POLL_TIMEOUT)
                 .await;
 
             failpoint_support::sleep_millis_async!("sleepy-drain-loop", &cancel);
@@ -7051,7 +7053,7 @@ impl Service {
             }
 
             waiters = self
-                .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
+                .await_waiters_remainder(waiters, WAITER_FILL_DRAIN_POLL_TIMEOUT)
                 .await;
         }
 
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 4d1784d45a..02da389809 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -2136,7 +2136,7 @@ def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder):
     env.start()
 
     tenant_count = 10
-    shard_count_per_tenant = 8
+    shard_count_per_tenant = 16
     tenant_ids = []
 
     for _ in range(0, tenant_count):

From 7fa986bc923b6af3c77e888be96123f7ba068313 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Wed, 11 Dec 2024 21:10:22 +0100
Subject: [PATCH 113/117] Do tenant manifest validation with index-part
 (#10007)

This adds some validation of invariants that we want to uphold wrt the
tenant manifest and `index_part.json`:

* the data the manifest has about a timeline must match with the data in
`index_part.json`. It might actually change, e.g. when we do reparenting
during detach ancestor, but that requires the timeline to be
unoffloaded, i.e. removed from the manifest.
* any timeline mentioned in index part, must, if present, be archived.
If we unarchive, we first update the tenant manifest to unoffload, and
only then update index part. And one needs to archive before offloading.
* it is legal for timelines to be mentioned in the manifest but have no
`index_part`: this is a temporary state visible during deletion of the
timeline. if the pageserver crashed, an attach of the tenant will clean
the state up.
* it is also legal for offloaded timelines to have an
`ancestor_retain_lsn` of None while having an `ancestor_timeline_id`.
This is for the to-be-added flattening functionality: the plan is to set
former to None if we have flattened a timeline.

follow-up of #9942
part of #8088
---
 storage_scrubber/src/checks.rs                |  45 +++--
 .../src/pageserver_physical_gc.rs             | 156 ++++++++++++++----
 2 files changed, 155 insertions(+), 46 deletions(-)

diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs
index 1b4ff01a17..f759f54d19 100644
--- a/storage_scrubber/src/checks.rs
+++ b/storage_scrubber/src/checks.rs
@@ -533,8 +533,9 @@ async fn list_timeline_blobs_impl(
 }
 
 pub(crate) struct RemoteTenantManifestInfo {
-    pub(crate) latest_generation: Option<Generation>,
-    pub(crate) manifests: Vec<(Generation, ListingObject)>,
+    pub(crate) generation: Generation,
+    pub(crate) manifest: TenantManifest,
+    pub(crate) listing_object: ListingObject,
 }
 
 pub(crate) enum ListTenantManifestResult {
@@ -543,7 +544,10 @@ pub(crate) enum ListTenantManifestResult {
         #[allow(dead_code)]
         unknown_keys: Vec<ListingObject>,
     },
-    NoErrors(RemoteTenantManifestInfo),
+    NoErrors {
+        latest_generation: Option<RemoteTenantManifestInfo>,
+        manifests: Vec<(Generation, ListingObject)>,
+    },
 }
 
 /// Lists the tenant manifests in remote storage and parses the latest one, returning a [`ListTenantManifestResult`] object.
@@ -592,14 +596,6 @@ pub(crate) async fn list_tenant_manifests(
         unknown_keys.push(obj);
     }
 
-    if manifests.is_empty() {
-        tracing::debug!("No manifest for timeline.");
-
-        return Ok(ListTenantManifestResult::WithErrors {
-            errors,
-            unknown_keys,
-        });
-    }
     if !unknown_keys.is_empty() {
         errors.push(((*prefix_str).to_owned(), "unknown keys listed".to_string()));
 
@@ -609,6 +605,15 @@ pub(crate) async fn list_tenant_manifests(
         });
     }
 
+    if manifests.is_empty() {
+        tracing::debug!("No manifest for timeline.");
+
+        return Ok(ListTenantManifestResult::NoErrors {
+            latest_generation: None,
+            manifests,
+        });
+    }
+
     // Find the manifest with the highest generation
     let (latest_generation, latest_listing_object) = manifests
         .iter()
@@ -616,6 +621,8 @@ pub(crate) async fn list_tenant_manifests(
         .map(|(g, obj)| (*g, obj.clone()))
         .unwrap();
 
+    manifests.retain(|(gen, _obj)| gen != &latest_generation);
+
     let manifest_bytes =
         match download_object_with_retries(remote_client, &latest_listing_object.key).await {
             Ok(bytes) => bytes,
@@ -634,13 +641,15 @@ pub(crate) async fn list_tenant_manifests(
         };
 
     match TenantManifest::from_json_bytes(&manifest_bytes) {
-        Ok(_manifest) => {
-            return Ok(ListTenantManifestResult::NoErrors(
-                RemoteTenantManifestInfo {
-                    latest_generation: Some(latest_generation),
-                    manifests,
-                },
-            ));
+        Ok(manifest) => {
+            return Ok(ListTenantManifestResult::NoErrors {
+                latest_generation: Some(RemoteTenantManifestInfo {
+                    generation: latest_generation,
+                    manifest,
+                    listing_object: latest_listing_object,
+                }),
+                manifests,
+            });
         }
         Err(parse_error) => errors.push((
             latest_listing_object.key.get_path().as_str().to_owned(),
diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs
index 20cb9c3633..d19b8a5f91 100644
--- a/storage_scrubber/src/pageserver_physical_gc.rs
+++ b/storage_scrubber/src/pageserver_physical_gc.rs
@@ -4,11 +4,13 @@ use std::time::Duration;
 
 use crate::checks::{
     list_tenant_manifests, list_timeline_blobs, BlobDataParseResult, ListTenantManifestResult,
+    RemoteTenantManifestInfo,
 };
 use crate::metadata_stream::{stream_tenant_timelines, stream_tenants};
 use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, MAX_RETRIES};
 use futures_util::{StreamExt, TryStreamExt};
 use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata;
+use pageserver::tenant::remote_timeline_client::manifest::OffloadedTimelineManifest;
 use pageserver::tenant::remote_timeline_client::{
     parse_remote_index_path, parse_remote_tenant_manifest_path, remote_layer_path,
 };
@@ -527,7 +529,7 @@ async fn gc_tenant_manifests(
     target: &RootTarget,
     mode: GcMode,
     tenant_shard_id: TenantShardId,
-) -> anyhow::Result<GcSummary> {
+) -> anyhow::Result<(GcSummary, Option<RemoteTenantManifestInfo>)> {
     let mut gc_summary = GcSummary::default();
     match list_tenant_manifests(remote_client, tenant_shard_id, target).await? {
         ListTenantManifestResult::WithErrors {
@@ -537,33 +539,35 @@ async fn gc_tenant_manifests(
             for (_key, error) in errors {
                 tracing::warn!(%tenant_shard_id, "list_tenant_manifests: {error}");
             }
+            Ok((gc_summary, None))
         }
-        ListTenantManifestResult::NoErrors(mut manifest_info) => {
-            let Some(latest_gen) = manifest_info.latest_generation else {
-                return Ok(gc_summary);
+        ListTenantManifestResult::NoErrors {
+            latest_generation,
+            mut manifests,
+        } => {
+            let Some(latest_generation) = latest_generation else {
+                return Ok((gc_summary, None));
             };
-            manifest_info
-                .manifests
-                .sort_by_key(|(generation, _obj)| *generation);
+            manifests.sort_by_key(|(generation, _obj)| *generation);
             // skip the two latest generations (they don't neccessarily have to be 1 apart from each other)
-            let candidates = manifest_info.manifests.iter().rev().skip(2);
+            let candidates = manifests.iter().rev().skip(2);
             for (_generation, key) in candidates {
                 maybe_delete_tenant_manifest(
                     remote_client,
                     &min_age,
-                    latest_gen,
+                    latest_generation.generation,
                     key,
                     mode,
                     &mut gc_summary,
                 )
                 .instrument(
-                    info_span!("maybe_delete_tenant_manifest", %tenant_shard_id, ?latest_gen, %key.key),
+                    info_span!("maybe_delete_tenant_manifest", %tenant_shard_id, ?latest_generation.generation, %key.key),
                 )
                 .await;
             }
+            Ok((gc_summary, Some(latest_generation)))
         }
     }
-    Ok(gc_summary)
 }
 
 async fn gc_timeline(
@@ -573,6 +577,7 @@ async fn gc_timeline(
     mode: GcMode,
     ttid: TenantShardTimelineId,
     accumulator: &Arc<std::sync::Mutex<TenantRefAccumulator>>,
+    tenant_manifest_info: Arc<Option<RemoteTenantManifestInfo>>,
 ) -> anyhow::Result<GcSummary> {
     let mut summary = GcSummary::default();
     let data = list_timeline_blobs(remote_client, ttid, target).await?;
@@ -597,6 +602,60 @@ async fn gc_timeline(
         }
     };
 
+    if let Some(tenant_manifest_info) = &*tenant_manifest_info {
+        // TODO: this is O(n^2) in the number of offloaded timelines. Do a hashmap lookup instead.
+        let maybe_offloaded = tenant_manifest_info
+            .manifest
+            .offloaded_timelines
+            .iter()
+            .find(|offloaded_timeline| offloaded_timeline.timeline_id == ttid.timeline_id);
+        if let Some(offloaded) = maybe_offloaded {
+            let warnings = validate_index_part_with_offloaded(index_part, offloaded);
+            let warn = if warnings.is_empty() {
+                false
+            } else {
+                // Verify that the manifest hasn't changed. If it has, a potential racing change could have been cause for our troubles.
+                match list_tenant_manifests(remote_client, ttid.tenant_shard_id, target).await? {
+                    ListTenantManifestResult::WithErrors {
+                        errors,
+                        unknown_keys: _,
+                    } => {
+                        for (_key, error) in errors {
+                            tracing::warn!(%ttid, "list_tenant_manifests in gc_timeline: {error}");
+                        }
+                        true
+                    }
+                    ListTenantManifestResult::NoErrors {
+                        latest_generation,
+                        manifests: _,
+                    } => {
+                        if let Some(new_latest_gen) = latest_generation {
+                            let manifest_changed = (
+                                new_latest_gen.generation,
+                                new_latest_gen.listing_object.last_modified,
+                            ) == (
+                                tenant_manifest_info.generation,
+                                tenant_manifest_info.listing_object.last_modified,
+                            );
+                            if manifest_changed {
+                                tracing::debug!(%ttid, "tenant manifest changed since it was loaded, suppressing {} warnings", warnings.len());
+                            }
+                            manifest_changed
+                        } else {
+                            // The latest generation is gone. This timeline is in the progress of being deleted?
+                            false
+                        }
+                    }
+                }
+            };
+            if warn {
+                for warning in warnings {
+                    tracing::warn!(%ttid, "{}", warning);
+                }
+            }
+        }
+    }
+
     accumulator.lock().unwrap().update(ttid, index_part);
 
     for key in candidates {
@@ -608,6 +667,35 @@ async fn gc_timeline(
     Ok(summary)
 }
 
+fn validate_index_part_with_offloaded(
+    index_part: &IndexPart,
+    offloaded: &OffloadedTimelineManifest,
+) -> Vec<String> {
+    let mut warnings = Vec::new();
+    if let Some(archived_at_index_part) = index_part.archived_at {
+        if archived_at_index_part
+            .signed_duration_since(offloaded.archived_at)
+            .num_seconds()
+            != 0
+        {
+            warnings.push(format!(
+                "index-part archived_at={} differs from manifest archived_at={}",
+                archived_at_index_part, offloaded.archived_at
+            ));
+        }
+    } else {
+        warnings.push("Timeline offloaded in manifest but not archived in index-part".to_string());
+    }
+    if index_part.metadata.ancestor_timeline() != offloaded.ancestor_timeline_id {
+        warnings.push(format!(
+            "index-part anestor={:?} differs from manifest ancestor={:?}",
+            index_part.metadata.ancestor_timeline(),
+            offloaded.ancestor_timeline_id
+        ));
+    }
+    warnings
+}
+
 /// Physical garbage collection: removing unused S3 objects.
 ///
 /// This is distinct from the garbage collection done inside the pageserver, which operates at a higher level
@@ -650,29 +738,38 @@ pub async fn pageserver_physical_gc(
         let target_ref = &target;
         let remote_client_ref = &remote_client;
         async move {
-            let summaries_from_manifests = match gc_tenant_manifests(
+            let gc_manifest_result = gc_tenant_manifests(
                 remote_client_ref,
                 min_age,
                 target_ref,
                 mode,
                 tenant_shard_id,
             )
-            .await
-            {
-                Ok(gc_summary) => vec![Ok(GcSummaryOrContent::<TenantShardTimelineId>::GcSummary(
-                    gc_summary,
-                ))],
+            .await;
+            let (summary_from_manifest, tenant_manifest_opt) = match gc_manifest_result {
+                Ok((gc_summary, tenant_manifest)) => (gc_summary, tenant_manifest),
                 Err(e) => {
                     tracing::warn!(%tenant_shard_id, "Error in gc_tenant_manifests: {e}");
-                    Vec::new()
+                    (GcSummary::default(), None)
                 }
             };
+            let tenant_manifest_arc = Arc::new(tenant_manifest_opt);
+            let summary_from_manifest = Ok(GcSummaryOrContent::<(_, _)>::GcSummary(
+                summary_from_manifest,
+            ));
             stream_tenant_timelines(remote_client_ref, target_ref, tenant_shard_id)
                 .await
                 .map(|stream| {
                     stream
-                        .map_ok(GcSummaryOrContent::Content)
-                        .chain(futures::stream::iter(summaries_from_manifests.into_iter()))
+                        .zip(futures::stream::iter(std::iter::repeat(
+                            tenant_manifest_arc,
+                        )))
+                        .map(|(ttid_res, tenant_manifest_arc)| {
+                            ttid_res.map(move |ttid| {
+                                GcSummaryOrContent::Content((ttid, tenant_manifest_arc))
+                            })
+                        })
+                        .chain(futures::stream::iter([summary_from_manifest].into_iter()))
                 })
         }
     });
@@ -684,14 +781,17 @@ pub async fn pageserver_physical_gc(
     // Drain futures for per-shard GC, populating accumulator as a side effect
     {
         let timelines = timelines.map_ok(|summary_or_ttid| match summary_or_ttid {
-            GcSummaryOrContent::Content(ttid) => futures::future::Either::Left(gc_timeline(
-                &remote_client,
-                &min_age,
-                &target,
-                mode,
-                ttid,
-                &accumulator,
-            )),
+            GcSummaryOrContent::Content((ttid, tenant_manifest_arc)) => {
+                futures::future::Either::Left(gc_timeline(
+                    &remote_client,
+                    &min_age,
+                    &target,
+                    mode,
+                    ttid,
+                    &accumulator,
+                    tenant_manifest_arc,
+                ))
+            }
             GcSummaryOrContent::GcSummary(gc_summary) => {
                 futures::future::Either::Right(futures::future::ok(gc_summary))
             }

From 5126ebbfed85404d89b068288f02765e7141aaf7 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Wed, 11 Dec 2024 22:37:25 +0100
Subject: [PATCH 114/117] test_runner: bump test_check_visibility_map timeout
 (#10091)

## Problem

`test_check_visibility_map` has been seen to time out in debug tests.

## Summary of changes

Bump the timeout to 10 minutes (test reports indicate 7 minutes is
sufficient).

We don't want to disable the test entirely in debug builds, to exercise
this with debug assertions enabled.

Resolves #10069.
---
 test_runner/regress/test_vm_bits.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py
index 46e90852a6..d9e59c71f4 100644
--- a/test_runner/regress/test_vm_bits.py
+++ b/test_runner/regress/test_vm_bits.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 import time
 from contextlib import closing
 
+import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin, fork_at_current_lsn
 from fixtures.utils import query_scalar
@@ -294,6 +295,7 @@ def test_vm_bit_clear_on_heap_lock_blackbox(neon_env_builder: NeonEnvBuilder):
     cur.execute("commit transaction")
 
 
+@pytest.mark.timeout(600)  # slow in debug builds
 def test_check_visibility_map(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
     """
     Runs pgbench across a few databases on a sharded tenant, then performs a visibility map

From b391b29bdc2af059261dc00ecfe13a98cf9f2e54 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Wed, 11 Dec 2024 16:21:42 -0600
Subject: [PATCH 115/117] Improve typing in test_runner/fixtures/httpserver.py
 (#10103)

Signed-off-by: Tristan Partin <tristan@neon.tech>
---
 test_runner/fixtures/httpserver.py                | 15 ++++++++-------
 test_runner/regress/test_ddl_forwarding.py        |  4 +++-
 test_runner/regress/test_download_extensions.py   |  6 ++++--
 .../regress/test_pageserver_metric_collection.py  |  6 ++++--
 .../regress/test_proxy_metric_collection.py       |  6 +++++-
 test_runner/regress/test_sharding.py              |  7 +++++--
 test_runner/regress/test_storage_controller.py    |  8 +++++---
 .../regress/test_threshold_based_eviction.py      |  6 +++++-
 8 files changed, 39 insertions(+), 19 deletions(-)

diff --git a/test_runner/fixtures/httpserver.py b/test_runner/fixtures/httpserver.py
index f653fd804c..1f46bb22b2 100644
--- a/test_runner/fixtures/httpserver.py
+++ b/test_runner/fixtures/httpserver.py
@@ -7,24 +7,25 @@ from pytest_httpserver import HTTPServer
 
 if TYPE_CHECKING:
     from collections.abc import Iterator
+    from ssl import SSLContext
 
     from fixtures.port_distributor import PortDistributor
 
-# TODO: mypy fails with:
-#  Module "fixtures.neon_fixtures" does not explicitly export attribute "PortDistributor"  [attr-defined]
-# from fixtures.neon_fixtures import PortDistributor
+    ListenAddress = tuple[str, int]
 
 # compared to the fixtures from pytest_httpserver with same names, these are
 # always function scoped, so you can check and stop the server in tests.
 
 
 @pytest.fixture(scope="function")
-def httpserver_ssl_context():
-    return None
+def httpserver_ssl_context() -> Iterator[SSLContext | None]:
+    yield None
 
 
 @pytest.fixture(scope="function")
-def make_httpserver(httpserver_listen_address, httpserver_ssl_context) -> Iterator[HTTPServer]:
+def make_httpserver(
+    httpserver_listen_address: ListenAddress, httpserver_ssl_context: SSLContext | None
+) -> Iterator[HTTPServer]:
     host, port = httpserver_listen_address
     if not host:
         host = HTTPServer.DEFAULT_LISTEN_HOST
@@ -47,6 +48,6 @@ def httpserver(make_httpserver: HTTPServer) -> Iterator[HTTPServer]:
 
 
 @pytest.fixture(scope="function")
-def httpserver_listen_address(port_distributor: PortDistributor) -> tuple[str, int]:
+def httpserver_listen_address(port_distributor: PortDistributor) -> ListenAddress:
     port = port_distributor.get_port()
     return ("localhost", port)
diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py
index 1c5554c379..de44bbcbc8 100644
--- a/test_runner/regress/test_ddl_forwarding.py
+++ b/test_runner/regress/test_ddl_forwarding.py
@@ -15,6 +15,8 @@ from werkzeug.wrappers.response import Response
 if TYPE_CHECKING:
     from typing import Any, Self
 
+    from fixtures.httpserver import ListenAddress
+
 
 def handle_db(dbs, roles, operation):
     if operation["op"] == "set":
@@ -120,7 +122,7 @@ class DdlForwardingContext:
 
 @pytest.fixture(scope="function")
 def ddl(
-    httpserver: HTTPServer, vanilla_pg: VanillaPostgres, httpserver_listen_address: tuple[str, int]
+    httpserver: HTTPServer, vanilla_pg: VanillaPostgres, httpserver_listen_address: ListenAddress
 ):
     (host, port) = httpserver_listen_address
     with DdlForwardingContext(httpserver, vanilla_pg, host, port) as ddl:
diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py
index b2e19ad713..f18f4e78bd 100644
--- a/test_runner/regress/test_download_extensions.py
+++ b/test_runner/regress/test_download_extensions.py
@@ -20,6 +20,8 @@ from werkzeug.wrappers.response import Response
 if TYPE_CHECKING:
     from typing import Any
 
+    from fixtures.httpserver import ListenAddress
+
 
 # use neon_env_builder_local fixture to override the default neon_env_builder fixture
 # and use a test-specific pg_install instead of shared one
@@ -47,8 +49,8 @@ def neon_env_builder_local(
 def test_remote_extensions(
     httpserver: HTTPServer,
     neon_env_builder_local: NeonEnvBuilder,
-    httpserver_listen_address,
-    pg_version,
+    httpserver_listen_address: ListenAddress,
+    pg_version: PgVersion,
 ):
     # setup mock http server
     # that expects request for anon.tar.zst
diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py
index 5ec8357597..aedfdbd210 100644
--- a/test_runner/regress/test_pageserver_metric_collection.py
+++ b/test_runner/regress/test_pageserver_metric_collection.py
@@ -27,6 +27,8 @@ from werkzeug.wrappers.response import Response
 if TYPE_CHECKING:
     from typing import Any
 
+    from fixtures.httpserver import ListenAddress
+
 
 # TODO: collect all of the env setup *AFTER* removal of RemoteStorageKind.NOOP
 
@@ -34,7 +36,7 @@ if TYPE_CHECKING:
 def test_metric_collection(
     httpserver: HTTPServer,
     neon_env_builder: NeonEnvBuilder,
-    httpserver_listen_address,
+    httpserver_listen_address: ListenAddress,
 ):
     (host, port) = httpserver_listen_address
     metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events"
@@ -195,7 +197,7 @@ def test_metric_collection(
 def test_metric_collection_cleans_up_tempfile(
     httpserver: HTTPServer,
     neon_env_builder: NeonEnvBuilder,
-    httpserver_listen_address,
+    httpserver_listen_address: ListenAddress,
 ):
     (host, port) = httpserver_listen_address
     metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events"
diff --git a/test_runner/regress/test_proxy_metric_collection.py b/test_runner/regress/test_proxy_metric_collection.py
index dd63256388..5ff4a99c51 100644
--- a/test_runner/regress/test_proxy_metric_collection.py
+++ b/test_runner/regress/test_proxy_metric_collection.py
@@ -2,6 +2,7 @@ from __future__ import annotations
 
 from collections.abc import Iterator
 from pathlib import Path
+from typing import TYPE_CHECKING
 
 import pytest
 from fixtures.log_helper import log
@@ -15,6 +16,9 @@ from pytest_httpserver import HTTPServer
 from werkzeug.wrappers.request import Request
 from werkzeug.wrappers.response import Response
 
+if TYPE_CHECKING:
+    from fixtures.httpserver import ListenAddress
+
 
 def proxy_metrics_handler(request: Request) -> Response:
     if request.json is None:
@@ -38,7 +42,7 @@ def proxy_metrics_handler(request: Request) -> Response:
 def proxy_with_metric_collector(
     port_distributor: PortDistributor,
     neon_binpath: Path,
-    httpserver_listen_address,
+    httpserver_listen_address: ListenAddress,
     test_output_dir: Path,
 ) -> Iterator[NeonProxy]:
     """Neon proxy that routes through link auth and has metric collection enabled."""
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 30abf91d3a..743ab0088b 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -3,7 +3,7 @@ from __future__ import annotations
 import os
 import time
 from collections import defaultdict
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 import pytest
 import requests
@@ -27,6 +27,9 @@ from typing_extensions import override
 from werkzeug.wrappers.request import Request
 from werkzeug.wrappers.response import Response
 
+if TYPE_CHECKING:
+    from fixtures.httpserver import ListenAddress
+
 
 def test_sharding_smoke(
     neon_env_builder: NeonEnvBuilder,
@@ -759,7 +762,7 @@ def test_sharding_split_smoke(
 def test_sharding_split_stripe_size(
     neon_env_builder: NeonEnvBuilder,
     httpserver: HTTPServer,
-    httpserver_listen_address,
+    httpserver_listen_address: ListenAddress,
     initial_stripe_size: int,
 ):
     """
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 02da389809..5f3a7b39d3 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -58,6 +58,8 @@ from werkzeug.wrappers.response import Response
 if TYPE_CHECKING:
     from typing import Any
 
+    from fixtures.httpserver import ListenAddress
+
 
 def get_node_shard_counts(env: NeonEnv, tenant_ids):
     counts: defaultdict[int, int] = defaultdict(int)
@@ -563,7 +565,7 @@ def test_storage_controller_onboard_detached(neon_env_builder: NeonEnvBuilder):
 def test_storage_controller_compute_hook(
     httpserver: HTTPServer,
     neon_env_builder: NeonEnvBuilder,
-    httpserver_listen_address,
+    httpserver_listen_address: ListenAddress,
 ):
     """
     Test that the sharding service calls out to the configured HTTP endpoint on attachment changes
@@ -681,7 +683,7 @@ NOTIFY_FAILURE_LOGS = [
 def test_storage_controller_stuck_compute_hook(
     httpserver: HTTPServer,
     neon_env_builder: NeonEnvBuilder,
-    httpserver_listen_address,
+    httpserver_listen_address: ListenAddress,
 ):
     """
     Test the migration process's behavior when the compute hook does not enable it to proceed
@@ -818,7 +820,7 @@ def test_storage_controller_stuck_compute_hook(
 def test_storage_controller_compute_hook_revert(
     httpserver: HTTPServer,
     neon_env_builder: NeonEnvBuilder,
-    httpserver_listen_address,
+    httpserver_listen_address: ListenAddress,
 ):
     """
     'revert' in the sense of a migration which gets reversed shortly after, as may happen during
diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py
index bedbd84aee..c87b520366 100644
--- a/test_runner/regress/test_threshold_based_eviction.py
+++ b/test_runner/regress/test_threshold_based_eviction.py
@@ -2,6 +2,7 @@ from __future__ import annotations
 
 import time
 from dataclasses import dataclass
+from typing import TYPE_CHECKING
 
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
@@ -13,12 +14,15 @@ from fixtures.pageserver.http import LayerMapInfo
 from fixtures.remote_storage import RemoteStorageKind
 from pytest_httpserver import HTTPServer
 
+if TYPE_CHECKING:
+    from fixtures.httpserver import ListenAddress
+
 # NB: basic config change tests are in test_tenant_conf.py
 
 
 def test_threshold_based_eviction(
     httpserver: HTTPServer,
-    httpserver_listen_address,
+    httpserver_listen_address: ListenAddress,
     pg_bin: PgBin,
     neon_env_builder: NeonEnvBuilder,
 ):

From 342cbea255aa601079ed01f91f902c8f2da8c3e3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Thu, 12 Dec 2024 02:09:24 +0100
Subject: [PATCH 116/117] storcon: add safekeeper list API (#10089)

This adds an API to the storage controller to list safekeepers
registered to it.

This PR does a `diesel print-schema > storage_controller/src/schema.rs`
because of an inconsistency between up.sql and schema.rs, introduced by
[this](https://github.com/neondatabase/neon/pull/8879/commits/2c142f14f7ba9bd9a178ae96723d196330eb34ba)
commit, so there is some updates of `schema.rs` due to that. As a
followup to this, we should maybe think about running `diesel
print-schema` in CI.

Part of #9981
---
 storage_controller/src/http.rs                | 41 +++++++++++++++----
 storage_controller/src/persistence.rs         | 17 ++++++++
 storage_controller/src/schema.rs              | 35 +++++++++-------
 storage_controller/src/service.rs             |  6 +++
 test_runner/fixtures/neon_fixtures.py         | 10 +++++
 .../regress/test_storage_controller.py        |  5 +++
 6 files changed, 90 insertions(+), 24 deletions(-)

diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index dce5380aa0..24fd4c341a 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -879,6 +879,21 @@ async fn handle_cancel_node_fill(req: Request<Body>) -> Result<Response<Body>, A
     json_response(StatusCode::ACCEPTED, ())
 }
 
+async fn handle_safekeeper_list(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Infra)?;
+
+    let req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
+    let state = get_state(&req);
+    let safekeepers = state.service.safekeepers_list().await?;
+    json_response(StatusCode::OK, safekeepers)
+}
+
 async fn handle_metadata_health_update(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Scrubber)?;
 
@@ -1203,7 +1218,7 @@ impl From<ReconcileError> for ApiError {
 ///
 /// Not used by anything except manual testing.
 async fn handle_get_safekeeper(req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::Admin)?;
+    check_permissions(&req, Scope::Infra)?;
 
     let id = parse_request_param::<i64>(&req, "id")?;
 
@@ -1221,7 +1236,7 @@ async fn handle_get_safekeeper(req: Request<Body>) -> Result<Response<Body>, Api
     match res {
         Ok(b) => json_response(StatusCode::OK, b),
         Err(crate::persistence::DatabaseError::Query(diesel::result::Error::NotFound)) => {
-            Err(ApiError::NotFound("unknown instance_id".into()))
+            Err(ApiError::NotFound("unknown instance id".into()))
         }
         Err(other) => Err(other.into()),
     }
@@ -1817,6 +1832,21 @@ pub fn make_router(
                 RequestName("control_v1_metadata_health_list_outdated"),
             )
         })
+        // Safekeepers
+        .get("/control/v1/safekeeper", |r| {
+            named_request_span(
+                r,
+                handle_safekeeper_list,
+                RequestName("control_v1_safekeeper_list"),
+            )
+        })
+        .get("/control/v1/safekeeper/:id", |r| {
+            named_request_span(r, handle_get_safekeeper, RequestName("v1_safekeeper"))
+        })
+        .post("/control/v1/safekeeper/:id", |r| {
+            // id is in the body
+            named_request_span(r, handle_upsert_safekeeper, RequestName("v1_safekeeper"))
+        })
         // Tenant Shard operations
         .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
             tenant_service_handler(
@@ -1869,13 +1899,6 @@ pub fn make_router(
         .put("/control/v1/step_down", |r| {
             named_request_span(r, handle_step_down, RequestName("control_v1_step_down"))
         })
-        .get("/control/v1/safekeeper/:id", |r| {
-            named_request_span(r, handle_get_safekeeper, RequestName("v1_safekeeper"))
-        })
-        .post("/control/v1/safekeeper/:id", |r| {
-            // id is in the body
-            named_request_span(r, handle_upsert_safekeeper, RequestName("v1_safekeeper"))
-        })
         // Tenant operations
         // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
         // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs
index 7ca80c7dfe..e17fe78d25 100644
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -104,6 +104,7 @@ pub(crate) enum DatabaseOperation {
     ListMetadataHealth,
     ListMetadataHealthUnhealthy,
     ListMetadataHealthOutdated,
+    ListSafekeepers,
     GetLeader,
     UpdateLeader,
     SetPreferredAzs,
@@ -1011,6 +1012,22 @@ impl Persistence {
         Ok(())
     }
 
+    /// At startup, populate the list of nodes which our shards may be placed on
+    pub(crate) async fn list_safekeepers(&self) -> DatabaseResult<Vec<SafekeeperPersistence>> {
+        let safekeepers: Vec<SafekeeperPersistence> = self
+            .with_measured_conn(
+                DatabaseOperation::ListNodes,
+                move |conn| -> DatabaseResult<_> {
+                    Ok(crate::schema::safekeepers::table.load::<SafekeeperPersistence>(conn)?)
+                },
+            )
+            .await?;
+
+        tracing::info!("list_safekeepers: loaded {} nodes", safekeepers.len());
+
+        Ok(safekeepers)
+    }
+
     pub(crate) async fn safekeeper_get(
         &self,
         id: i64,
diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs
index 1717a9369d..9e005ab932 100644
--- a/storage_controller/src/schema.rs
+++ b/storage_controller/src/schema.rs
@@ -29,6 +29,19 @@ diesel::table! {
     }
 }
 
+diesel::table! {
+    safekeepers (id) {
+        id -> Int8,
+        region_id -> Text,
+        version -> Int8,
+        host -> Text,
+        port -> Int4,
+        active -> Bool,
+        http_port -> Int4,
+        availability_zone_id -> Text,
+    }
+}
+
 diesel::table! {
     tenant_shards (tenant_id, shard_number, shard_count) {
         tenant_id -> Varchar,
@@ -45,18 +58,10 @@ diesel::table! {
     }
 }
 
-diesel::allow_tables_to_appear_in_same_query!(controllers, metadata_health, nodes, tenant_shards,);
-
-diesel::table! {
-    safekeepers {
-        id -> Int8,
-        region_id -> Text,
-        version -> Int8,
-        instance_id -> Text,
-        host -> Text,
-        port -> Int4,
-        active -> Bool,
-        http_port -> Int4,
-        availability_zone_id -> Text,
-    }
-}
+diesel::allow_tables_to_appear_in_same_query!(
+    controllers,
+    metadata_health,
+    nodes,
+    safekeepers,
+    tenant_shards,
+);
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 2600500a53..894b67fdc6 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -7185,6 +7185,12 @@ impl Service {
         global_observed
     }
 
+    pub(crate) async fn safekeepers_list(
+        &self,
+    ) -> Result<Vec<crate::persistence::SafekeeperPersistence>, DatabaseError> {
+        self.persistence.list_safekeepers().await
+    }
+
     pub(crate) async fn get_safekeeper(
         &self,
         id: i64,
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 8354432c0c..0ecc324030 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2329,6 +2329,16 @@ class NeonStorageController(MetricsGetter, LogUtils):
                 return None
             raise e
 
+    def get_safekeepers(self) -> list[dict[str, Any]]:
+        response = self.request(
+            "GET",
+            f"{self.api}/control/v1/safekeeper",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+        json = response.json()
+        assert isinstance(json, list)
+        return json
+
     def set_preferred_azs(self, preferred_azs: dict[TenantShardId, str]) -> list[TenantShardId]:
         response = self.request(
             "PUT",
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 5f3a7b39d3..ae9b596a1b 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -2955,6 +2955,8 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):
 
     assert target.get_safekeeper(fake_id) is None
 
+    assert len(target.get_safekeepers()) == 0
+
     body = {
         "active": True,
         "id": fake_id,
@@ -2972,6 +2974,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):
 
     inserted = target.get_safekeeper(fake_id)
     assert inserted is not None
+    assert target.get_safekeepers() == [inserted]
     assert eq_safekeeper_records(body, inserted)
 
     # error out if pk is changed (unexpected)
@@ -2983,6 +2986,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):
     assert exc.value.status_code == 400
 
     inserted_again = target.get_safekeeper(fake_id)
+    assert target.get_safekeepers() == [inserted_again]
     assert inserted_again is not None
     assert eq_safekeeper_records(inserted, inserted_again)
 
@@ -2991,6 +2995,7 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):
     body["version"] += 1
     target.on_safekeeper_deploy(fake_id, body)
     inserted_now = target.get_safekeeper(fake_id)
+    assert target.get_safekeepers() == [inserted_now]
     assert inserted_now is not None
 
     assert eq_safekeeper_records(body, inserted_now)

From def05700d56bc23d0b805abe248644540ab18ad8 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Thu, 12 Dec 2024 06:02:08 +0000
Subject: [PATCH 117/117] Proxy release 2024-12-12