Keep the conn info cache on max_client_conn from pgbouncer (#11986)

## Problem
Hitting pgbouncer's `max_client_conn` limit led to invalidation of the
connection info cache, so customers would then hit the limit on
`wake_compute`.

## Summary of changes
`should_retry_wake_compute` now treats this specific pgbouncer error as
non-retriable, meaning we won't try to wake up the compute again and the
cached connection info is kept.
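
For context (not part of this diff): this classification feeds the proxy's connect/retry loop, which only invalidates the cached compute info and calls `wake_compute` again when the error says a retry is worthwhile. Below is a minimal, hypothetical sketch of that flow; `PgError`, `connect_with_cache`, and the closures are stand-ins for the real proxy types, and only the `max_client_conn` check mirrors this commit.

```rust
// Hypothetical sketch; the real logic lives in the proxy's retry/connect code.
trait ShouldRetryWakeCompute {
    fn should_retry_wake_compute(&self) -> bool;
}

struct PgError {
    code: &'static str, // SQLSTATE, e.g. "08P01" = protocol_violation
    message: String,
}

impl ShouldRetryWakeCompute for PgError {
    fn should_retry_wake_compute(&self) -> bool {
        // pgbouncer reports max_client_conn as a protocol_violation (08P01):
        // the compute address is fine, so keep the cache and don't wake again.
        if self.code == "08P01"
            && self.message.contains("no more connections allowed (max_client_conn)")
        {
            return false;
        }
        true
    }
}

/// One connect attempt plus at most one cache-invalidating retry.
fn connect_with_cache(
    cache: &mut Option<String>,                    // stand-in for the conn info cache
    connect: impl Fn(&str) -> Result<(), PgError>, // stand-in for a single connect attempt
    wake_compute: impl Fn() -> String,             // stand-in for wake_compute
) -> Result<(), PgError> {
    let addr = cache.get_or_insert_with(&wake_compute).clone();
    match connect(&addr) {
        Ok(()) => Ok(()),
        Err(err) if err.should_retry_wake_compute() => {
            // The cached entry looks stale: invalidate, wake again, retry once.
            *cache = None;
            let addr = cache.get_or_insert_with(&wake_compute).clone();
            connect(&addr)
        }
        // max_client_conn and friends: keep the cache and surface the error.
        Err(err) => Err(err),
    }
}

fn main() {
    let mut cache = Some("compute-node-1:5432".to_string());
    let result = connect_with_cache(
        &mut cache,
        |_addr| {
            Err(PgError {
                code: "08P01",
                message: "no more connections allowed (max_client_conn)".to_string(),
            })
        },
        || "compute-node-2:5432".to_string(),
    );
    // The error is surfaced, but the cached compute address survives.
    assert!(result.is_err());
    assert_eq!(cache.as_deref(), Some("compute-node-1:5432"));
}
```

The asymmetry is the point: errors that suggest a stale compute address drop the cache entry and wake the compute again, while pooler-side errors such as `max_client_conn` leave the cached entry untouched.
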
Author: Konstantin Merenkov
Date: 2025-05-21 17:27:30 +02:00
Committed by: GitHub
Parent: 136cf1979b
Commit: 5db20af8a7
6 changed files with 240 additions and 4 deletions


@@ -127,3 +127,4 @@ rstest.workspace = true
 walkdir.workspace = true
 rand_distr = "0.4"
 tokio-postgres.workspace = true
+tracing-test = "0.2"


@@ -48,7 +48,7 @@ impl ShouldRetryWakeCompute for postgres_client::error::DbError {
         use postgres_client::error::SqlState;
         // Here are errors that happens after the user successfully authenticated to the database.
         // TODO: there are pgbouncer errors that should be retried, but they are not listed here.
-        !matches!(
+        let non_retriable_pg_errors = matches!(
             self.code(),
             &SqlState::TOO_MANY_CONNECTIONS
                 | &SqlState::OUT_OF_MEMORY
@@ -56,8 +56,20 @@ impl ShouldRetryWakeCompute for postgres_client::error::DbError {
                 | &SqlState::T_R_SERIALIZATION_FAILURE
                 | &SqlState::INVALID_CATALOG_NAME
                 | &SqlState::INVALID_SCHEMA_NAME
-                | &SqlState::INVALID_PARAMETER_VALUE
-        )
+                | &SqlState::INVALID_PARAMETER_VALUE,
+        );
+        if non_retriable_pg_errors {
+            return false;
+        }
+        // PGBouncer errors that should not trigger a wake_compute retry.
+        if self.code() == &SqlState::PROTOCOL_VIOLATION {
+            // Source for the error message:
+            // https://github.com/pgbouncer/pgbouncer/blob/f15997fe3effe3a94ba8bcc1ea562e6117d1a131/src/client.c#L1070
+            return !self
+                .message()
+                .contains("no more connections allowed (max_client_conn)");
+        }
+        true
     }
 }
@@ -110,3 +122,55 @@ pub(crate) fn retry_after(num_retries: u32, config: RetryConfig) -> time::Duration {
         .base_delay
         .mul_f64(config.backoff_factor.powi((num_retries as i32) - 1))
 }
+
+#[cfg(test)]
+mod tests {
+    use super::ShouldRetryWakeCompute;
+    use postgres_client::error::{DbError, SqlState};
+
+    #[test]
+    fn should_retry_wake_compute_for_db_error() {
+        // These SQLStates should NOT trigger a wake_compute retry.
+        let non_retry_states = [
+            SqlState::TOO_MANY_CONNECTIONS,
+            SqlState::OUT_OF_MEMORY,
+            SqlState::SYNTAX_ERROR,
+            SqlState::T_R_SERIALIZATION_FAILURE,
+            SqlState::INVALID_CATALOG_NAME,
+            SqlState::INVALID_SCHEMA_NAME,
+            SqlState::INVALID_PARAMETER_VALUE,
+        ];
+        for state in non_retry_states {
+            let err = DbError::new_test_error(state.clone(), "oops".to_string());
+            assert!(
+                !err.should_retry_wake_compute(),
+                "State {state:?} unexpectedly retried"
+            );
+        }
+
+        // Errors coming from pgbouncer should not trigger a wake_compute retry
+        let non_retry_pgbouncer_errors = ["no more connections allowed (max_client_conn)"];
+        for error in non_retry_pgbouncer_errors {
+            let err = DbError::new_test_error(SqlState::PROTOCOL_VIOLATION, error.to_string());
+            assert!(
+                !err.should_retry_wake_compute(),
+                "PGBouncer error {error:?} unexpectedly retried"
+            );
+        }
+
+        // These SQLStates should trigger a wake_compute retry.
+        let retry_states = [
+            SqlState::CONNECTION_FAILURE,
+            SqlState::CONNECTION_EXCEPTION,
+            SqlState::CONNECTION_DOES_NOT_EXIST,
+            SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION,
+        ];
+        for state in retry_states {
+            let err = DbError::new_test_error(state.clone(), "oops".to_string());
+            assert!(
+                err.should_retry_wake_compute(),
+                "State {state:?} unexpectedly skipped retry"
+            );
+        }
+    }
+}


@@ -15,6 +15,7 @@ use rstest::rstest;
 use rustls::crypto::ring;
 use rustls::pki_types;
 use tokio::io::DuplexStream;
+use tracing_test::traced_test;

 use super::connect_compute::ConnectMechanism;
 use super::retry::CouldRetry;
@@ -381,8 +382,14 @@ enum ConnectAction {
     WakeFail,
     WakeRetry,
     Connect,
+    // connect_once -> Err, could_retry = true, should_retry_wake_compute = true
     Retry,
+    // connect_once -> Err, could_retry = true, should_retry_wake_compute = false
+    RetryNoWake,
+    // connect_once -> Err, could_retry = false, should_retry_wake_compute = true
     Fail,
+    // connect_once -> Err, could_retry = false, should_retry_wake_compute = false
+    FailNoWake,
 }

 #[derive(Clone)]
@@ -424,6 +431,7 @@ struct TestConnection;
 #[derive(Debug)]
 struct TestConnectError {
     retryable: bool,
+    wakeable: bool,
     kind: crate::error::ErrorKind,
 }
@@ -448,7 +456,7 @@ impl CouldRetry for TestConnectError {
 }

 impl ShouldRetryWakeCompute for TestConnectError {
     fn should_retry_wake_compute(&self) -> bool {
-        true
+        self.wakeable
     }
 }
@@ -471,10 +479,22 @@ impl ConnectMechanism for TestConnectMechanism {
             ConnectAction::Connect => Ok(TestConnection),
             ConnectAction::Retry => Err(TestConnectError {
                 retryable: true,
+                wakeable: true,
                 kind: ErrorKind::Compute,
             }),
+            ConnectAction::RetryNoWake => Err(TestConnectError {
+                retryable: true,
+                wakeable: false,
+                kind: ErrorKind::Compute,
+            }),
             ConnectAction::Fail => Err(TestConnectError {
                 retryable: false,
+                wakeable: true,
                 kind: ErrorKind::Compute,
             }),
+            ConnectAction::FailNoWake => Err(TestConnectError {
+                retryable: false,
+                wakeable: false,
+                kind: ErrorKind::Compute,
+            }),
             x => panic!("expecting action {x:?}, connect is called instead"),
@@ -709,3 +729,92 @@ async fn wake_non_retry() {
         .unwrap_err();
     mechanism.verify();
 }
+
+#[tokio::test]
+#[traced_test]
+async fn fail_but_wake_invalidates_cache() {
+    let ctx = RequestContext::test();
+    let mech = TestConnectMechanism::new(vec![
+        ConnectAction::Wake,
+        ConnectAction::Fail,
+        ConnectAction::Wake,
+        ConnectAction::Connect,
+    ]);
+    let user = helper_create_connect_info(&mech);
+    let cfg = config();
+
+    connect_to_compute(&ctx, &mech, &user, cfg.retry, &cfg)
+        .await
+        .unwrap();
+
+    assert!(logs_contain(
+        "invalidating stalled compute node info cache entry"
+    ));
+}
+
+#[tokio::test]
+#[traced_test]
+async fn fail_no_wake_skips_cache_invalidation() {
+    let ctx = RequestContext::test();
+    let mech = TestConnectMechanism::new(vec![
+        ConnectAction::Wake,
+        ConnectAction::FailNoWake,
+        ConnectAction::Connect,
+    ]);
+    let user = helper_create_connect_info(&mech);
+    let cfg = config();
+
+    connect_to_compute(&ctx, &mech, &user, cfg.retry, &cfg)
+        .await
+        .unwrap();
+
+    assert!(!logs_contain(
+        "invalidating stalled compute node info cache entry"
+    ));
+}
+
+#[tokio::test]
+#[traced_test]
+async fn retry_but_wake_invalidates_cache() {
+    let _ = env_logger::try_init();
+    use ConnectAction::*;
+
+    let ctx = RequestContext::test();
+    // Wake → Retry (retryable + wakeable) → Wake → Connect
+    let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]);
+    let user_info = helper_create_connect_info(&mechanism);
+    let cfg = config();
+
+    connect_to_compute(&ctx, &mechanism, &user_info, cfg.retry, &cfg)
+        .await
+        .unwrap();
+    mechanism.verify();
+
+    // Because Retry has wakeable=true, we should see invalidate_cache
+    assert!(logs_contain(
+        "invalidating stalled compute node info cache entry"
+    ));
+}
+
+#[tokio::test]
+#[traced_test]
+async fn retry_no_wake_skips_invalidation() {
+    let _ = env_logger::try_init();
+    use ConnectAction::*;
+
+    let ctx = RequestContext::test();
+    // Wake → RetryNoWake (retryable + NOT wakeable)
+    let mechanism = TestConnectMechanism::new(vec![Wake, RetryNoWake]);
+    let user_info = helper_create_connect_info(&mechanism);
+    let cfg = config();
+
+    connect_to_compute(&ctx, &mechanism, &user_info, cfg.retry, &cfg)
+        .await
+        .unwrap_err();
+    mechanism.verify();
+
+    // Because RetryNoWake has wakeable=false, we must NOT see invalidate_cache
+    assert!(!logs_contain(
+        "invalidating stalled compute node info cache entry"
+    ));
+}