proxy: merge connect compute (#4713)

## Problem Half of #4699. TCP/WS have one implementation of `connect_to_compute`, HTTP has another implementation of `connect_to_compute`. Having both is annoying to deal with. ## Summary of changes Creates a set of traits `ConnectMechanism` and `ShouldError` that allows the `connect_to_compute` to be generic over raw TCP stream or tokio_postgres based connections. I'm not super happy with this. I think it would be nice to remove tokio_postgres entirely but that will need a lot more thought to be put into it. I have also slightly refactored the caching to use fewer references. Instead using ownership to ensure the state of retrying is encoded in the type system.
2026-06-02 13:00:37 +00:00 · 2023-07-17 15:53:01 +01:00
parent 1066bca5e3
commit 7c85c7ea91
7 changed files with 215 additions and 176 deletions
--- a/proxy/src/http/conn_pool.rs
+++ b/proxy/src/http/conn_pool.rs
@@ -1,19 +1,17 @@
+use anyhow::Context;
+use async_trait::async_trait;
 use parking_lot::Mutex;
 use pq_proto::StartupMessageParams;
 use std::fmt;
-use std::ops::ControlFlow;
 use std::{collections::HashMap, sync::Arc};
 use tokio::time;

-use crate::config;
 use crate::{auth, console};
+use crate::{compute, config};

 use super::sql_over_http::MAX_RESPONSE_SIZE;

-use crate::proxy::{
-    can_retry_tokio_postgres_error, invalidate_cache, retry_after, try_wake,
-    NUM_RETRIES_WAKE_COMPUTE,
-};
+use crate::proxy::ConnectMechanism;

 use tracing::error;
 use tracing::info;
@@ -187,6 +185,27 @@ impl GlobalConnPool {
    }
 }

+struct TokioMechanism<'a> {
+    conn_info: &'a ConnInfo,
+}
+
+#[async_trait]
+impl ConnectMechanism for TokioMechanism<'_> {
+    type Connection = tokio_postgres::Client;
+    type ConnectError = tokio_postgres::Error;
+    type Error = anyhow::Error;
+
+    async fn connect_once(
+        &self,
+        node_info: &console::CachedNodeInfo,
+        timeout: time::Duration,
+    ) -> Result<Self::Connection, Self::ConnectError> {
+        connect_to_compute_once(node_info, self.conn_info, timeout).await
+    }
+
+    fn update_connect_config(&self, _config: &mut compute::ConnCfg) {}
+}
+
 // Wake up the destination if needed. Code here is a bit involved because
 // we reuse the code from the usual proxy and we need to prepare few structures
 // that this code expects.
@@ -220,72 +239,18 @@ async fn connect_to_compute(
        application_name: Some(APP_NAME),
    };

-    let node_info = &mut creds.wake_compute(&extra).await?.expect("msg");
+    let node_info = creds
+        .wake_compute(&extra)
+        .await?
+        .context("missing cache entry from wake_compute")?;

-    let mut num_retries = 0;
-    let mut wait_duration = time::Duration::ZERO;
-    let mut should_wake_with_error = None;
-    loop {
-        if !wait_duration.is_zero() {
-            time::sleep(wait_duration).await;
-        }
-
-        // try wake the compute node if we have determined it's sensible to do so
-        if let Some(err) = should_wake_with_error.take() {
-            match try_wake(node_info, &extra, &creds).await {
-                // we can't wake up the compute node
-                Ok(None) => return Err(err),
-                // there was an error communicating with the control plane
-                Err(e) => return Err(e.into()),
-                // failed to wake up but we can continue to retry
-                Ok(Some(ControlFlow::Continue(()))) => {
-                    wait_duration = retry_after(num_retries);
-                    should_wake_with_error = Some(err);
-
-                    num_retries += 1;
-                    info!(num_retries, "retrying wake compute");
-                    continue;
-                }
-                // successfully woke up a compute node and can break the wakeup loop
-                Ok(Some(ControlFlow::Break(()))) => {}
-            }
-        }
-
-        match connect_to_compute_once(node_info, conn_info).await {
-            Ok(res) => return Ok(res),
-            Err(e) => {
-                error!(error = ?e, "could not connect to compute node");
-                if !can_retry_error(&e, num_retries) {
-                    return Err(e.into());
-                }
-                wait_duration = retry_after(num_retries);
-
-                // after the first connect failure,
-                // we should invalidate the cache and wake up a new compute node
-                if num_retries == 0 {
-                    invalidate_cache(node_info);
-                    should_wake_with_error = Some(e.into());
-                }
-            }
-        }
-
-        num_retries += 1;
-        info!(num_retries, "retrying connect");
-    }
-}
-
-fn can_retry_error(err: &tokio_postgres::Error, num_retries: u32) -> bool {
-    match err {
-        // retry all errors at least once
-        _ if num_retries == 0 => true,
-        _ if num_retries >= NUM_RETRIES_WAKE_COMPUTE => false,
-        err => can_retry_tokio_postgres_error(err),
-    }
+    crate::proxy::connect_to_compute(&TokioMechanism { conn_info }, node_info, &extra, &creds).await
 }

 async fn connect_to_compute_once(
    node_info: &console::CachedNodeInfo,
    conn_info: &ConnInfo,
+    timeout: time::Duration,
 ) -> Result<tokio_postgres::Client, tokio_postgres::Error> {
    let mut config = (*node_info.config).clone();

@@ -294,6 +259,7 @@ async fn connect_to_compute_once(
        .password(&conn_info.password)
        .dbname(&conn_info.dbname)
        .max_backend_message_size(MAX_RESPONSE_SIZE)
+        .connect_timeout(timeout)
        .connect(tokio_postgres::NoTls)
        .await?;