[proxy] fix connect_to_compute retry handling (#12351)

# Problem

In #12335 I moved the `authenticate` method outside of the `connect_to_compute` loop. This caused [e2e tests to become flaky](https://github.com/neondatabase/cloud/pull/30533), which highlighted an edge case we forgot to consider with that change.

When we connect to compute, the compute IP might be cached, and that cache hit might be stale. Because we can't validate that the IP is associated with a specific compute-id☨, the `connect_to_compute` operation succeeds and the failure only surfaces at password authentication☨☨. Before the change, we invalidated the cache and triggered `wake_compute` if the authentication failed.

Additionally, I noticed some faulty logic I introduced 1 year ago: https://github.com/neondatabase/neon/pull/8141/files#diff-5491e3afe62d8c5c77178149c665603b29d88d3ec2e47fc1b3bb119a0a970afaL145-R147

☨ We will be able to once we roll out TLS, as the certificate common name includes the compute-id.

☨☨ Technically password authentication could pass for the wrong compute, but I think this would only happen in the very, very rare event that the IP got reused **and** the compute's endpoint happened to be a branch/replica.

# Solution

1. Fix the broken logic.
2. Simplify cache invalidation (I don't know why it was so convoluted).
3. Add a loop around `connect_to_compute` + `authenticate` to re-introduce the `wake_compute` invalidation we accidentally removed.

I went with this approach to try and avoid interfering with https://github.com/neondatabase/neon/compare/main...cloneable/proxy-pglb-connect-compute-split. I suspect the changes made in commit 3 will move into `handle_client_request`.
2026-05-30 19:40:39 +00:00 · 2025-06-27 11:36:27 +01:00
parent 6fa1562b57
commit abc1efd5a6
7 changed files with 137 additions and 72 deletions
--- a/proxy/src/cache/timed_lru.rs
+++ b/proxy/src/cache/timed_lru.rs
@@ -30,7 +30,7 @@ use super::{Cache, timed_lru};
 ///
 /// * There's an API for immediate invalidation (removal) of a cache entry;
 ///   It's useful in case we know for sure that the entry is no longer correct.
-///   See [`timed_lru::LookupInfo`] & [`timed_lru::Cached`] for more information.
+///   See [`timed_lru::Cached`] for more information.
 ///
 /// * Expired entries are kept in the cache, until they are evicted by the LRU policy,
 ///   or by a successful lookup (i.e. the entry hasn't expired yet).
@@ -54,7 +54,7 @@ pub(crate) struct TimedLru<K, V> {
 impl<K: Hash + Eq, V> Cache for TimedLru<K, V> {
    type Key = K;
    type Value = V;
-    type LookupInfo<Key> = LookupInfo<Key>;
+    type LookupInfo<Key> = Key;

    fn invalidate(&self, info: &Self::LookupInfo<K>) {
        self.invalidate_raw(info);
@@ -87,30 +87,24 @@ impl<K: Hash + Eq, V> TimedLru<K, V> {

    /// Drop an entry from the cache if it's outdated.
    #[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)]
-    fn invalidate_raw(&self, info: &LookupInfo<K>) {
-        let now = Instant::now();
-
+    fn invalidate_raw(&self, key: &K) {
        // Do costly things before taking the lock.
        let mut cache = self.cache.lock();
-        let raw_entry = match cache.raw_entry_mut().from_key(&info.key) {
+        let entry = match cache.raw_entry_mut().from_key(key) {
            RawEntryMut::Vacant(_) => return,
-            RawEntryMut::Occupied(x) => x,
+            RawEntryMut::Occupied(x) => x.remove(),
        };
-
-        // Remove the entry if it was created prior to lookup timestamp.
-        let entry = raw_entry.get();
-        let (created_at, expires_at) = (entry.created_at, entry.expires_at);
-        let should_remove = created_at <= info.created_at || expires_at <= now;
-
-        if should_remove {
-            raw_entry.remove();
-        }
-
        drop(cache); // drop lock before logging
+
+        let Entry {
+            created_at,
+            expires_at,
+            ..
+        } = entry;
+
        debug!(
-            created_at = format_args!("{created_at:?}"),
-            expires_at = format_args!("{expires_at:?}"),
-            entry_removed = should_remove,
+            ?created_at,
+            ?expires_at,
            "processed a cache entry invalidation event"
        );
    }
@@ -211,10 +205,10 @@ impl<K: Hash + Eq + Clone, V: Clone> TimedLru<K, V> {
    }

    pub(crate) fn insert_unit(&self, key: K, value: V) -> (Option<V>, Cached<&Self, ()>) {
-        let (created_at, old) = self.insert_raw(key.clone(), value);
+        let (_, old) = self.insert_raw(key.clone(), value);

        let cached = Cached {
-            token: Some((self, LookupInfo { created_at, key })),
+            token: Some((self, key)),
            value: (),
        };

@@ -229,28 +223,9 @@ impl<K: Hash + Eq, V: Clone> TimedLru<K, V> {
        K: Borrow<Q> + Clone,
        Q: Hash + Eq + ?Sized,
    {
-        self.get_raw(key, |key, entry| {
-            let info = LookupInfo {
-                created_at: entry.created_at,
-                key: key.clone(),
-            };
-
-            Cached {
-                token: Some((self, info)),
-                value: entry.value.clone(),
-            }
+        self.get_raw(key, |key, entry| Cached {
+            token: Some((self, key.clone())),
+            value: entry.value.clone(),
        })
    }
 }
-
-/// Lookup information for key invalidation.
-pub(crate) struct LookupInfo<K> {
-    /// Time of creation of a cache [`Entry`].
-    /// We use this during invalidation lookups to prevent eviction of a newer
-    /// entry sharing the same key (it might've been inserted by a different
-    /// task after we got the entry we're trying to invalidate now).
-    created_at: Instant,
-
-    /// Search by this key.
-    key: K,
-}