pageserver: reset the broker subscription if it's been idle for a while (#12436)

## Problem

I suspect that the pageservers get stuck on receiving broker updates.

## Summary of changes

This is an opportunistic (staging only) patch that resets the
subscription
stream if it's been idle for a while. This won't go to prod in this
form.
I'll revert or update it before Friday.
This commit is contained in:
Vlad Lazar
2025-07-04 11:25:03 +01:00
committed by GitHub
parent 436a117c15
commit d378726e38
4 changed files with 16 additions and 45 deletions

View File

@@ -100,6 +100,7 @@ pub(super) async fn connection_manager_loop_step(
// with other streams on this client (other connection managers). When
// object goes out of scope, stream finishes in drop() automatically.
let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?;
let mut broker_reset_interval = tokio::time::interval(tokio::time::Duration::from_secs(30));
debug!("Subscribed for broker timeline updates");
loop {
@@ -156,7 +157,10 @@ pub(super) async fn connection_manager_loop_step(
// Got a new update from the broker
broker_update = broker_subscription.message() /* TODO: review cancellation-safety */ => {
match broker_update {
Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update),
Ok(Some(broker_update)) => {
broker_reset_interval.reset();
connection_manager_state.register_timeline_update(broker_update);
},
Err(status) => {
match status.code() {
Code::Unknown if status.message().contains("stream closed because of a broken pipe") || status.message().contains("connection reset") || status.message().contains("error reading a body from connection") => {
@@ -178,6 +182,14 @@ pub(super) async fn connection_manager_loop_step(
}
},
_ = broker_reset_interval.tick() => {
if wait_lsn_status.borrow().is_some() {
tracing::warn!("No broker updates received for a while, but waiting for WAL. Re-setting stream ...")
}
broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?;
},
new_event = async {
// Reminder: this match arm needs to be cancellation-safe.
loop {