Mirror of https://github.com/GreptimeTeam/greptimedb.git (synced 2025-12-22 22:20:02 +00:00)
fix: reset cached channel on errors with VIP (#7335)
Signed-off-by: jeremyhi <fengjiachun@gmail.com>
@@ -24,7 +24,7 @@ use common_meta::distributed_time_constants::META_KEEP_ALIVE_INTERVAL_SECS;
 use common_telemetry::tracing_context::TracingContext;
 use common_telemetry::warn;
 use rand::seq::SliceRandom;
-use snafu::{OptionExt, ResultExt};
+use snafu::ResultExt;
 use tokio::time::timeout;
 use tonic::transport::Channel;
 
@@ -101,12 +101,14 @@ impl AskLeader {
         };
 
         let (tx, mut rx) = tokio::sync::mpsc::channel(peers.len());
+        let channel_manager = self.channel_manager.clone();
 
         for addr in &peers {
            let mut client = self.create_asker(addr)?;
            let tx_clone = tx.clone();
            let req = req.clone();
            let addr = addr.clone();
+           let channel_manager = channel_manager.clone();
            tokio::spawn(async move {
                match client.ask_leader(req).await {
                    Ok(res) => {
@@ -117,13 +119,19 @@ impl AskLeader {
                        };
                    }
                    Err(status) => {
+                       // Reset cached channel even on generic errors: the VIP may keep us on a dead
+                       // backend, so forcing a reconnect gives us a chance to hit a healthy peer.
+                       Self::reset_channels_with_manager(
+                           &channel_manager,
+                           std::slice::from_ref(&addr),
+                       );
                        warn!("Failed to ask leader from: {addr}, {status}");
                    }
                }
            });
        }
 
-       let leader = timeout(
+       let leader = match timeout(
            self.channel_manager
                .config()
                .timeout
@@ -131,8 +139,16 @@ impl AskLeader {
            rx.recv(),
        )
        .await
-       .context(error::AskLeaderTimeoutSnafu)?
-       .context(error::NoLeaderSnafu)?;
+       {
+           Ok(Some(leader)) => leader,
+           Ok(None) => return error::NoLeaderSnafu.fail(),
+           Err(e) => {
+               // All peers timed out. Reset channels to force reconnection,
+               // which may help escape dead backends in VIP/LB scenarios.
+               Self::reset_channels_with_manager(&self.channel_manager, &peers);
+               return Err(e).context(error::AskLeaderTimeoutSnafu);
+           }
+       };
 
        let mut leadership_group = self.leadership_group.write().unwrap();
        leadership_group.leader = Some(leader.clone());
@@ -169,6 +185,15 @@ impl AskLeader {
                .context(error::CreateChannelSnafu)?,
        ))
    }
+
+   /// Drop cached channels for the given peers so a fresh connection is used next time.
+   fn reset_channels_with_manager(channel_manager: &ChannelManager, peers: &[String]) {
+       if peers.is_empty() {
+           return;
+       }
+
+       channel_manager.retain_channel(|addr, _| !peers.iter().any(|peer| peer == addr));
+   }
 }
 
 #[async_trait]
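The idea behind the patch: the meta client caches one gRPC channel per configured address. When that address is a VIP or load balancer fronting several metasrv instances, a cached channel can stay pinned to a backend that has since died, so retries keep failing. The patch therefore drops the cached channel for a peer whenever asking it for the leader fails, and drops the channels for all peers when every attempt times out, forcing the next request to re-dial and possibly land on a healthy backend. Below is a minimal, self-contained sketch of that eviction pattern; the `ChannelCache` type and its methods are hypothetical illustrations, not GreptimeDB's actual `ChannelManager` API (the real code uses `retain_channel`, as shown in the diff above).

use std::collections::HashMap;
use std::sync::{Arc, Mutex};

/// Hypothetical stand-in for a gRPC channel; only the target address matters here.
#[derive(Clone, Debug)]
struct Channel {
    target: String,
}

/// Hypothetical connection cache keyed by peer address.
#[derive(Clone, Default)]
struct ChannelCache {
    inner: Arc<Mutex<HashMap<String, Channel>>>,
}

impl ChannelCache {
    /// Return the cached channel for `addr`, "dialing" a new one if absent.
    fn get_or_dial(&self, addr: &str) -> Channel {
        let mut map = self.inner.lock().unwrap();
        map.entry(addr.to_string())
            .or_insert_with(|| Channel {
                target: addr.to_string(),
            })
            .clone()
    }

    /// Drop the cached channels for `peers` so the next call re-dials,
    /// mirroring the role of `reset_channels_with_manager` in the patch.
    fn reset(&self, peers: &[String]) {
        if peers.is_empty() {
            return;
        }
        let mut map = self.inner.lock().unwrap();
        map.retain(|addr, _| !peers.iter().any(|peer| peer == addr));
    }
}

fn main() {
    let cache = ChannelCache::default();
    let addr = "10.0.0.1:3002".to_string(); // example VIP address, for illustration only

    // First request: a channel is dialed and cached.
    let first = cache.get_or_dial(&addr);

    // Pretend the RPC over `first` failed. Behind a VIP the cached channel may
    // be pinned to a dead backend, so evict it instead of reusing it.
    drop(first);
    cache.reset(std::slice::from_ref(&addr));

    // The next request re-dials and, behind a VIP, may reach a healthy backend.
    let second = cache.get_or_dial(&addr);
    println!("re-dialed {}", second.target);
}

The design choice mirrored here is that eviction is keyed by address and done eagerly on error, rather than waiting for the transport layer to notice the broken connection on its own.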