fix: increase timeout for opening candidate region and log elapsed time (#5627)

This commit is contained in:
Weny Xu
2025-03-03 17:16:45 +08:00
committed by GitHub
parent a693583a97
commit 95d7ca5382
2 changed files with 30 additions and 15 deletions

View File

@@ -160,12 +160,18 @@ impl DowngradeLeaderRegion {
})?;
let ch = Channel::Datanode(leader.id);
let now = Instant::now();
let receiver = ctx.mailbox.send(&ch, msg, operation_timeout).await?;
match receiver.await? {
Ok(msg) => {
let reply = HeartbeatMailbox::json_reply(&msg)?;
info!("Downgrade region reply: {:?}", reply);
info!(
"Received downgrade region reply: {:?}, region: {}, elapsed: {:?}",
reply,
region_id,
now.elapsed()
);
let InstructionReply::DowngradeRegion(DowngradeRegionReply {
last_entry_id,
exists,
@@ -182,8 +188,8 @@ impl DowngradeLeaderRegion {
if error.is_some() {
return error::RetryLaterSnafu {
reason: format!(
"Failed to downgrade the region {} on Datanode {:?}, error: {:?}",
region_id, leader, error
"Failed to downgrade the region {} on Datanode {:?}, error: {:?}, elapsed: {:?}",
region_id, leader, error, now.elapsed()
),
}
.fail();
@@ -191,13 +197,15 @@ impl DowngradeLeaderRegion {
if !exists {
warn!(
"Trying to downgrade the region {} on Datanode {}, but region doesn't exist!",
region_id, leader
"Trying to downgrade the region {} on Datanode {}, but region doesn't exist!, elapsed: {:?}",
region_id, leader, now.elapsed()
);
} else {
info!(
"Region {} leader is downgraded, last_entry_id: {:?}",
region_id, last_entry_id
"Region {} leader is downgraded, last_entry_id: {:?}, elapsed: {:?}",
region_id,
last_entry_id,
now.elapsed()
);
}
@@ -209,8 +217,9 @@ impl DowngradeLeaderRegion {
}
Err(error::Error::MailboxTimeout { .. }) => {
let reason = format!(
"Mailbox received timeout for downgrade leader region {region_id} on datanode {:?}",
"Mailbox received timeout for downgrade leader region {region_id} on datanode {:?}, elapsed: {:?}",
leader,
now.elapsed()
);
error::RetryLaterSnafu { reason }.fail()
}

View File

@@ -13,10 +13,10 @@
// limitations under the License.
use std::any::Any;
use std::time::Duration;
use std::time::{Duration, Instant};
use api::v1::meta::MailboxMessage;
use common_meta::distributed_time_constants::MAILBOX_RTT_SECS;
use common_meta::distributed_time_constants::REGION_LEASE_SECS;
use common_meta::instruction::{Instruction, InstructionReply, OpenRegion, SimpleReply};
use common_meta::key::datanode_table::RegionInfo;
use common_meta::RegionIdent;
@@ -31,7 +31,8 @@ use crate::procedure::region_migration::update_metadata::UpdateMetadata;
use crate::procedure::region_migration::{Context, State};
use crate::service::mailbox::Channel;
const OPEN_CANDIDATE_REGION_TIMEOUT: Duration = Duration::from_secs(MAILBOX_RTT_SECS);
/// Uses the lease time of a region as the timeout for opening a candidate region.
const OPEN_CANDIDATE_REGION_TIMEOUT: Duration = Duration::from_secs(REGION_LEASE_SECS);
#[derive(Debug, Serialize, Deserialize)]
pub struct OpenCandidateRegion;
@@ -137,6 +138,7 @@ impl OpenCandidateRegion {
})?;
let ch = Channel::Datanode(candidate.id);
let now = Instant::now();
let receiver = ctx
.mailbox
.send(&ch, msg, OPEN_CANDIDATE_REGION_TIMEOUT)
@@ -146,8 +148,10 @@ impl OpenCandidateRegion {
Ok(msg) => {
let reply = HeartbeatMailbox::json_reply(&msg)?;
info!(
"Received open region reply: {:?}, region: {}",
reply, region_id
"Received open region reply: {:?}, region: {}, elapsed: {:?}",
reply,
region_id,
now.elapsed()
);
let InstructionReply::OpenRegion(SimpleReply { result, error }) = reply else {
return error::UnexpectedInstructionReplySnafu {
@@ -162,8 +166,9 @@ impl OpenCandidateRegion {
} else {
error::RetryLaterSnafu {
reason: format!(
"Region {region_id} is not opened by datanode {:?}, error: {error:?}",
"Region {region_id} is not opened by datanode {:?}, error: {error:?}, elapsed: {:?}",
candidate,
now.elapsed()
),
}
.fail()
@@ -171,8 +176,9 @@ impl OpenCandidateRegion {
}
Err(error::Error::MailboxTimeout { .. }) => {
let reason = format!(
"Mailbox received timeout for open candidate region {region_id} on datanode {:?}",
"Mailbox received timeout for open candidate region {region_id} on datanode {:?}, elapsed: {:?}",
candidate,
now.elapsed()
);
error::RetryLaterSnafu { reason }.fail()
}