diff --git a/config/config.md b/config/config.md
index 5369f14cd9..5600b56c60 100644
--- a/config/config.md
+++ b/config/config.md
@@ -325,6 +325,7 @@
| `selector` | String | `round_robin` | Datanode selector type. - `round_robin` (default value) - `lease_based` - `load_based` For details, please see "https://docs.greptime.com/developer-guide/metasrv/selector". |
| `use_memory_store` | Bool | `false` | Store data in memory. |
| `enable_region_failover` | Bool | `false` | Whether to enable region failover. This feature is only available on GreptimeDB running on cluster mode and - Using Remote WAL - Using shared storage (e.g., s3). |
+| `region_failure_detector_initialization_delay` | String | `10m` | Delay before initializing region failure detectors. This delay helps prevent premature initialization of region failure detectors in cases where cluster maintenance mode is enabled right after metasrv starts, especially when the cluster is not deployed via the recommended GreptimeDB Operator. Without this delay, early detector registration may trigger unnecessary region failovers during datanode startup. |
| `allow_region_failover_on_local_wal` | Bool | `false` | Whether to allow region failover on local WAL. **This option is not recommended to be set to true, because it may lead to data loss during failover.** |
| `node_max_idle_time` | String | `24hours` | Max allowed idle time before removing node info from metasrv memory. |
| `enable_telemetry` | Bool | `true` | Whether to enable greptimedb telemetry. Enabled by default. |
diff --git a/config/metasrv.example.toml b/config/metasrv.example.toml
index d62ed6c115..30957fb914 100644
--- a/config/metasrv.example.toml
+++ b/config/metasrv.example.toml
@@ -43,6 +43,13 @@ use_memory_store = false
## - Using shared storage (e.g., s3).
enable_region_failover = false
+## Delay before initializing region failure detectors.
+## This delay helps prevent premature initialization of region failure detectors in cases where
+## cluster maintenance mode is enabled right after metasrv starts, especially when the cluster
+## is not deployed via the recommended GreptimeDB Operator. Without this delay, early detector registration
+## may trigger unnecessary region failovers during datanode startup.
+region_failure_detector_initialization_delay = '10m'
+
## Whether to allow region failover on local WAL.
## **This option is not recommended to be set to true, because it may lead to data loss during failover.**
allow_region_failover_on_local_wal = false
diff --git a/src/common/meta/src/key/table_route.rs b/src/common/meta/src/key/table_route.rs
index 94d2a0bf07..dbf87adf2f 100644
--- a/src/common/meta/src/key/table_route.rs
+++ b/src/common/meta/src/key/table_route.rs
@@ -48,6 +48,11 @@ impl TableRouteKey {
pub fn new(table_id: TableId) -> Self {
Self { table_id }
}
+
+ /// Returns the byte prefix (`TABLE_ROUTE_PREFIX` followed by `/`) for ranging over table route keys.
+ pub fn range_prefix() -> Vec<u8> {
+ format!("{}/", TABLE_ROUTE_PREFIX).into_bytes()
+ }
}
#[derive(Debug, PartialEq, Serialize, Deserialize, Clone)]
diff --git a/src/meta-srv/src/error.rs b/src/meta-srv/src/error.rs
index b1ea799e5f..7abf5193f6 100644
--- a/src/meta-srv/src/error.rs
+++ b/src/meta-srv/src/error.rs
@@ -54,14 +54,6 @@ pub enum Error {
peer_id: u64,
},
- #[snafu(display("Failed to lookup peer: {}", peer_id))]
- LookupPeer {
- #[snafu(implicit)]
- location: Location,
- source: common_meta::error::Error,
- peer_id: u64,
- },
-
#[snafu(display("Another migration procedure is running for region: {}", region_id))]
MigrationRunning {
#[snafu(implicit)]
@@ -1033,7 +1025,6 @@ impl ErrorExt for Error {
}
Error::Other { source, .. } => source.status_code(),
- Error::LookupPeer { source, .. } => source.status_code(),
Error::NoEnoughAvailableNode { .. } => StatusCode::RuntimeResourcesExhausted,
#[cfg(feature = "pg_kvbackend")]
diff --git a/src/meta-srv/src/metasrv.rs b/src/meta-srv/src/metasrv.rs
index 50797d44c4..91d0b22caf 100644
--- a/src/meta-srv/src/metasrv.rs
+++ b/src/meta-srv/src/metasrv.rs
@@ -110,6 +110,14 @@ pub struct MetasrvOptions {
pub use_memory_store: bool,
/// Whether to enable region failover.
pub enable_region_failover: bool,
+ /// Delay before initializing region failure detectors.
+ ///
+ /// This delay helps prevent premature initialization of region failure detectors in cases where
+ /// cluster maintenance mode is enabled right after metasrv starts, especially when the cluster
+ /// is not deployed via the recommended GreptimeDB Operator. Without this delay, early detector registration
+ /// may trigger unnecessary region failovers during datanode startup.
+ #[serde(with = "humantime_serde")]
+ pub region_failure_detector_initialization_delay: Duration,
/// Whether to allow region failover on local WAL.
///
/// If it's true, the region failover will be allowed even if the local WAL is used.
@@ -219,6 +227,7 @@ impl Default for MetasrvOptions {
selector: SelectorType::default(),
use_memory_store: false,
enable_region_failover: false,
+ region_failure_detector_initialization_delay: Duration::from_secs(10 * 60),
allow_region_failover_on_local_wal: false,
grpc: GrpcOptions {
bind_addr: format!("127.0.0.1:{}", DEFAULT_METASRV_ADDR_PORT),
diff --git a/src/meta-srv/src/metasrv/builder.rs b/src/meta-srv/src/metasrv/builder.rs
index 167c5afd8e..85e50c8669 100644
--- a/src/meta-srv/src/metasrv/builder.rs
+++ b/src/meta-srv/src/metasrv/builder.rs
@@ -64,7 +64,7 @@ use crate::procedure::wal_prune::manager::{WalPruneManager, WalPruneTicker};
use crate::procedure::wal_prune::Context as WalPruneContext;
use crate::region::supervisor::{
HeartbeatAcceptor, RegionFailureDetectorControl, RegionSupervisor, RegionSupervisorSelector,
- RegionSupervisorTicker, DEFAULT_TICK_INTERVAL,
+ RegionSupervisorTicker, DEFAULT_INITIALIZATION_RETRY_PERIOD, DEFAULT_TICK_INTERVAL,
};
use crate::selector::lease_based::LeaseBasedSelector;
use crate::selector::round_robin::RoundRobinSelector;
@@ -299,6 +299,8 @@ impl MetasrvBuilder {
Arc::new(RegionFailureDetectorControl::new(tx.clone())) as _,
Some(Arc::new(RegionSupervisorTicker::new(
DEFAULT_TICK_INTERVAL,
+ options.region_failure_detector_initialization_delay,
+ DEFAULT_INITIALIZATION_RETRY_PERIOD,
tx.clone(),
))),
)
@@ -341,6 +343,7 @@ impl MetasrvBuilder {
region_migration_manager.clone(),
maintenance_mode_manager.clone(),
peer_lookup_service.clone(),
+ leader_cached_kv_backend.clone(),
);
Some(RegionFailureHandler::new(
diff --git a/src/meta-srv/src/procedure/region_migration/manager.rs b/src/meta-srv/src/procedure/region_migration/manager.rs
index b277bd3e23..adf6c0732b 100644
--- a/src/meta-srv/src/procedure/region_migration/manager.rs
+++ b/src/meta-srv/src/procedure/region_migration/manager.rs
@@ -23,7 +23,7 @@ use common_meta::key::table_route::TableRouteValue;
use common_meta::peer::Peer;
use common_meta::rpc::router::RegionRoute;
use common_procedure::{watcher, ProcedureId, ProcedureManagerRef, ProcedureWithId};
-use common_telemetry::{error, info};
+use common_telemetry::{error, info, warn};
use snafu::{ensure, OptionExt, ResultExt};
use store_api::storage::RegionId;
use table::table_name::TableName;
@@ -253,10 +253,12 @@ impl RegionMigrationManager {
}
/// Throws an error if `leader_peer` is not the `from_peer`.
+ ///
+ /// If `from_peer` is unknown, use the leader peer as the `from_peer`.
fn verify_region_leader_peer(
&self,
region_route: &RegionRoute,
- task: &RegionMigrationProcedureTask,
+ task: &mut RegionMigrationProcedureTask,
) -> Result<()> {
let leader_peer = region_route
.leader_peer
@@ -275,6 +277,15 @@ impl RegionMigrationManager {
}
);
+ if task.from_peer.addr.is_empty() {
+ warn!(
+ "The `from_peer` is unknown, use the leader peer({}) as the `from_peer`, region: {}",
+ leader_peer, task.region_id
+ );
+ // The peer id is the same as the leader peer id.
+ task.from_peer = leader_peer.clone();
+ }
+
Ok(())
}
@@ -300,7 +311,7 @@ impl RegionMigrationManager {
/// Submits a new region migration procedure.
pub async fn submit_procedure(
&self,
- task: RegionMigrationProcedureTask,
+ mut task: RegionMigrationProcedureTask,
) -> Result