From e30753fc31e166af2afbfc585a26fbd284bb8aae Mon Sep 17 00:00:00 2001 From: Weny Xu Date: Thu, 24 Apr 2025 16:11:45 +0800 Subject: [PATCH] feat: allow forced region failover for local WAL (#5972) * feat: allow forced region failover for local WAL * chore: upgrade config.md * chore: apply suggestions from CR --- config/config.md | 1 + config/metasrv.example.toml | 4 ++++ src/meta-srv/src/metasrv.rs | 6 ++++++ src/meta-srv/src/metasrv/builder.rs | 20 ++++++++++++++------ 4 files changed, 25 insertions(+), 6 deletions(-) diff --git a/config/config.md b/config/config.md index f34a41d861..f3230190c9 100644 --- a/config/config.md +++ b/config/config.md @@ -319,6 +319,7 @@ | `selector` | String | `round_robin` | Datanode selector type.
- `round_robin` (default value)
- `lease_based`
- `load_based`
For details, please see "https://docs.greptime.com/developer-guide/metasrv/selector". | | `use_memory_store` | Bool | `false` | Store data in memory. | | `enable_region_failover` | Bool | `false` | Whether to enable region failover.
This feature is only available on GreptimeDB running on cluster mode and
- Using Remote WAL
- Using shared storage (e.g., s3). | +| `allow_region_failover_on_local_wal` | Bool | `false` | Whether to allow region failover on local WAL.
**Setting this option to true is not recommended, because it may lead to data loss during failover.** | | `node_max_idle_time` | String | `24hours` | Max allowed idle time before removing node info from metasrv memory. | | `enable_telemetry` | Bool | `true` | Whether to enable greptimedb telemetry. Enabled by default. | | `runtime` | -- | -- | The runtime options. | diff --git a/config/metasrv.example.toml b/config/metasrv.example.toml index 89c92352b2..0e7f9b74f0 100644 --- a/config/metasrv.example.toml +++ b/config/metasrv.example.toml @@ -50,6 +50,10 @@ use_memory_store = false ## - Using shared storage (e.g., s3). enable_region_failover = false +## Whether to allow region failover on local WAL. +## **Setting this option to true is not recommended, because it may lead to data loss during failover.** +allow_region_failover_on_local_wal = false + ## Max allowed idle time before removing node info from metasrv memory. node_max_idle_time = "24hours" diff --git a/src/meta-srv/src/metasrv.rs b/src/meta-srv/src/metasrv.rs index 34b3cac25e..6c9111dd9c 100644 --- a/src/meta-srv/src/metasrv.rs +++ b/src/meta-srv/src/metasrv.rs @@ -111,6 +111,11 @@ pub struct MetasrvOptions { pub use_memory_store: bool, /// Whether to enable region failover. pub enable_region_failover: bool, + /// Whether to allow region failover on local WAL. + /// + /// If true, region failover is allowed even when the local WAL is used. + /// Note that setting this option to true is not recommended, because it may lead to data loss during failover. + pub allow_region_failover_on_local_wal: bool, /// The HTTP server options. pub http: HttpOptions, /// The logging options. 
@@ -173,6 +178,7 @@ impl Default for MetasrvOptions { selector: SelectorType::default(), use_memory_store: false, enable_region_failover: false, + allow_region_failover_on_local_wal: false, http: HttpOptions::default(), logging: LoggingOptions { dir: format!("{METASRV_HOME}/logs"), diff --git a/src/meta-srv/src/metasrv/builder.rs b/src/meta-srv/src/metasrv/builder.rs index 02f835e226..0c93e4e4c7 100644 --- a/src/meta-srv/src/metasrv/builder.rs +++ b/src/meta-srv/src/metasrv/builder.rs @@ -40,7 +40,8 @@ use common_meta::state_store::KvStateStore; use common_meta::wal_options_allocator::{build_kafka_client, build_wal_options_allocator}; use common_procedure::local::{LocalManager, ManagerConfig}; use common_procedure::ProcedureManagerRef; -use snafu::ResultExt; +use common_telemetry::warn; +use snafu::{ensure, ResultExt}; use crate::cache_invalidator::MetasrvCacheInvalidator; use crate::cluster::{MetaPeerClientBuilder, MetaPeerClientRef}; @@ -276,18 +277,25 @@ impl MetasrvBuilder { }, )); let peer_lookup_service = Arc::new(MetaPeerLookupService::new(meta_peer_client.clone())); + if !is_remote_wal && options.enable_region_failover { - return error::UnexpectedSnafu { - violated: "Region failover is not supported in the local WAL implementation!", + ensure!( + options.allow_region_failover_on_local_wal, + error::UnexpectedSnafu { + violated: "Region failover is not supported in the local WAL implementation! + If you want to enable region failover for local WAL, please set `allow_region_failover_on_local_wal` to true.", + } + ); + if options.allow_region_failover_on_local_wal { + warn!("Region failover is force enabled in the local WAL implementation! 
This may lead to data loss during failover!"); } - .fail(); } let (tx, rx) = RegionSupervisor::channel(); let (region_failure_detector_controller, region_supervisor_ticker): ( RegionFailureDetectorControllerRef, Option>, - ) = if options.enable_region_failover && is_remote_wal { + ) = if options.enable_region_failover { ( Arc::new(RegionFailureDetectorControl::new(tx.clone())) as _, Some(Arc::new(RegionSupervisorTicker::new( @@ -313,7 +321,7 @@ impl MetasrvBuilder { )); region_migration_manager.try_start()?; - let region_failover_handler = if options.enable_region_failover && is_remote_wal { + let region_failover_handler = if options.enable_region_failover { let region_supervisor = RegionSupervisor::new( rx, options.failure_detector,