feat: allow forced region failover for local WAL (#5972)

* feat: allow forced region failover for local WAL

* chore: upgrade config.md

* chore: apply suggestions from CR
This commit is contained in:
Weny Xu
2025-04-24 16:11:45 +08:00
committed by GitHub
parent b476584f56
commit e30753fc31
4 changed files with 25 additions and 6 deletions

View File

@@ -319,6 +319,7 @@
| `selector` | String | `round_robin` | Datanode selector type.<br/>- `round_robin` (default value)<br/>- `lease_based`<br/>- `load_based`<br/>For details, please see "https://docs.greptime.com/developer-guide/metasrv/selector". | | `selector` | String | `round_robin` | Datanode selector type.<br/>- `round_robin` (default value)<br/>- `lease_based`<br/>- `load_based`<br/>For details, please see "https://docs.greptime.com/developer-guide/metasrv/selector". |
| `use_memory_store` | Bool | `false` | Store data in memory. | | `use_memory_store` | Bool | `false` | Store data in memory. |
| `enable_region_failover` | Bool | `false` | Whether to enable region failover.<br/>This feature is only available on GreptimeDB running in cluster mode and<br/>- Using Remote WAL<br/>- Using shared storage (e.g., s3). | | `enable_region_failover` | Bool | `false` | Whether to enable region failover.<br/>This feature is only available on GreptimeDB running in cluster mode and<br/>- Using Remote WAL<br/>- Using shared storage (e.g., s3). |
| `allow_region_failover_on_local_wal` | Bool | `false` | Whether to allow region failover on local WAL.<br/>**Setting this option to true is not recommended, because it may lead to data loss during failover.** |
| `node_max_idle_time` | String | `24hours` | Max allowed idle time before removing node info from metasrv memory. | | `node_max_idle_time` | String | `24hours` | Max allowed idle time before removing node info from metasrv memory. |
| `enable_telemetry` | Bool | `true` | Whether to enable greptimedb telemetry. Enabled by default. | | `enable_telemetry` | Bool | `true` | Whether to enable greptimedb telemetry. Enabled by default. |
| `runtime` | -- | -- | The runtime options. | | `runtime` | -- | -- | The runtime options. |

View File

@@ -50,6 +50,10 @@ use_memory_store = false
## - Using shared storage (e.g., s3). ## - Using shared storage (e.g., s3).
enable_region_failover = false enable_region_failover = false
## Whether to allow region failover on local WAL.
## **Setting this option to true is not recommended, because it may lead to data loss during failover.**
allow_region_failover_on_local_wal = false
## Max allowed idle time before removing node info from metasrv memory. ## Max allowed idle time before removing node info from metasrv memory.
node_max_idle_time = "24hours" node_max_idle_time = "24hours"

View File

@@ -111,6 +111,11 @@ pub struct MetasrvOptions {
pub use_memory_store: bool, pub use_memory_store: bool,
/// Whether to enable region failover. /// Whether to enable region failover.
pub enable_region_failover: bool, pub enable_region_failover: bool,
/// Whether to allow region failover on local WAL.
///
/// If it's true, the region failover will be allowed even if the local WAL is used.
/// Note that setting this option to true is not recommended, because it may lead to data loss during failover.
pub allow_region_failover_on_local_wal: bool,
/// The HTTP server options. /// The HTTP server options.
pub http: HttpOptions, pub http: HttpOptions,
/// The logging options. /// The logging options.
@@ -173,6 +178,7 @@ impl Default for MetasrvOptions {
selector: SelectorType::default(), selector: SelectorType::default(),
use_memory_store: false, use_memory_store: false,
enable_region_failover: false, enable_region_failover: false,
allow_region_failover_on_local_wal: false,
http: HttpOptions::default(), http: HttpOptions::default(),
logging: LoggingOptions { logging: LoggingOptions {
dir: format!("{METASRV_HOME}/logs"), dir: format!("{METASRV_HOME}/logs"),

View File

@@ -40,7 +40,8 @@ use common_meta::state_store::KvStateStore;
use common_meta::wal_options_allocator::{build_kafka_client, build_wal_options_allocator}; use common_meta::wal_options_allocator::{build_kafka_client, build_wal_options_allocator};
use common_procedure::local::{LocalManager, ManagerConfig}; use common_procedure::local::{LocalManager, ManagerConfig};
use common_procedure::ProcedureManagerRef; use common_procedure::ProcedureManagerRef;
use snafu::ResultExt; use common_telemetry::warn;
use snafu::{ensure, ResultExt};
use crate::cache_invalidator::MetasrvCacheInvalidator; use crate::cache_invalidator::MetasrvCacheInvalidator;
use crate::cluster::{MetaPeerClientBuilder, MetaPeerClientRef}; use crate::cluster::{MetaPeerClientBuilder, MetaPeerClientRef};
@@ -276,18 +277,25 @@ impl MetasrvBuilder {
}, },
)); ));
let peer_lookup_service = Arc::new(MetaPeerLookupService::new(meta_peer_client.clone())); let peer_lookup_service = Arc::new(MetaPeerLookupService::new(meta_peer_client.clone()));
if !is_remote_wal && options.enable_region_failover { if !is_remote_wal && options.enable_region_failover {
return error::UnexpectedSnafu { ensure!(
violated: "Region failover is not supported in the local WAL implementation!", options.allow_region_failover_on_local_wal,
error::UnexpectedSnafu {
violated: "Region failover is not supported in the local WAL implementation!
If you want to enable region failover for local WAL, please set `allow_region_failover_on_local_wal` to true.",
}
);
if options.allow_region_failover_on_local_wal {
warn!("Region failover is force enabled in the local WAL implementation! This may lead to data loss during failover!");
} }
.fail();
} }
let (tx, rx) = RegionSupervisor::channel(); let (tx, rx) = RegionSupervisor::channel();
let (region_failure_detector_controller, region_supervisor_ticker): ( let (region_failure_detector_controller, region_supervisor_ticker): (
RegionFailureDetectorControllerRef, RegionFailureDetectorControllerRef,
Option<std::sync::Arc<RegionSupervisorTicker>>, Option<std::sync::Arc<RegionSupervisorTicker>>,
) = if options.enable_region_failover && is_remote_wal { ) = if options.enable_region_failover {
( (
Arc::new(RegionFailureDetectorControl::new(tx.clone())) as _, Arc::new(RegionFailureDetectorControl::new(tx.clone())) as _,
Some(Arc::new(RegionSupervisorTicker::new( Some(Arc::new(RegionSupervisorTicker::new(
@@ -313,7 +321,7 @@ impl MetasrvBuilder {
)); ));
region_migration_manager.try_start()?; region_migration_manager.try_start()?;
let region_failover_handler = if options.enable_region_failover && is_remote_wal { let region_failover_handler = if options.enable_region_failover {
let region_supervisor = RegionSupervisor::new( let region_supervisor = RegionSupervisor::new(
rx, rx,
options.failure_detector, options.failure_detector,