From cc99f9d65b214a9e6b27bdd5d54daee337082869 Mon Sep 17 00:00:00 2001 From: Weny Xu Date: Thu, 4 Dec 2025 16:07:45 +0800 Subject: [PATCH] fix: configure HTTP/2 keep-alive for heartbeat client to detect network failures faster (#7344) * fix: configure HTTP/2 keep-alive for heartbeat client to detect network failures faster Signed-off-by: WenyXu * chore: apply suggestions from CR Signed-off-by: WenyXu --------- Signed-off-by: WenyXu --- config/config.md | 3 --- config/datanode.example.toml | 3 --- config/flownode.example.toml | 3 --- config/frontend.example.toml | 3 --- src/cmd/tests/load_config_test.rs | 3 --- src/common/meta/src/distributed_time_constants.rs | 11 +++++++++++ src/meta-client/src/client.rs | 3 +++ src/meta-client/src/lib.rs | 13 +++++++++---- 8 files changed, 23 insertions(+), 19 deletions(-) diff --git a/config/config.md b/config/config.md index 29185c6b58..8b499ca7ee 100644 --- a/config/config.md +++ b/config/config.md @@ -294,7 +294,6 @@ | `meta_client` | -- | -- | The metasrv client options. | | `meta_client.metasrv_addrs` | Array | -- | The addresses of the metasrv. | | `meta_client.timeout` | String | `3s` | Operation timeout. | -| `meta_client.heartbeat_timeout` | String | `500ms` | Heartbeat timeout. | | `meta_client.ddl_timeout` | String | `10s` | DDL timeout. | | `meta_client.connect_timeout` | String | `1s` | Connect server timeout. | | `meta_client.tcp_nodelay` | Bool | `true` | `TCP_NODELAY` option for accepted connections. | @@ -457,7 +456,6 @@ | `meta_client` | -- | -- | The metasrv client options. | | `meta_client.metasrv_addrs` | Array | -- | The addresses of the metasrv. | | `meta_client.timeout` | String | `3s` | Operation timeout. | -| `meta_client.heartbeat_timeout` | String | `500ms` | Heartbeat timeout. | | `meta_client.ddl_timeout` | String | `10s` | DDL timeout. | | `meta_client.connect_timeout` | String | `1s` | Connect server timeout. | | `meta_client.tcp_nodelay` | Bool | `true` | `TCP_NODELAY` option for accepted connections. | @@ -629,7 +627,6 @@ | `meta_client` | -- | -- | The metasrv client options. | | `meta_client.metasrv_addrs` | Array | -- | The addresses of the metasrv. | | `meta_client.timeout` | String | `3s` | Operation timeout. | -| `meta_client.heartbeat_timeout` | String | `500ms` | Heartbeat timeout. | | `meta_client.ddl_timeout` | String | `10s` | DDL timeout. | | `meta_client.connect_timeout` | String | `1s` | Connect server timeout. | | `meta_client.tcp_nodelay` | Bool | `true` | `TCP_NODELAY` option for accepted connections. | diff --git a/config/datanode.example.toml b/config/datanode.example.toml index 8db6bf3d1c..bb769c4625 100644 --- a/config/datanode.example.toml +++ b/config/datanode.example.toml @@ -99,9 +99,6 @@ metasrv_addrs = ["127.0.0.1:3002"] ## Operation timeout. timeout = "3s" -## Heartbeat timeout. -heartbeat_timeout = "500ms" - ## DDL timeout. ddl_timeout = "10s" diff --git a/config/flownode.example.toml b/config/flownode.example.toml index 4e44c1ecbb..b13acfc447 100644 --- a/config/flownode.example.toml +++ b/config/flownode.example.toml @@ -78,9 +78,6 @@ metasrv_addrs = ["127.0.0.1:3002"] ## Operation timeout. timeout = "3s" -## Heartbeat timeout. -heartbeat_timeout = "500ms" - ## DDL timeout. ddl_timeout = "10s" diff --git a/config/frontend.example.toml b/config/frontend.example.toml index ecac6cff01..701cb0b087 100644 --- a/config/frontend.example.toml +++ b/config/frontend.example.toml @@ -226,9 +226,6 @@ metasrv_addrs = ["127.0.0.1:3002"] ## Operation timeout. timeout = "3s" -## Heartbeat timeout. -heartbeat_timeout = "500ms" - ## DDL timeout. ddl_timeout = "10s" diff --git a/src/cmd/tests/load_config_test.rs b/src/cmd/tests/load_config_test.rs index 56a6caa71b..79b42dbfc1 100644 --- a/src/cmd/tests/load_config_test.rs +++ b/src/cmd/tests/load_config_test.rs @@ -52,7 +52,6 @@ fn test_load_datanode_example_config() { meta_client: Some(MetaClientOptions { metasrv_addrs: vec!["127.0.0.1:3002".to_string()], timeout: Duration::from_secs(3), - heartbeat_timeout: Duration::from_millis(500), ddl_timeout: Duration::from_secs(10), connect_timeout: Duration::from_secs(1), tcp_nodelay: true, @@ -118,7 +117,6 @@ fn test_load_frontend_example_config() { meta_client: Some(MetaClientOptions { metasrv_addrs: vec!["127.0.0.1:3002".to_string()], timeout: Duration::from_secs(3), - heartbeat_timeout: Duration::from_millis(500), ddl_timeout: Duration::from_secs(10), connect_timeout: Duration::from_secs(1), tcp_nodelay: true, @@ -241,7 +239,6 @@ fn test_load_flownode_example_config() { meta_client: Some(MetaClientOptions { metasrv_addrs: vec!["127.0.0.1:3002".to_string()], timeout: Duration::from_secs(3), - heartbeat_timeout: Duration::from_millis(500), ddl_timeout: Duration::from_secs(10), connect_timeout: Duration::from_secs(1), tcp_nodelay: true, diff --git a/src/common/meta/src/distributed_time_constants.rs b/src/common/meta/src/distributed_time_constants.rs index d18b377c28..bd523cd901 100644 --- a/src/common/meta/src/distributed_time_constants.rs +++ b/src/common/meta/src/distributed_time_constants.rs @@ -41,6 +41,17 @@ pub const POSTGRES_KEEP_ALIVE_SECS: u64 = 30; /// In a lease, there are two opportunities for renewal. pub const META_KEEP_ALIVE_INTERVAL_SECS: u64 = META_LEASE_SECS / 2; +/// The timeout of the heartbeat request. +pub const HEARTBEAT_TIMEOUT: Duration = Duration::from_secs(META_KEEP_ALIVE_INTERVAL_SECS + 1); + +/// The keep-alive interval of the heartbeat channel. +pub const HEARTBEAT_CHANNEL_KEEP_ALIVE_INTERVAL_SECS: Duration = + Duration::from_secs(META_KEEP_ALIVE_INTERVAL_SECS + 1); + +/// The keep-alive timeout of the heartbeat channel. +pub const HEARTBEAT_CHANNEL_KEEP_ALIVE_TIMEOUT_SECS: Duration = + Duration::from_secs(META_KEEP_ALIVE_INTERVAL_SECS + 1); + /// The default mailbox round-trip timeout. pub const MAILBOX_RTT_SECS: u64 = 1; diff --git a/src/meta-client/src/client.rs b/src/meta-client/src/client.rs index d819251597..fff34d6d26 100644 --- a/src/meta-client/src/client.rs +++ b/src/meta-client/src/client.rs @@ -189,6 +189,9 @@ impl MetaClientBuilder { let mgr = client.channel_manager.clone(); if self.enable_heartbeat { + if self.heartbeat_channel_manager.is_some() { + info!("Enable heartbeat channel using the heartbeat channel manager."); + } let mgr = self.heartbeat_channel_manager.unwrap_or(mgr.clone()); client.heartbeat = Some(HeartbeatClient::new( self.id, diff --git a/src/meta-client/src/lib.rs b/src/meta-client/src/lib.rs index 5b56b8e181..715154a8e5 100644 --- a/src/meta-client/src/lib.rs +++ b/src/meta-client/src/lib.rs @@ -18,6 +18,10 @@ use std::time::Duration; use client::RegionFollowerClientRef; use common_base::Plugins; use common_grpc::channel_manager::{ChannelConfig, ChannelManager}; +use common_meta::distributed_time_constants::{ + HEARTBEAT_CHANNEL_KEEP_ALIVE_INTERVAL_SECS, HEARTBEAT_CHANNEL_KEEP_ALIVE_TIMEOUT_SECS, + HEARTBEAT_TIMEOUT, +}; use common_telemetry::{debug, info}; use serde::{Deserialize, Serialize}; @@ -34,8 +38,6 @@ pub struct MetaClientOptions { #[serde(with = "humantime_serde")] pub timeout: Duration, #[serde(with = "humantime_serde")] - pub heartbeat_timeout: Duration, - #[serde(with = "humantime_serde")] pub ddl_timeout: Duration, #[serde(with = "humantime_serde")] pub connect_timeout: Duration, @@ -52,7 +54,6 @@ impl Default for MetaClientOptions { Self { metasrv_addrs: vec!["127.0.0.1:3002".to_string()], timeout: Duration::from_millis(3_000u64), - heartbeat_timeout: Duration::from_millis(500u64), ddl_timeout: Duration::from_millis(10_000u64), connect_timeout: Duration::from_millis(1_000u64), tcp_nodelay: true, @@ -97,7 +98,11 @@ pub async fn create_meta_client( .timeout(meta_client_options.timeout) .connect_timeout(meta_client_options.connect_timeout) .tcp_nodelay(meta_client_options.tcp_nodelay); - let heartbeat_config = base_config.clone(); + let heartbeat_config = base_config + .clone() + .timeout(HEARTBEAT_TIMEOUT) + .http2_keep_alive_interval(HEARTBEAT_CHANNEL_KEEP_ALIVE_INTERVAL_SECS) + .http2_keep_alive_timeout(HEARTBEAT_CHANNEL_KEEP_ALIVE_TIMEOUT_SECS); if let MetaClientType::Frontend = client_type { let ddl_config = base_config.clone().timeout(meta_client_options.ddl_timeout);