fix: configure HTTP/2 keep-alive for heartbeat client to detect network failures faster (#7344)

* fix: configure HTTP/2 keep-alive for heartbeat client to detect network failures faster

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions from CR

Signed-off-by: WenyXu <wenymedia@gmail.com>

---------

Signed-off-by: WenyXu <wenymedia@gmail.com>
This commit is contained in:
Weny Xu
2025-12-04 16:07:45 +08:00
committed by GitHub
parent 11ecb7a28a
commit cc99f9d65b
8 changed files with 23 additions and 19 deletions

View File

@@ -294,7 +294,6 @@
| `meta_client` | -- | -- | The metasrv client options. | | `meta_client` | -- | -- | The metasrv client options. |
| `meta_client.metasrv_addrs` | Array | -- | The addresses of the metasrv. | | `meta_client.metasrv_addrs` | Array | -- | The addresses of the metasrv. |
| `meta_client.timeout` | String | `3s` | Operation timeout. | | `meta_client.timeout` | String | `3s` | Operation timeout. |
| `meta_client.heartbeat_timeout` | String | `500ms` | Heartbeat timeout. |
| `meta_client.ddl_timeout` | String | `10s` | DDL timeout. | | `meta_client.ddl_timeout` | String | `10s` | DDL timeout. |
| `meta_client.connect_timeout` | String | `1s` | Connect server timeout. | | `meta_client.connect_timeout` | String | `1s` | Connect server timeout. |
| `meta_client.tcp_nodelay` | Bool | `true` | `TCP_NODELAY` option for accepted connections. | | `meta_client.tcp_nodelay` | Bool | `true` | `TCP_NODELAY` option for accepted connections. |
@@ -457,7 +456,6 @@
| `meta_client` | -- | -- | The metasrv client options. | | `meta_client` | -- | -- | The metasrv client options. |
| `meta_client.metasrv_addrs` | Array | -- | The addresses of the metasrv. | | `meta_client.metasrv_addrs` | Array | -- | The addresses of the metasrv. |
| `meta_client.timeout` | String | `3s` | Operation timeout. | | `meta_client.timeout` | String | `3s` | Operation timeout. |
| `meta_client.heartbeat_timeout` | String | `500ms` | Heartbeat timeout. |
| `meta_client.ddl_timeout` | String | `10s` | DDL timeout. | | `meta_client.ddl_timeout` | String | `10s` | DDL timeout. |
| `meta_client.connect_timeout` | String | `1s` | Connect server timeout. | | `meta_client.connect_timeout` | String | `1s` | Connect server timeout. |
| `meta_client.tcp_nodelay` | Bool | `true` | `TCP_NODELAY` option for accepted connections. | | `meta_client.tcp_nodelay` | Bool | `true` | `TCP_NODELAY` option for accepted connections. |
@@ -629,7 +627,6 @@
| `meta_client` | -- | -- | The metasrv client options. | | `meta_client` | -- | -- | The metasrv client options. |
| `meta_client.metasrv_addrs` | Array | -- | The addresses of the metasrv. | | `meta_client.metasrv_addrs` | Array | -- | The addresses of the metasrv. |
| `meta_client.timeout` | String | `3s` | Operation timeout. | | `meta_client.timeout` | String | `3s` | Operation timeout. |
| `meta_client.heartbeat_timeout` | String | `500ms` | Heartbeat timeout. |
| `meta_client.ddl_timeout` | String | `10s` | DDL timeout. | | `meta_client.ddl_timeout` | String | `10s` | DDL timeout. |
| `meta_client.connect_timeout` | String | `1s` | Connect server timeout. | | `meta_client.connect_timeout` | String | `1s` | Connect server timeout. |
| `meta_client.tcp_nodelay` | Bool | `true` | `TCP_NODELAY` option for accepted connections. | | `meta_client.tcp_nodelay` | Bool | `true` | `TCP_NODELAY` option for accepted connections. |

View File

@@ -99,9 +99,6 @@ metasrv_addrs = ["127.0.0.1:3002"]
## Operation timeout. ## Operation timeout.
timeout = "3s" timeout = "3s"
## Heartbeat timeout.
heartbeat_timeout = "500ms"
## DDL timeout. ## DDL timeout.
ddl_timeout = "10s" ddl_timeout = "10s"

View File

@@ -78,9 +78,6 @@ metasrv_addrs = ["127.0.0.1:3002"]
## Operation timeout. ## Operation timeout.
timeout = "3s" timeout = "3s"
## Heartbeat timeout.
heartbeat_timeout = "500ms"
## DDL timeout. ## DDL timeout.
ddl_timeout = "10s" ddl_timeout = "10s"

View File

@@ -226,9 +226,6 @@ metasrv_addrs = ["127.0.0.1:3002"]
## Operation timeout. ## Operation timeout.
timeout = "3s" timeout = "3s"
## Heartbeat timeout.
heartbeat_timeout = "500ms"
## DDL timeout. ## DDL timeout.
ddl_timeout = "10s" ddl_timeout = "10s"

View File

@@ -52,7 +52,6 @@ fn test_load_datanode_example_config() {
meta_client: Some(MetaClientOptions { meta_client: Some(MetaClientOptions {
metasrv_addrs: vec!["127.0.0.1:3002".to_string()], metasrv_addrs: vec!["127.0.0.1:3002".to_string()],
timeout: Duration::from_secs(3), timeout: Duration::from_secs(3),
heartbeat_timeout: Duration::from_millis(500),
ddl_timeout: Duration::from_secs(10), ddl_timeout: Duration::from_secs(10),
connect_timeout: Duration::from_secs(1), connect_timeout: Duration::from_secs(1),
tcp_nodelay: true, tcp_nodelay: true,
@@ -118,7 +117,6 @@ fn test_load_frontend_example_config() {
meta_client: Some(MetaClientOptions { meta_client: Some(MetaClientOptions {
metasrv_addrs: vec!["127.0.0.1:3002".to_string()], metasrv_addrs: vec!["127.0.0.1:3002".to_string()],
timeout: Duration::from_secs(3), timeout: Duration::from_secs(3),
heartbeat_timeout: Duration::from_millis(500),
ddl_timeout: Duration::from_secs(10), ddl_timeout: Duration::from_secs(10),
connect_timeout: Duration::from_secs(1), connect_timeout: Duration::from_secs(1),
tcp_nodelay: true, tcp_nodelay: true,
@@ -241,7 +239,6 @@ fn test_load_flownode_example_config() {
meta_client: Some(MetaClientOptions { meta_client: Some(MetaClientOptions {
metasrv_addrs: vec!["127.0.0.1:3002".to_string()], metasrv_addrs: vec!["127.0.0.1:3002".to_string()],
timeout: Duration::from_secs(3), timeout: Duration::from_secs(3),
heartbeat_timeout: Duration::from_millis(500),
ddl_timeout: Duration::from_secs(10), ddl_timeout: Duration::from_secs(10),
connect_timeout: Duration::from_secs(1), connect_timeout: Duration::from_secs(1),
tcp_nodelay: true, tcp_nodelay: true,

View File

@@ -41,6 +41,17 @@ pub const POSTGRES_KEEP_ALIVE_SECS: u64 = 30;
/// In a lease, there are two opportunities for renewal. /// In a lease, there are two opportunities for renewal.
pub const META_KEEP_ALIVE_INTERVAL_SECS: u64 = META_LEASE_SECS / 2; pub const META_KEEP_ALIVE_INTERVAL_SECS: u64 = META_LEASE_SECS / 2;
/// The timeout of the heartbeat request.
///
/// Defined as [`META_KEEP_ALIVE_INTERVAL_SECS`] plus one second.
pub const HEARTBEAT_TIMEOUT: Duration = Duration::from_secs(META_KEEP_ALIVE_INTERVAL_SECS + 1);
/// The keep-alive interval of the heartbeat channel.
///
/// Defined as [`META_KEEP_ALIVE_INTERVAL_SECS`] plus one second.
///
/// NOTE(review): despite the `_SECS` suffix, this constant is a [`Duration`],
/// not a raw number of seconds; pass it directly wherever a `Duration` is expected.
pub const HEARTBEAT_CHANNEL_KEEP_ALIVE_INTERVAL_SECS: Duration =
Duration::from_secs(META_KEEP_ALIVE_INTERVAL_SECS + 1);
/// The keep-alive timeout of the heartbeat channel.
///
/// Defined as [`META_KEEP_ALIVE_INTERVAL_SECS`] plus one second.
///
/// NOTE(review): despite the `_SECS` suffix, this constant is a [`Duration`],
/// not a raw number of seconds; pass it directly wherever a `Duration` is expected.
pub const HEARTBEAT_CHANNEL_KEEP_ALIVE_TIMEOUT_SECS: Duration =
Duration::from_secs(META_KEEP_ALIVE_INTERVAL_SECS + 1);
/// The default mailbox round-trip timeout. /// The default mailbox round-trip timeout.
pub const MAILBOX_RTT_SECS: u64 = 1; pub const MAILBOX_RTT_SECS: u64 = 1;

View File

@@ -189,6 +189,9 @@ impl MetaClientBuilder {
let mgr = client.channel_manager.clone(); let mgr = client.channel_manager.clone();
if self.enable_heartbeat { if self.enable_heartbeat {
if self.heartbeat_channel_manager.is_some() {
info!("Enable heartbeat channel using the heartbeat channel manager.");
}
let mgr = self.heartbeat_channel_manager.unwrap_or(mgr.clone()); let mgr = self.heartbeat_channel_manager.unwrap_or(mgr.clone());
client.heartbeat = Some(HeartbeatClient::new( client.heartbeat = Some(HeartbeatClient::new(
self.id, self.id,

View File

@@ -18,6 +18,10 @@ use std::time::Duration;
use client::RegionFollowerClientRef; use client::RegionFollowerClientRef;
use common_base::Plugins; use common_base::Plugins;
use common_grpc::channel_manager::{ChannelConfig, ChannelManager}; use common_grpc::channel_manager::{ChannelConfig, ChannelManager};
use common_meta::distributed_time_constants::{
HEARTBEAT_CHANNEL_KEEP_ALIVE_INTERVAL_SECS, HEARTBEAT_CHANNEL_KEEP_ALIVE_TIMEOUT_SECS,
HEARTBEAT_TIMEOUT,
};
use common_telemetry::{debug, info}; use common_telemetry::{debug, info};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
@@ -34,8 +38,6 @@ pub struct MetaClientOptions {
#[serde(with = "humantime_serde")] #[serde(with = "humantime_serde")]
pub timeout: Duration, pub timeout: Duration,
#[serde(with = "humantime_serde")] #[serde(with = "humantime_serde")]
pub heartbeat_timeout: Duration,
#[serde(with = "humantime_serde")]
pub ddl_timeout: Duration, pub ddl_timeout: Duration,
#[serde(with = "humantime_serde")] #[serde(with = "humantime_serde")]
pub connect_timeout: Duration, pub connect_timeout: Duration,
@@ -52,7 +54,6 @@ impl Default for MetaClientOptions {
Self { Self {
metasrv_addrs: vec!["127.0.0.1:3002".to_string()], metasrv_addrs: vec!["127.0.0.1:3002".to_string()],
timeout: Duration::from_millis(3_000u64), timeout: Duration::from_millis(3_000u64),
heartbeat_timeout: Duration::from_millis(500u64),
ddl_timeout: Duration::from_millis(10_000u64), ddl_timeout: Duration::from_millis(10_000u64),
connect_timeout: Duration::from_millis(1_000u64), connect_timeout: Duration::from_millis(1_000u64),
tcp_nodelay: true, tcp_nodelay: true,
@@ -97,7 +98,11 @@ pub async fn create_meta_client(
.timeout(meta_client_options.timeout) .timeout(meta_client_options.timeout)
.connect_timeout(meta_client_options.connect_timeout) .connect_timeout(meta_client_options.connect_timeout)
.tcp_nodelay(meta_client_options.tcp_nodelay); .tcp_nodelay(meta_client_options.tcp_nodelay);
let heartbeat_config = base_config.clone(); let heartbeat_config = base_config
.clone()
.timeout(HEARTBEAT_TIMEOUT)
.http2_keep_alive_interval(HEARTBEAT_CHANNEL_KEEP_ALIVE_INTERVAL_SECS)
.http2_keep_alive_timeout(HEARTBEAT_CHANNEL_KEEP_ALIVE_TIMEOUT_SECS);
if let MetaClientType::Frontend = client_type { if let MetaClientType::Frontend = client_type {
let ddl_config = base_config.clone().timeout(meta_client_options.ddl_timeout); let ddl_config = base_config.clone().timeout(meta_client_options.ddl_timeout);