mirror of
https://github.com/GreptimeTeam/greptimedb.git
synced 2026-05-14 12:00:40 +00:00
feat: refine failure detector (#7005)
* feat: refine failure detector Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * fix format Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * revert back default value Signed-off-by: Ruihang Xia <waynestxia@gmail.com> * revert change of test Signed-off-by: Ruihang Xia <waynestxia@gmail.com> --------- Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
This commit is contained in:
@@ -378,10 +378,9 @@
|
||||
| `procedure.max_metadata_value_size` | String | `1500KiB` | Auto split large value<br/>GreptimeDB procedure uses etcd as the default metadata storage backend.<br/>The maximum size of any etcd request is 1.5 MiB<br/>1500KiB = 1536KiB (1.5MiB) - 36KiB (reserved size of key)<br/>Comment out `max_metadata_value_size` to disable splitting of large values (no limit). |
|
||||
| `procedure.max_running_procedures` | Integer | `128` | Max running procedures.<br/>The maximum number of procedures that can be running at the same time.<br/>If the number of running procedures exceeds this limit, the procedure will be rejected. |
|
||||
| `failure_detector` | -- | -- | -- |
|
||||
| `failure_detector.threshold` | Float | `8.0` | The threshold value used by the failure detector to determine failure conditions. |
|
||||
| `failure_detector.min_std_deviation` | String | `100ms` | The minimum standard deviation of the heartbeat intervals, used to calculate acceptable variations. |
|
||||
| `failure_detector.acceptable_heartbeat_pause` | String | `10000ms` | The acceptable pause duration between heartbeats, used to determine if a heartbeat interval is acceptable. |
|
||||
| `failure_detector.first_heartbeat_estimate` | String | `1000ms` | The initial estimate of the heartbeat interval used by the failure detector. |
|
||||
| `failure_detector.threshold` | Float | `8.0` | Maximum acceptable φ before the peer is treated as failed.<br/>Lower values react faster but yield more false positives. |
|
||||
| `failure_detector.min_std_deviation` | String | `100ms` | The minimum standard deviation of the heartbeat intervals.<br/>This floor keeps tiny variations from making φ explode, preventing hypersensitivity when heartbeat intervals barely vary. |
|
||||
| `failure_detector.acceptable_heartbeat_pause` | String | `10000ms` | The acceptable pause duration between heartbeats.<br/>An extra grace period added to the learned mean interval before φ rises, absorbing temporary network hiccups or GC pauses. |
|
||||
| `datanode` | -- | -- | Datanode options. |
|
||||
| `datanode.client` | -- | -- | Datanode client options. |
|
||||
| `datanode.client.timeout` | String | `10s` | Operation timeout. |
|
||||
|
||||
@@ -149,20 +149,18 @@ max_metadata_value_size = "1500KiB"
|
||||
max_running_procedures = 128
|
||||
|
||||
# Failure detector options.
|
||||
# GreptimeDB uses the Phi Accrual Failure Detector algorithm to detect datanode failures.
|
||||
[failure_detector]
|
||||
|
||||
## The threshold value used by the failure detector to determine failure conditions.
|
||||
## Maximum acceptable φ before the peer is treated as failed.
|
||||
## Lower values react faster but yield more false positives.
|
||||
threshold = 8.0
|
||||
|
||||
## The minimum standard deviation of the heartbeat intervals, used to calculate acceptable variations.
|
||||
## The minimum standard deviation of the heartbeat intervals.
|
||||
## This floor keeps tiny variations from making φ explode, preventing hypersensitivity when heartbeat intervals barely vary.
|
||||
min_std_deviation = "100ms"
|
||||
|
||||
## The acceptable pause duration between heartbeats, used to determine if a heartbeat interval is acceptable.
|
||||
## The acceptable pause duration between heartbeats.
|
||||
## An extra grace period added to the learned mean interval before φ rises, absorbing temporary network hiccups or GC pauses.
|
||||
acceptable_heartbeat_pause = "10000ms"
|
||||
|
||||
## The initial estimate of the heartbeat interval used by the failure detector.
|
||||
first_heartbeat_estimate = "1000ms"
|
||||
|
||||
## Datanode options.
|
||||
[datanode]
|
||||
|
||||
|
||||
@@ -399,7 +399,6 @@ mod tests {
|
||||
threshold = 8.0
|
||||
min_std_deviation = "100ms"
|
||||
acceptable_heartbeat_pause = "3000ms"
|
||||
first_heartbeat_estimate = "1000ms"
|
||||
"#;
|
||||
write!(file, "{}", toml_str).unwrap();
|
||||
|
||||
@@ -430,13 +429,6 @@ mod tests {
|
||||
.acceptable_heartbeat_pause
|
||||
.as_millis()
|
||||
);
|
||||
assert_eq!(
|
||||
1000,
|
||||
options
|
||||
.failure_detector
|
||||
.first_heartbeat_estimate
|
||||
.as_millis()
|
||||
);
|
||||
assert_eq!(
|
||||
options.procedure.max_metadata_value_size,
|
||||
Some(ReadableSize::kb(1500))
|
||||
|
||||
@@ -18,6 +18,8 @@ use std::time::Duration;
|
||||
use common_meta::distributed_time_constants;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
const FIRST_HEARTBEAT_ESTIMATE_MILLIS: i64 = 1000;
|
||||
|
||||
/// This is our port of Akka's "[PhiAccrualFailureDetector](https://github.com/akka/akka/blob/v2.6.21/akka-remote/src/main/scala/akka/remote/PhiAccrualFailureDetector.scala)"
|
||||
/// under Apache License 2.0.
|
||||
///
|
||||
@@ -56,10 +58,6 @@ pub(crate) struct PhiAccrualFailureDetector {
|
||||
/// arrivals, due to for example network drop.
|
||||
acceptable_heartbeat_pause_millis: u32,
|
||||
|
||||
/// Bootstrap the stats with heartbeats that corresponds to this duration, with a rather high
|
||||
/// standard deviation (since environment is unknown in the beginning).
|
||||
first_heartbeat_estimate_millis: u32,
|
||||
|
||||
heartbeat_history: HeartbeatHistory,
|
||||
last_heartbeat_millis: Option<i64>,
|
||||
}
|
||||
@@ -72,8 +70,6 @@ pub struct PhiAccrualFailureDetectorOptions {
|
||||
pub min_std_deviation: Duration,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub acceptable_heartbeat_pause: Duration,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub first_heartbeat_estimate: Duration,
|
||||
}
|
||||
|
||||
impl Default for PhiAccrualFailureDetectorOptions {
|
||||
@@ -86,7 +82,6 @@ impl Default for PhiAccrualFailureDetectorOptions {
|
||||
acceptable_heartbeat_pause: Duration::from_secs(
|
||||
distributed_time_constants::DATANODE_LEASE_SECS,
|
||||
),
|
||||
first_heartbeat_estimate: Duration::from_millis(1000),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -104,7 +99,6 @@ impl PhiAccrualFailureDetector {
|
||||
min_std_deviation_millis: options.min_std_deviation.as_millis() as f32,
|
||||
acceptable_heartbeat_pause_millis: options.acceptable_heartbeat_pause.as_millis()
|
||||
as u32,
|
||||
first_heartbeat_estimate_millis: options.first_heartbeat_estimate.as_millis() as u32,
|
||||
heartbeat_history: HeartbeatHistory::new(1000),
|
||||
last_heartbeat_millis: None,
|
||||
}
|
||||
@@ -124,11 +118,11 @@ impl PhiAccrualFailureDetector {
|
||||
// guess statistics for first heartbeat,
|
||||
// important so that connections with only one heartbeat become unavailable
|
||||
// bootstrap with 2 entries with rather high standard deviation
|
||||
let std_deviation = self.first_heartbeat_estimate_millis / 4;
|
||||
let std_deviation = FIRST_HEARTBEAT_ESTIMATE_MILLIS / 4;
|
||||
self.heartbeat_history
|
||||
.add((self.first_heartbeat_estimate_millis - std_deviation) as _);
|
||||
.add((FIRST_HEARTBEAT_ESTIMATE_MILLIS - std_deviation) as _);
|
||||
self.heartbeat_history
|
||||
.add((self.first_heartbeat_estimate_millis + std_deviation) as _);
|
||||
.add((FIRST_HEARTBEAT_ESTIMATE_MILLIS + std_deviation) as _);
|
||||
}
|
||||
let _ = self.last_heartbeat_millis.insert(ts_millis);
|
||||
}
|
||||
@@ -367,7 +361,6 @@ mod tests {
|
||||
threshold: 8.0,
|
||||
min_std_deviation_millis: 100.0,
|
||||
acceptable_heartbeat_pause_millis: 0,
|
||||
first_heartbeat_estimate_millis: 1000,
|
||||
heartbeat_history: HeartbeatHistory::new(1000),
|
||||
last_heartbeat_millis: None,
|
||||
};
|
||||
@@ -381,14 +374,13 @@ mod tests {
|
||||
threshold: 8.0,
|
||||
min_std_deviation_millis: 100.0,
|
||||
acceptable_heartbeat_pause_millis: 0,
|
||||
first_heartbeat_estimate_millis: 1000,
|
||||
heartbeat_history: HeartbeatHistory::new(1000),
|
||||
last_heartbeat_millis: None,
|
||||
};
|
||||
fd.heartbeat(0);
|
||||
assert!((fd.phi(1000)).abs() - 0.3 < 0.2);
|
||||
assert!((fd.phi(2000)).abs() - 4.5 < 0.3);
|
||||
assert!((fd.phi(3000)).abs() > 15.0);
|
||||
assert!((fd.phi(FIRST_HEARTBEAT_ESTIMATE_MILLIS)).abs() - 0.3 < 0.2);
|
||||
assert!((fd.phi(FIRST_HEARTBEAT_ESTIMATE_MILLIS * 2)).abs() - 4.5 < 0.3);
|
||||
assert!((fd.phi(FIRST_HEARTBEAT_ESTIMATE_MILLIS * 3)).abs() > 15.0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -397,7 +389,6 @@ mod tests {
|
||||
threshold: 8.0,
|
||||
min_std_deviation_millis: 100.0,
|
||||
acceptable_heartbeat_pause_millis: 0,
|
||||
first_heartbeat_estimate_millis: 1000,
|
||||
heartbeat_history: HeartbeatHistory::new(1000),
|
||||
last_heartbeat_millis: None,
|
||||
};
|
||||
@@ -413,7 +404,6 @@ mod tests {
|
||||
threshold: 8.0,
|
||||
min_std_deviation_millis: 100.0,
|
||||
acceptable_heartbeat_pause_millis: 0,
|
||||
first_heartbeat_estimate_millis: 1000,
|
||||
heartbeat_history: HeartbeatHistory::new(1000),
|
||||
last_heartbeat_millis: None,
|
||||
};
|
||||
@@ -431,7 +421,6 @@ mod tests {
|
||||
threshold: 3.0,
|
||||
min_std_deviation_millis: 100.0,
|
||||
acceptable_heartbeat_pause_millis: 0,
|
||||
first_heartbeat_estimate_millis: 1000,
|
||||
heartbeat_history: HeartbeatHistory::new(1000),
|
||||
last_heartbeat_millis: None,
|
||||
};
|
||||
@@ -449,7 +438,6 @@ mod tests {
|
||||
threshold: 8.0,
|
||||
min_std_deviation_millis: 100.0,
|
||||
acceptable_heartbeat_pause_millis: 3000,
|
||||
first_heartbeat_estimate_millis: 1000,
|
||||
heartbeat_history: HeartbeatHistory::new(1000),
|
||||
last_heartbeat_millis: None,
|
||||
};
|
||||
@@ -488,7 +476,6 @@ mod tests {
|
||||
threshold: 8.0,
|
||||
min_std_deviation_millis: 100.0,
|
||||
acceptable_heartbeat_pause_millis: 3000,
|
||||
first_heartbeat_estimate_millis: 1000,
|
||||
heartbeat_history: HeartbeatHistory::new(1000),
|
||||
last_heartbeat_millis: None,
|
||||
};
|
||||
@@ -507,7 +494,6 @@ mod tests {
|
||||
threshold: 8.0,
|
||||
min_std_deviation_millis: 100.0,
|
||||
acceptable_heartbeat_pause_millis: 3000,
|
||||
first_heartbeat_estimate_millis: 1000,
|
||||
heartbeat_history: HeartbeatHistory::new(1000),
|
||||
last_heartbeat_millis: None,
|
||||
};
|
||||
@@ -528,7 +514,6 @@ mod tests {
|
||||
threshold: 8.0,
|
||||
min_std_deviation_millis: 100.0,
|
||||
acceptable_heartbeat_pause_millis: 0,
|
||||
first_heartbeat_estimate_millis: 1000,
|
||||
heartbeat_history: HeartbeatHistory::new(3),
|
||||
last_heartbeat_millis: None,
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user