1#[cfg(test)]
16mod tests;
17
18use std::sync::Arc;
19
20use api::v1::meta::{HeartbeatRequest, NodeInfo, Peer};
21use common_meta::datanode::EnvVars;
22use common_meta::heartbeat::handler::{
23 HeartbeatResponseHandlerContext, HeartbeatResponseHandlerExecutorRef,
24};
25use common_meta::heartbeat::mailbox::{HeartbeatMailbox, MailboxRef, OutgoingMessage};
26use common_meta::heartbeat::utils::outgoing_message_to_mailbox_message;
27use common_stat::ResourceStatRef;
28use common_telemetry::{debug, error, info, warn};
29use meta_client::client::{HeartbeatSender, HeartbeatStream, MetaClient};
30use servers::addrs;
31use snafu::ResultExt;
32use tokio::sync::mpsc;
33use tokio::sync::mpsc::Receiver;
34use tokio::time::{Duration, Instant};
35
36use crate::error;
37use crate::error::Result;
38use crate::frontend::FrontendOptions;
39use crate::metrics::{HEARTBEAT_RECV_COUNT, HEARTBEAT_SENT_COUNT};
40
/// Background task that maintains a frontend's heartbeat exchange with Metasrv.
///
/// The task is `Clone` because the response-handling loop takes its own copy
/// of it when it is spawned.
#[derive(Clone)]
pub struct HeartbeatTask {
    /// Address placed in the `Peer` of every heartbeat request. Resolved in
    /// `new` from the internal gRPC options when they are set, otherwise from
    /// the public gRPC options.
    peer_addr: String,
    /// Client used to open the heartbeat stream.
    meta_client: Arc<MetaClient>,
    /// Executor every received heartbeat response is handed to.
    resp_handler_executor: HeartbeatResponseHandlerExecutorRef,
    /// Time sampled in `new`, in milliseconds; reported in the node info.
    start_time_ms: u64,
    /// Source of the total and current CPU / memory figures sent in heartbeats.
    resource_stat: ResourceStatRef,
    /// Built from `FrontendOptions::heartbeat_env_vars`; turned into the
    /// `extensions` map of the heartbeat request.
    env_vars: EnvVars,
}
51
52impl HeartbeatTask {
53 pub fn new(
54 opts: &FrontendOptions,
55 meta_client: Arc<MetaClient>,
56 resp_handler_executor: HeartbeatResponseHandlerExecutorRef,
57 resource_stat: ResourceStatRef,
58 ) -> Self {
59 HeartbeatTask {
60 peer_addr: if let Some(internal) = &opts.internal_grpc {
64 addrs::resolve_addr(&internal.bind_addr, Some(&internal.server_addr))
65 } else {
66 addrs::resolve_addr(&opts.grpc.bind_addr, Some(&opts.grpc.server_addr))
67 },
68 meta_client,
69 resp_handler_executor,
70 start_time_ms: common_time::util::current_time_millis() as u64,
71 resource_stat,
72 env_vars: EnvVars::from_config(&opts.heartbeat_env_vars),
73 }
74 }
75
76 pub async fn start(&self) -> Result<()> {
77 let (req_sender, resp_stream, config) = self
78 .meta_client
79 .heartbeat()
80 .await
81 .context(error::CreateMetaHeartbeatStreamSnafu)?;
82
83 info!("Heartbeat started with Metasrv config: {}", config);
84
85 let (outgoing_tx, outgoing_rx) = mpsc::channel(16);
86 let mailbox = Arc::new(HeartbeatMailbox::new(outgoing_tx));
87
88 self.start_handle_resp_stream(resp_stream, mailbox, config.retry_interval);
89
90 self.start_heartbeat_report(req_sender, outgoing_rx, config.interval);
91
92 Ok(())
93 }
94
95 fn start_handle_resp_stream(
96 &self,
97 mut resp_stream: HeartbeatStream,
98 mailbox: MailboxRef,
99 retry_interval: Duration,
100 ) {
101 let capture_self = self.clone();
102
103 let _handle = common_runtime::spawn_hb(async move {
104 loop {
105 match resp_stream.message().await {
106 Ok(Some(resp)) => {
107 debug!("Receiving heartbeat response: {:?}", resp);
108 if let Some(message) = &resp.mailbox_message {
109 info!("Received mailbox message: {message:?}");
110 }
111 let ctx = HeartbeatResponseHandlerContext::new(mailbox.clone(), resp);
112 if let Err(e) = capture_self.handle_response(ctx).await {
113 error!(e; "Error while handling heartbeat response");
114 HEARTBEAT_RECV_COUNT
115 .with_label_values(&["processing_error"])
116 .inc();
117 } else {
118 HEARTBEAT_RECV_COUNT.with_label_values(&["success"]).inc();
119 }
120 }
121 Ok(None) => {
122 warn!("Heartbeat response stream closed");
123 capture_self.start_with_retry(retry_interval).await;
124 break;
125 }
126 Err(e) => {
127 HEARTBEAT_RECV_COUNT.with_label_values(&["error"]).inc();
128 error!(e; "Occur error while reading heartbeat response");
129 capture_self.start_with_retry(retry_interval).await;
130
131 break;
132 }
133 }
134 }
135 });
136 }
137
138 fn new_heartbeat_request(
139 heartbeat_request: &HeartbeatRequest,
140 message: Option<OutgoingMessage>,
141 cpu_usage: i64,
142 memory_usage: i64,
143 ) -> Option<HeartbeatRequest> {
144 let mailbox_message = match message.map(outgoing_message_to_mailbox_message) {
145 Some(Ok(message)) => Some(message),
146 Some(Err(e)) => {
147 error!(e; "Failed to encode mailbox messages");
148 return None;
149 }
150 None => None,
151 };
152
153 let mut heartbeat_request = HeartbeatRequest {
154 mailbox_message,
155 ..heartbeat_request.clone()
156 };
157
158 if let Some(info) = heartbeat_request.info.as_mut() {
159 info.memory_usage_bytes = memory_usage;
160 info.cpu_usage_millicores = cpu_usage;
161 }
162
163 Some(heartbeat_request)
164 }
165
166 #[allow(deprecated)]
167 fn build_node_info(
168 start_time_ms: u64,
169 total_cpu_millicores: i64,
170 total_memory_bytes: i64,
171 ) -> Option<NodeInfo> {
172 let build_info = common_version::build_info();
173
174 Some(NodeInfo {
175 version: build_info.version.to_string(),
176 git_commit: build_info.commit_short.to_string(),
177 start_time_ms,
178 total_cpu_millicores,
179 total_memory_bytes,
180 cpu_usage_millicores: 0,
181 memory_usage_bytes: 0,
182 cpus: total_cpu_millicores as u32,
184 memory_bytes: total_memory_bytes as u64,
185 hostname: hostname::get()
186 .unwrap_or_default()
187 .to_string_lossy()
188 .to_string(),
189 })
190 }
191
192 fn start_heartbeat_report(
193 &self,
194 req_sender: HeartbeatSender,
195 mut outgoing_rx: Receiver<OutgoingMessage>,
196 report_interval: Duration,
197 ) {
198 let start_time_ms = self.start_time_ms;
199 let self_peer = Some(Peer {
200 id: 0,
203 addr: self.peer_addr.clone(),
204 });
205 let total_cpu_millicores = self.resource_stat.get_total_cpu_millicores();
206 let total_memory_bytes = self.resource_stat.get_total_memory_bytes();
207 let resource_stat = self.resource_stat.clone();
208 let env_vars = self.env_vars.clone();
209 common_runtime::spawn_hb(async move {
210 let sleep = tokio::time::sleep(Duration::from_millis(0));
211 tokio::pin!(sleep);
212
213 let mut extensions = std::collections::HashMap::new();
214 env_vars.into_extensions(&mut extensions);
215
216 let heartbeat_request = HeartbeatRequest {
217 peer: self_peer,
218 info: Self::build_node_info(
219 start_time_ms,
220 total_cpu_millicores,
221 total_memory_bytes,
222 ),
223 extensions,
224 ..Default::default()
225 };
226
227 loop {
228 let req = tokio::select! {
229 message = outgoing_rx.recv() => {
230 if let Some(message) = message {
231 Self::new_heartbeat_request(&heartbeat_request, Some(message), 0, 0)
232 } else {
233 warn!("Sender has been dropped, exiting the heartbeat loop");
234 break
236 }
237 }
238 _ = &mut sleep => {
239 sleep.as_mut().reset(Instant::now() + report_interval);
240 Self::new_heartbeat_request(&heartbeat_request, None, resource_stat.get_cpu_usage_millicores(), resource_stat.get_memory_usage_bytes())
241 }
242 };
243
244 if let Some(req) = req {
245 if let Err(e) = req_sender.send(req.clone()).await {
246 error!(e; "Failed to send heartbeat to metasrv");
247 break;
248 } else {
249 HEARTBEAT_SENT_COUNT.inc();
250 debug!("Send a heartbeat request to metasrv, content: {:?}", req);
251 }
252 }
253 }
254 });
255 }
256
257 async fn handle_response(&self, ctx: HeartbeatResponseHandlerContext) -> Result<()> {
258 self.resp_handler_executor
259 .handle(ctx)
260 .await
261 .context(error::HandleHeartbeatResponseSnafu)
262 }
263
264 async fn start_with_retry(&self, retry_interval: Duration) {
265 loop {
266 tokio::time::sleep(retry_interval).await;
267
268 info!("Try to re-establish the heartbeat connection to metasrv.");
269
270 if self.start().await.is_ok() {
271 break;
272 }
273 }
274 }
275}