1#[cfg(test)]
16mod tests;
17
18use std::sync::Arc;
19
20use api::v1::meta::heartbeat_request::NodeWorkloads;
21use api::v1::meta::{FrontendWorkloads, HeartbeatRequest, NodeInfo, Peer};
22use common_meta::datanode::EnvVars;
23use common_meta::heartbeat::handler::{
24 HeartbeatResponseHandlerContext, HeartbeatResponseHandlerExecutorRef,
25};
26use common_meta::heartbeat::mailbox::{HeartbeatMailbox, MailboxRef, OutgoingMessage};
27use common_meta::heartbeat::utils::outgoing_message_to_mailbox_message;
28use common_stat::ResourceStatRef;
29use common_telemetry::{debug, error, info, warn};
30use meta_client::client::{HeartbeatSender, HeartbeatStream, MetaClient};
31use servers::addrs;
32use snafu::ResultExt;
33use tokio::sync::mpsc;
34use tokio::sync::mpsc::Receiver;
35use tokio::time::{Duration, Instant};
36
37use crate::error;
38use crate::error::Result;
39use crate::frontend::FrontendOptions;
40use crate::metrics::{HEARTBEAT_RECV_COUNT, HEARTBEAT_SENT_COUNT};
41
/// Background heartbeat task of a frontend node.
///
/// It opens a heartbeat stream through the meta client, periodically sends
/// `HeartbeatRequest`s and dispatches the received responses to the configured
/// response handler executor. The task is `Clone` so that spawned background
/// tasks can own their own copy.
#[derive(Clone)]
pub struct HeartbeatTask {
    /// Address reported as the `addr` of this node's `Peer` in every request.
    peer_addr: String,
    /// Client used to open the heartbeat stream.
    meta_client: Arc<MetaClient>,
    /// Executor that handles every heartbeat response.
    resp_handler_executor: HeartbeatResponseHandlerExecutorRef,
    /// Time (in milliseconds) captured when the task was constructed; reported in `NodeInfo`.
    start_time_ms: u64,
    /// Source of the total and current CPU/memory figures put into the requests.
    resource_stat: ResourceStatRef,
    /// Environment variables (built from `FrontendOptions::heartbeat_env_vars`)
    /// that are converted into the request `extensions`.
    env_vars: EnvVars,
}
52
53impl HeartbeatTask {
54 pub fn new(
55 peer_addr: String,
56 opts: &FrontendOptions,
57 meta_client: Arc<MetaClient>,
58 resp_handler_executor: HeartbeatResponseHandlerExecutorRef,
59 resource_stat: ResourceStatRef,
60 ) -> Self {
61 HeartbeatTask {
62 peer_addr,
63 meta_client,
64 resp_handler_executor,
65 start_time_ms: common_time::util::current_time_millis() as u64,
66 resource_stat,
67 env_vars: EnvVars::from_config(&opts.heartbeat_env_vars),
68 }
69 }
70
71 pub async fn start(&self) -> Result<()> {
72 let (req_sender, resp_stream, config) = self
73 .meta_client
74 .heartbeat()
75 .await
76 .context(error::CreateMetaHeartbeatStreamSnafu)?;
77
78 info!("Heartbeat started with Metasrv config: {}", config);
79
80 let (outgoing_tx, outgoing_rx) = mpsc::channel(16);
81 let mailbox = Arc::new(HeartbeatMailbox::new(outgoing_tx));
82
83 self.start_handle_resp_stream(resp_stream, mailbox, config.retry_interval);
84
85 self.start_heartbeat_report(req_sender, outgoing_rx, config.interval);
86
87 Ok(())
88 }
89
90 fn start_handle_resp_stream(
91 &self,
92 mut resp_stream: HeartbeatStream,
93 mailbox: MailboxRef,
94 retry_interval: Duration,
95 ) {
96 let capture_self = self.clone();
97
98 let _handle = common_runtime::spawn_hb(async move {
99 loop {
100 match resp_stream.message().await {
101 Ok(Some(resp)) => {
102 debug!("Receiving heartbeat response: {:?}", resp);
103 if let Some(message) = &resp.mailbox_message {
104 info!("Received mailbox message: {message:?}");
105 }
106 let ctx = HeartbeatResponseHandlerContext::new(mailbox.clone(), resp);
107 if let Err(e) = capture_self.handle_response(ctx).await {
108 error!(e; "Error while handling heartbeat response");
109 HEARTBEAT_RECV_COUNT
110 .with_label_values(&["processing_error"])
111 .inc();
112 } else {
113 HEARTBEAT_RECV_COUNT.with_label_values(&["success"]).inc();
114 }
115 }
116 Ok(None) => {
117 warn!("Heartbeat response stream closed");
118 capture_self.start_with_retry(retry_interval).await;
119 break;
120 }
121 Err(e) => {
122 HEARTBEAT_RECV_COUNT.with_label_values(&["error"]).inc();
123 error!(e; "Occur error while reading heartbeat response");
124 capture_self.start_with_retry(retry_interval).await;
125
126 break;
127 }
128 }
129 }
130 });
131 }
132
133 fn new_heartbeat_request(
134 heartbeat_request: &HeartbeatRequest,
135 message: Option<OutgoingMessage>,
136 cpu_usage: i64,
137 memory_usage: i64,
138 ) -> Option<HeartbeatRequest> {
139 let mailbox_message = match message.map(outgoing_message_to_mailbox_message) {
140 Some(Ok(message)) => Some(message),
141 Some(Err(e)) => {
142 error!(e; "Failed to encode mailbox messages");
143 return None;
144 }
145 None => None,
146 };
147
148 let mut heartbeat_request = HeartbeatRequest {
149 mailbox_message,
150 ..heartbeat_request.clone()
151 };
152
153 if let Some(info) = heartbeat_request.info.as_mut() {
154 info.memory_usage_bytes = memory_usage;
155 info.cpu_usage_millicores = cpu_usage;
156 }
157
158 Some(heartbeat_request)
159 }
160
161 #[allow(deprecated)]
162 fn build_node_info(
163 start_time_ms: u64,
164 total_cpu_millicores: i64,
165 total_memory_bytes: i64,
166 ) -> Option<NodeInfo> {
167 let build_info = common_version::build_info();
168
169 Some(NodeInfo {
170 version: build_info.version.to_string(),
171 git_commit: build_info.commit_short.to_string(),
172 start_time_ms,
173 total_cpu_millicores,
174 total_memory_bytes,
175 cpu_usage_millicores: 0,
176 memory_usage_bytes: 0,
177 cpus: total_cpu_millicores as u32,
179 memory_bytes: total_memory_bytes as u64,
180 hostname: hostname::get()
181 .unwrap_or_default()
182 .to_string_lossy()
183 .to_string(),
184 })
185 }
186
187 fn start_heartbeat_report(
188 &self,
189 req_sender: HeartbeatSender,
190 mut outgoing_rx: Receiver<OutgoingMessage>,
191 report_interval: Duration,
192 ) {
193 let start_time_ms = self.start_time_ms;
194 let self_peer = Some(Peer {
195 id: 0,
198 addr: self.peer_addr.clone(),
199 });
200 let total_cpu_millicores = self.resource_stat.get_total_cpu_millicores();
201 let total_memory_bytes = self.resource_stat.get_total_memory_bytes();
202 let resource_stat = self.resource_stat.clone();
203 let env_vars = self.env_vars.clone();
204 common_runtime::spawn_hb(async move {
205 let sleep = tokio::time::sleep(Duration::from_millis(0));
206 tokio::pin!(sleep);
207
208 let mut extensions = std::collections::HashMap::new();
209 env_vars.into_extensions(&mut extensions);
210
211 let heartbeat_request = HeartbeatRequest {
212 peer: self_peer,
213 info: Self::build_node_info(
214 start_time_ms,
215 total_cpu_millicores,
216 total_memory_bytes,
217 ),
218 node_workloads: Some(NodeWorkloads::Frontend(FrontendWorkloads { types: vec![] })),
219 extensions,
220 ..Default::default()
221 };
222
223 loop {
224 let req = tokio::select! {
225 message = outgoing_rx.recv() => {
226 if let Some(message) = message {
227 Self::new_heartbeat_request(&heartbeat_request, Some(message), 0, 0)
228 } else {
229 warn!("Sender has been dropped, exiting the heartbeat loop");
230 break
232 }
233 }
234 _ = &mut sleep => {
235 sleep.as_mut().reset(Instant::now() + report_interval);
236 Self::new_heartbeat_request(&heartbeat_request, None, resource_stat.get_cpu_usage_millicores(), resource_stat.get_memory_usage_bytes())
237 }
238 };
239
240 if let Some(req) = req {
241 if let Err(e) = req_sender.send(req.clone()).await {
242 error!(e; "Failed to send heartbeat to metasrv");
243 break;
244 } else {
245 HEARTBEAT_SENT_COUNT.inc();
246 debug!("Send a heartbeat request to metasrv, content: {:?}", req);
247 }
248 }
249 }
250 });
251 }
252
253 async fn handle_response(&self, ctx: HeartbeatResponseHandlerContext) -> Result<()> {
254 self.resp_handler_executor
255 .handle(ctx)
256 .await
257 .context(error::HandleHeartbeatResponseSnafu)
258 }
259
260 async fn start_with_retry(&self, retry_interval: Duration) {
261 loop {
262 tokio::time::sleep(retry_interval).await;
263
264 info!("Try to re-establish the heartbeat connection to metasrv.");
265
266 if self.start().await.is_ok() {
267 break;
268 }
269 }
270 }
271}
272
273pub(crate) fn frontend_peer_addr(opts: &FrontendOptions) -> String {
274 if let Some(internal) = &opts.internal_grpc {
278 addrs::resolve_addr(&internal.bind_addr, Some(&internal.server_addr))
279 } else {
280 addrs::resolve_addr(&opts.grpc.bind_addr, Some(&opts.grpc.server_addr))
281 }
282}