Skip to main content

frontend/
heartbeat.rs

// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14
15#[cfg(test)]
16mod tests;
17
18use std::sync::Arc;
19
20use api::v1::meta::{HeartbeatRequest, NodeInfo, Peer};
21use common_meta::datanode::EnvVars;
22use common_meta::heartbeat::handler::{
23    HeartbeatResponseHandlerContext, HeartbeatResponseHandlerExecutorRef,
24};
25use common_meta::heartbeat::mailbox::{HeartbeatMailbox, MailboxRef, OutgoingMessage};
26use common_meta::heartbeat::utils::outgoing_message_to_mailbox_message;
27use common_stat::ResourceStatRef;
28use common_telemetry::{debug, error, info, warn};
29use meta_client::client::{HeartbeatSender, HeartbeatStream, MetaClient};
30use servers::addrs;
31use snafu::ResultExt;
32use tokio::sync::mpsc;
33use tokio::sync::mpsc::Receiver;
34use tokio::time::{Duration, Instant};
35
36use crate::error;
37use crate::error::Result;
38use crate::frontend::FrontendOptions;
39use crate::metrics::{HEARTBEAT_RECV_COUNT, HEARTBEAT_SENT_COUNT};
40
41/// The frontend heartbeat task which sending `[HeartbeatRequest]` to Metasrv periodically in background.
42#[derive(Clone)]
43pub struct HeartbeatTask {
44    peer_addr: String,
45    meta_client: Arc<MetaClient>,
46    resp_handler_executor: HeartbeatResponseHandlerExecutorRef,
47    start_time_ms: u64,
48    resource_stat: ResourceStatRef,
49    env_vars: EnvVars,
50}
51
52impl HeartbeatTask {
53    pub fn new(
54        opts: &FrontendOptions,
55        meta_client: Arc<MetaClient>,
56        resp_handler_executor: HeartbeatResponseHandlerExecutorRef,
57        resource_stat: ResourceStatRef,
58    ) -> Self {
59        HeartbeatTask {
60            // if internal grpc is configured, use its address as the peer address
61            // otherwise use the public grpc address, because peer address only promises to be reachable
62            // by other components, it doesn't matter whether it's internal or external
63            peer_addr: if let Some(internal) = &opts.internal_grpc {
64                addrs::resolve_addr(&internal.bind_addr, Some(&internal.server_addr))
65            } else {
66                addrs::resolve_addr(&opts.grpc.bind_addr, Some(&opts.grpc.server_addr))
67            },
68            meta_client,
69            resp_handler_executor,
70            start_time_ms: common_time::util::current_time_millis() as u64,
71            resource_stat,
72            env_vars: EnvVars::from_config(&opts.heartbeat_env_vars),
73        }
74    }
75
76    pub async fn start(&self) -> Result<()> {
77        let (req_sender, resp_stream, config) = self
78            .meta_client
79            .heartbeat()
80            .await
81            .context(error::CreateMetaHeartbeatStreamSnafu)?;
82
83        info!("Heartbeat started with Metasrv config: {}", config);
84
85        let (outgoing_tx, outgoing_rx) = mpsc::channel(16);
86        let mailbox = Arc::new(HeartbeatMailbox::new(outgoing_tx));
87
88        self.start_handle_resp_stream(resp_stream, mailbox, config.retry_interval);
89
90        self.start_heartbeat_report(req_sender, outgoing_rx, config.interval);
91
92        Ok(())
93    }
94
95    fn start_handle_resp_stream(
96        &self,
97        mut resp_stream: HeartbeatStream,
98        mailbox: MailboxRef,
99        retry_interval: Duration,
100    ) {
101        let capture_self = self.clone();
102
103        let _handle = common_runtime::spawn_hb(async move {
104            loop {
105                match resp_stream.message().await {
106                    Ok(Some(resp)) => {
107                        debug!("Receiving heartbeat response: {:?}", resp);
108                        if let Some(message) = &resp.mailbox_message {
109                            info!("Received mailbox message: {message:?}");
110                        }
111                        let ctx = HeartbeatResponseHandlerContext::new(mailbox.clone(), resp);
112                        if let Err(e) = capture_self.handle_response(ctx).await {
113                            error!(e; "Error while handling heartbeat response");
114                            HEARTBEAT_RECV_COUNT
115                                .with_label_values(&["processing_error"])
116                                .inc();
117                        } else {
118                            HEARTBEAT_RECV_COUNT.with_label_values(&["success"]).inc();
119                        }
120                    }
121                    Ok(None) => {
122                        warn!("Heartbeat response stream closed");
123                        capture_self.start_with_retry(retry_interval).await;
124                        break;
125                    }
126                    Err(e) => {
127                        HEARTBEAT_RECV_COUNT.with_label_values(&["error"]).inc();
128                        error!(e; "Occur error while reading heartbeat response");
129                        capture_self.start_with_retry(retry_interval).await;
130
131                        break;
132                    }
133                }
134            }
135        });
136    }
137
138    fn new_heartbeat_request(
139        heartbeat_request: &HeartbeatRequest,
140        message: Option<OutgoingMessage>,
141        cpu_usage: i64,
142        memory_usage: i64,
143    ) -> Option<HeartbeatRequest> {
144        let mailbox_message = match message.map(outgoing_message_to_mailbox_message) {
145            Some(Ok(message)) => Some(message),
146            Some(Err(e)) => {
147                error!(e; "Failed to encode mailbox messages");
148                return None;
149            }
150            None => None,
151        };
152
153        let mut heartbeat_request = HeartbeatRequest {
154            mailbox_message,
155            ..heartbeat_request.clone()
156        };
157
158        if let Some(info) = heartbeat_request.info.as_mut() {
159            info.memory_usage_bytes = memory_usage;
160            info.cpu_usage_millicores = cpu_usage;
161        }
162
163        Some(heartbeat_request)
164    }
165
166    #[allow(deprecated)]
167    fn build_node_info(
168        start_time_ms: u64,
169        total_cpu_millicores: i64,
170        total_memory_bytes: i64,
171    ) -> Option<NodeInfo> {
172        let build_info = common_version::build_info();
173
174        Some(NodeInfo {
175            version: build_info.version.to_string(),
176            git_commit: build_info.commit_short.to_string(),
177            start_time_ms,
178            total_cpu_millicores,
179            total_memory_bytes,
180            cpu_usage_millicores: 0,
181            memory_usage_bytes: 0,
182            // TODO(zyy17): Remove these deprecated fields when the deprecated fields are removed from the proto.
183            cpus: total_cpu_millicores as u32,
184            memory_bytes: total_memory_bytes as u64,
185            hostname: hostname::get()
186                .unwrap_or_default()
187                .to_string_lossy()
188                .to_string(),
189        })
190    }
191
192    fn start_heartbeat_report(
193        &self,
194        req_sender: HeartbeatSender,
195        mut outgoing_rx: Receiver<OutgoingMessage>,
196        report_interval: Duration,
197    ) {
198        let start_time_ms = self.start_time_ms;
199        let self_peer = Some(Peer {
200            // The node id will be actually calculated from its address (by hashing the address
201            // string) in the metasrv. So it can be set to 0 here, as a placeholder.
202            id: 0,
203            addr: self.peer_addr.clone(),
204        });
205        let total_cpu_millicores = self.resource_stat.get_total_cpu_millicores();
206        let total_memory_bytes = self.resource_stat.get_total_memory_bytes();
207        let resource_stat = self.resource_stat.clone();
208        let env_vars = self.env_vars.clone();
209        common_runtime::spawn_hb(async move {
210            let sleep = tokio::time::sleep(Duration::from_millis(0));
211            tokio::pin!(sleep);
212
213            let mut extensions = std::collections::HashMap::new();
214            env_vars.into_extensions(&mut extensions);
215
216            let heartbeat_request = HeartbeatRequest {
217                peer: self_peer,
218                info: Self::build_node_info(
219                    start_time_ms,
220                    total_cpu_millicores,
221                    total_memory_bytes,
222                ),
223                extensions,
224                ..Default::default()
225            };
226
227            loop {
228                let req = tokio::select! {
229                    message = outgoing_rx.recv() => {
230                        if let Some(message) = message {
231                            Self::new_heartbeat_request(&heartbeat_request, Some(message), 0, 0)
232                        } else {
233                            warn!("Sender has been dropped, exiting the heartbeat loop");
234                            // Receives None that means Sender was dropped, we need to break the current loop
235                            break
236                        }
237                    }
238                    _ = &mut sleep => {
239                       sleep.as_mut().reset(Instant::now() + report_interval);
240                       Self::new_heartbeat_request(&heartbeat_request, None, resource_stat.get_cpu_usage_millicores(), resource_stat.get_memory_usage_bytes())
241                    }
242                };
243
244                if let Some(req) = req {
245                    if let Err(e) = req_sender.send(req.clone()).await {
246                        error!(e; "Failed to send heartbeat to metasrv");
247                        break;
248                    } else {
249                        HEARTBEAT_SENT_COUNT.inc();
250                        debug!("Send a heartbeat request to metasrv, content: {:?}", req);
251                    }
252                }
253            }
254        });
255    }
256
257    async fn handle_response(&self, ctx: HeartbeatResponseHandlerContext) -> Result<()> {
258        self.resp_handler_executor
259            .handle(ctx)
260            .await
261            .context(error::HandleHeartbeatResponseSnafu)
262    }
263
264    async fn start_with_retry(&self, retry_interval: Duration) {
265        loop {
266            tokio::time::sleep(retry_interval).await;
267
268            info!("Try to re-establish the heartbeat connection to metasrv.");
269
270            if self.start().await.is_ok() {
271                break;
272            }
273        }
274    }
275}