Skip to main content

frontend/
heartbeat.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#[cfg(test)]
16mod tests;
17
18use std::sync::Arc;
19
20use api::v1::meta::heartbeat_request::NodeWorkloads;
21use api::v1::meta::{FrontendWorkloads, HeartbeatRequest, NodeInfo, Peer};
22use common_meta::datanode::EnvVars;
23use common_meta::heartbeat::handler::{
24    HeartbeatResponseHandlerContext, HeartbeatResponseHandlerExecutorRef,
25};
26use common_meta::heartbeat::mailbox::{HeartbeatMailbox, MailboxRef, OutgoingMessage};
27use common_meta::heartbeat::utils::outgoing_message_to_mailbox_message;
28use common_stat::ResourceStatRef;
29use common_telemetry::{debug, error, info, warn};
30use meta_client::client::{HeartbeatSender, HeartbeatStream, MetaClient};
31use servers::addrs;
32use snafu::ResultExt;
33use tokio::sync::mpsc;
34use tokio::sync::mpsc::Receiver;
35use tokio::time::{Duration, Instant};
36
37use crate::error;
38use crate::error::Result;
39use crate::frontend::FrontendOptions;
40use crate::metrics::{HEARTBEAT_RECV_COUNT, HEARTBEAT_SENT_COUNT};
41
42/// The frontend heartbeat task which sending `[HeartbeatRequest]` to Metasrv periodically in background.
43#[derive(Clone)]
44pub struct HeartbeatTask {
45    peer_addr: String,
46    meta_client: Arc<MetaClient>,
47    resp_handler_executor: HeartbeatResponseHandlerExecutorRef,
48    start_time_ms: u64,
49    resource_stat: ResourceStatRef,
50    env_vars: EnvVars,
51}
52
53impl HeartbeatTask {
54    pub fn new(
55        peer_addr: String,
56        opts: &FrontendOptions,
57        meta_client: Arc<MetaClient>,
58        resp_handler_executor: HeartbeatResponseHandlerExecutorRef,
59        resource_stat: ResourceStatRef,
60    ) -> Self {
61        HeartbeatTask {
62            peer_addr,
63            meta_client,
64            resp_handler_executor,
65            start_time_ms: common_time::util::current_time_millis() as u64,
66            resource_stat,
67            env_vars: EnvVars::from_config(&opts.heartbeat_env_vars),
68        }
69    }
70
71    pub async fn start(&self) -> Result<()> {
72        let (req_sender, resp_stream, config) = self
73            .meta_client
74            .heartbeat()
75            .await
76            .context(error::CreateMetaHeartbeatStreamSnafu)?;
77
78        info!("Heartbeat started with Metasrv config: {}", config);
79
80        let (outgoing_tx, outgoing_rx) = mpsc::channel(16);
81        let mailbox = Arc::new(HeartbeatMailbox::new(outgoing_tx));
82
83        self.start_handle_resp_stream(resp_stream, mailbox, config.retry_interval);
84
85        self.start_heartbeat_report(req_sender, outgoing_rx, config.interval);
86
87        Ok(())
88    }
89
90    fn start_handle_resp_stream(
91        &self,
92        mut resp_stream: HeartbeatStream,
93        mailbox: MailboxRef,
94        retry_interval: Duration,
95    ) {
96        let capture_self = self.clone();
97
98        let _handle = common_runtime::spawn_hb(async move {
99            loop {
100                match resp_stream.message().await {
101                    Ok(Some(resp)) => {
102                        debug!("Receiving heartbeat response: {:?}", resp);
103                        if let Some(message) = &resp.mailbox_message {
104                            info!("Received mailbox message: {message:?}");
105                        }
106                        let ctx = HeartbeatResponseHandlerContext::new(mailbox.clone(), resp);
107                        if let Err(e) = capture_self.handle_response(ctx).await {
108                            error!(e; "Error while handling heartbeat response");
109                            HEARTBEAT_RECV_COUNT
110                                .with_label_values(&["processing_error"])
111                                .inc();
112                        } else {
113                            HEARTBEAT_RECV_COUNT.with_label_values(&["success"]).inc();
114                        }
115                    }
116                    Ok(None) => {
117                        warn!("Heartbeat response stream closed");
118                        capture_self.start_with_retry(retry_interval).await;
119                        break;
120                    }
121                    Err(e) => {
122                        HEARTBEAT_RECV_COUNT.with_label_values(&["error"]).inc();
123                        error!(e; "Occur error while reading heartbeat response");
124                        capture_self.start_with_retry(retry_interval).await;
125
126                        break;
127                    }
128                }
129            }
130        });
131    }
132
133    fn new_heartbeat_request(
134        heartbeat_request: &HeartbeatRequest,
135        message: Option<OutgoingMessage>,
136        cpu_usage: i64,
137        memory_usage: i64,
138    ) -> Option<HeartbeatRequest> {
139        let mailbox_message = match message.map(outgoing_message_to_mailbox_message) {
140            Some(Ok(message)) => Some(message),
141            Some(Err(e)) => {
142                error!(e; "Failed to encode mailbox messages");
143                return None;
144            }
145            None => None,
146        };
147
148        let mut heartbeat_request = HeartbeatRequest {
149            mailbox_message,
150            ..heartbeat_request.clone()
151        };
152
153        if let Some(info) = heartbeat_request.info.as_mut() {
154            info.memory_usage_bytes = memory_usage;
155            info.cpu_usage_millicores = cpu_usage;
156        }
157
158        Some(heartbeat_request)
159    }
160
161    #[allow(deprecated)]
162    fn build_node_info(
163        start_time_ms: u64,
164        total_cpu_millicores: i64,
165        total_memory_bytes: i64,
166    ) -> Option<NodeInfo> {
167        let build_info = common_version::build_info();
168
169        Some(NodeInfo {
170            version: build_info.version.to_string(),
171            git_commit: build_info.commit_short.to_string(),
172            start_time_ms,
173            total_cpu_millicores,
174            total_memory_bytes,
175            cpu_usage_millicores: 0,
176            memory_usage_bytes: 0,
177            // TODO(zyy17): Remove these deprecated fields when the deprecated fields are removed from the proto.
178            cpus: total_cpu_millicores as u32,
179            memory_bytes: total_memory_bytes as u64,
180            hostname: hostname::get()
181                .unwrap_or_default()
182                .to_string_lossy()
183                .to_string(),
184        })
185    }
186
187    fn start_heartbeat_report(
188        &self,
189        req_sender: HeartbeatSender,
190        mut outgoing_rx: Receiver<OutgoingMessage>,
191        report_interval: Duration,
192    ) {
193        let start_time_ms = self.start_time_ms;
194        let self_peer = Some(Peer {
195            // The node id will be actually calculated from its address (by hashing the address
196            // string) in the metasrv. So it can be set to 0 here, as a placeholder.
197            id: 0,
198            addr: self.peer_addr.clone(),
199        });
200        let total_cpu_millicores = self.resource_stat.get_total_cpu_millicores();
201        let total_memory_bytes = self.resource_stat.get_total_memory_bytes();
202        let resource_stat = self.resource_stat.clone();
203        let env_vars = self.env_vars.clone();
204        common_runtime::spawn_hb(async move {
205            let sleep = tokio::time::sleep(Duration::from_millis(0));
206            tokio::pin!(sleep);
207
208            let mut extensions = std::collections::HashMap::new();
209            env_vars.into_extensions(&mut extensions);
210
211            let heartbeat_request = HeartbeatRequest {
212                peer: self_peer,
213                info: Self::build_node_info(
214                    start_time_ms,
215                    total_cpu_millicores,
216                    total_memory_bytes,
217                ),
218                node_workloads: Some(NodeWorkloads::Frontend(FrontendWorkloads { types: vec![] })),
219                extensions,
220                ..Default::default()
221            };
222
223            loop {
224                let req = tokio::select! {
225                    message = outgoing_rx.recv() => {
226                        if let Some(message) = message {
227                            Self::new_heartbeat_request(&heartbeat_request, Some(message), 0, 0)
228                        } else {
229                            warn!("Sender has been dropped, exiting the heartbeat loop");
230                            // Receives None that means Sender was dropped, we need to break the current loop
231                            break
232                        }
233                    }
234                    _ = &mut sleep => {
235                       sleep.as_mut().reset(Instant::now() + report_interval);
236                       Self::new_heartbeat_request(&heartbeat_request, None, resource_stat.get_cpu_usage_millicores(), resource_stat.get_memory_usage_bytes())
237                    }
238                };
239
240                if let Some(req) = req {
241                    if let Err(e) = req_sender.send(req.clone()).await {
242                        error!(e; "Failed to send heartbeat to metasrv");
243                        break;
244                    } else {
245                        HEARTBEAT_SENT_COUNT.inc();
246                        debug!("Send a heartbeat request to metasrv, content: {:?}", req);
247                    }
248                }
249            }
250        });
251    }
252
253    async fn handle_response(&self, ctx: HeartbeatResponseHandlerContext) -> Result<()> {
254        self.resp_handler_executor
255            .handle(ctx)
256            .await
257            .context(error::HandleHeartbeatResponseSnafu)
258    }
259
260    async fn start_with_retry(&self, retry_interval: Duration) {
261        loop {
262            tokio::time::sleep(retry_interval).await;
263
264            info!("Try to re-establish the heartbeat connection to metasrv.");
265
266            if self.start().await.is_ok() {
267                break;
268            }
269        }
270    }
271}
272
273pub(crate) fn frontend_peer_addr(opts: &FrontendOptions) -> String {
274    // if internal grpc is configured, use its address as the peer address
275    // otherwise use the public grpc address, because peer address only promises to be reachable
276    // by other components, it doesn't matter whether it's internal or external
277    if let Some(internal) = &opts.internal_grpc {
278        addrs::resolve_addr(&internal.bind_addr, Some(&internal.server_addr))
279    } else {
280        addrs::resolve_addr(&opts.grpc.bind_addr, Some(&opts.grpc.server_addr))
281    }
282}