diff --git a/.gitignore b/.gitignore index 3ffd804387..11dd2556ca 100644 --- a/.gitignore +++ b/.gitignore @@ -39,3 +39,6 @@ benchmarks/data # dashboard files !/src/servers/dashboard/VERSION /src/servers/dashboard/* + +# Vscode workspace +*.code-workspace diff --git a/Cargo.lock b/Cargo.lock index 0c91587cc0..339c817b34 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1751,10 +1751,15 @@ dependencies = [ name = "common-meta" version = "0.2.0" dependencies = [ + "api", + "chrono", + "common-catalog", "common-error", + "datatypes", "serde", "serde_json", "snafu", + "table", ] [[package]] @@ -2220,6 +2225,16 @@ dependencies = [ "memchr", ] +[[package]] +name = "ctor" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd4056f63fce3b82d852c3da92b08ea59959890813a7f4ce9c0ff85b10cf301b" +dependencies = [ + "quote", + "syn 2.0.15", +] + [[package]] name = "cxx" version = "1.0.94" @@ -2846,6 +2861,15 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "erased-serde" +version = "0.3.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f2b0c2380453a92ea8b6c8e5f64ecaafccddde8ceab55ff7a8ac1029f894569" +dependencies = [ + "serde", +] + [[package]] name = "errno" version = "0.3.1" @@ -3066,6 +3090,7 @@ dependencies = [ "common-function", "common-grpc", "common-grpc-expr", + "common-meta", "common-query", "common-recordbatch", "common-runtime", @@ -3355,6 +3380,17 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "ghost" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e77ac7b51b8e6313251737fcef4b1c01a2ea102bde68415b62c0ee9268fec357" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.15", +] + [[package]] name = "gimli" version = "0.27.2" @@ -3871,9 +3907,11 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "greptime-proto" version = "0.1.0" -source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=6bfb02057c40da0e397c0cb4f6b87bd769669d50#6bfb02057c40da0e397c0cb4f6b87bd769669d50" +source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=f43972af48f8be1cbb1d27640a8aba8b30955028#f43972af48f8be1cbb1d27640a8aba8b30955028" dependencies = [ "prost", + "serde", + "serde_json", "tonic 0.9.2", "tonic-build 0.9.2", ] @@ -4224,6 +4262,16 @@ dependencies = [ "futures-util", ] +[[package]] +name = "inventory" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7741301a6d6a9b28ce77c0fb77a4eb116b6bc8f3bef09923f7743d059c4157d3" +dependencies = [ + "ctor", + "ghost", +] + [[package]] name = "io-close" version = "0.3.7" @@ -4826,6 +4874,7 @@ dependencies = [ "chrono", "common-error", "common-grpc", + "common-meta", "common-telemetry", "datatypes", "etcd-client", @@ -4853,16 +4902,19 @@ dependencies = [ "async-stream", "async-trait", "catalog", + "chrono", "common-base", "common-catalog", "common-error", "common-grpc", "common-meta", "common-procedure", + "common-procedure-test", "common-runtime", "common-telemetry", "common-time", "dashmap", + "datatypes", "derive_builder 0.12.0", "etcd-client", "futures", @@ -4886,6 +4938,7 @@ dependencies = [ "tower", "tracing", "tracing-subscriber", + "typetag", "url", ] @@ -5784,6 +5837,7 @@ version = "0.2.0" dependencies = [ "common-catalog", "common-error", + "common-meta", "common-query", "datafusion", "datafusion-common", @@ -8787,6 +8841,8 @@ dependencies = [ "common-catalog", "common-error", "common-grpc", + "common-meta", 
+ "common-procedure", "common-query", "common-recordbatch", "common-runtime", @@ -9037,9 +9093,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.28.0" +version = "1.28.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3c786bf8134e5a3a166db9b29ab8f48134739014a3eca7bc6bfa95d673b136f" +checksum = "0aa32867d44e6f2ce3385e89dceb990188b8bb0fb25b0cf576647a6f98ac5105" dependencies = [ "autocfg", "bytes", @@ -9521,6 +9577,30 @@ version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" +[[package]] +name = "typetag" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a6898cc6f6a32698cc3e14d5632a14d2b23ed9f7b11e6b8e05ce685990acc22" +dependencies = [ + "erased-serde", + "inventory", + "once_cell", + "serde", + "typetag-impl", +] + +[[package]] +name = "typetag-impl" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c3e1c30cedd24fc597f7d37a721efdbdc2b1acae012c1ef1218f4c7c2c0f3e7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.15", +] + [[package]] name = "typify" version = "0.0.11" diff --git a/src/api/Cargo.toml b/src/api/Cargo.toml index 677c15f754..681faa0843 100644 --- a/src/api/Cargo.toml +++ b/src/api/Cargo.toml @@ -10,7 +10,7 @@ common-base = { path = "../common/base" } common-error = { path = "../common/error" } common-time = { path = "../common/time" } datatypes = { path = "../datatypes" } -greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "6bfb02057c40da0e397c0cb4f6b87bd769669d50" } +greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "f43972af48f8be1cbb1d27640a8aba8b30955028" } prost.workspace = true snafu = { version = "0.7", features = ["backtraces"] } tonic.workspace = true diff --git a/src/cmd/src/error.rs b/src/cmd/src/error.rs index 1a11dddae8..bd5266fc16 100644 --- a/src/cmd/src/error.rs +++ b/src/cmd/src/error.rs @@ -154,6 +154,12 @@ pub enum Error { source: ConfigError, location: Location, }, + + #[snafu(display("Failed to start catalog manager, source: {}", source))] + StartCatalogManager { + #[snafu(backtrace)] + source: catalog::error::Error, + }, } pub type Result = std::result::Result; @@ -185,6 +191,7 @@ impl ErrorExt for Error { source.status_code() } Error::SubstraitEncodeLogicalPlan { source } => source.status_code(), + Error::StartCatalogManager { source } => source.status_code(), } } diff --git a/src/cmd/src/frontend.rs b/src/cmd/src/frontend.rs index 37117895fa..af832ed84c 100644 --- a/src/cmd/src/frontend.rs +++ b/src/cmd/src/frontend.rs @@ -25,7 +25,7 @@ use servers::tls::{TlsMode, TlsOption}; use servers::{auth, Mode}; use snafu::ResultExt; -use crate::error::{self, IllegalAuthConfigSnafu, Result}; +use crate::error::{self, IllegalAuthConfigSnafu, Result, StartCatalogManagerSnafu}; use crate::options::{Options, TopLevelOptions}; pub struct Instance { @@ -34,6 +34,12 @@ pub struct Instance { impl Instance { pub async fn run(&mut self) -> Result<()> { + self.frontend + .catalog_manager() + .start() + .await + .context(StartCatalogManagerSnafu)?; + self.frontend .start() .await diff --git a/src/common/grpc/src/channel_manager.rs b/src/common/grpc/src/channel_manager.rs index 857aff8fd0..977ddb3f0c 100644 --- a/src/common/grpc/src/channel_manager.rs +++ b/src/common/grpc/src/channel_manager.rs @@ 
-13,7 +13,7 @@ // limitations under the License. use std::sync::atomic::{AtomicUsize, Ordering}; -use std::sync::Arc; +use std::sync::{Arc, Mutex}; use std::time::Duration; use common_telemetry::info; @@ -34,7 +34,7 @@ pub struct ChannelManager { config: ChannelConfig, client_tls_config: Option, pool: Arc, - channel_recycle_started: bool, + channel_recycle_started: Arc>, } impl Default for ChannelManager { @@ -54,12 +54,13 @@ impl ChannelManager { config, client_tls_config: None, pool, - channel_recycle_started: false, + channel_recycle_started: Arc::new(Mutex::new(false)), } } - pub fn start_channel_recycle(&mut self) { - if self.channel_recycle_started { + pub fn start_channel_recycle(&self) { + let mut started = self.channel_recycle_started.lock().unwrap(); + if *started { return; } @@ -69,7 +70,7 @@ impl ChannelManager { }); info!("Channel recycle is started, running in the background!"); - self.channel_recycle_started = true; + *started = true; } pub fn with_tls_config(config: ChannelConfig) -> Result { diff --git a/src/common/meta/Cargo.toml b/src/common/meta/Cargo.toml index 6056c748a5..efbcb19aa8 100644 --- a/src/common/meta/Cargo.toml +++ b/src/common/meta/Cargo.toml @@ -5,7 +5,14 @@ edition.workspace = true license.workspace = true [dependencies] +api = { path = "../../api" } +common-catalog = { path = "../catalog" } common-error = { path = "../error" } serde.workspace = true snafu.workspace = true serde_json.workspace = true +table = { path = "../../table" } + +[dev-dependencies] +chrono.workspace = true +datatypes = { path = "../../datatypes" } diff --git a/src/common/meta/src/error.rs b/src/common/meta/src/error.rs new file mode 100644 index 0000000000..187ff4950c --- /dev/null +++ b/src/common/meta/src/error.rs @@ -0,0 +1,51 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use common_error::prelude::*; +use snafu::Location; + +#[derive(Debug, Snafu)] +#[snafu(visibility(pub))] +pub enum Error { + #[snafu(display("Failed to serde json, source: {}", source))] + SerdeJson { + source: serde_json::error::Error, + location: Location, + }, + + #[snafu(display("Corrupted table route data, err: {}", err_msg))] + RouteInfoCorrupted { err_msg: String, location: Location }, + + #[snafu(display("Illegal state from server, code: {}, error: {}", code, err_msg))] + IllegalServerState { + code: i32, + err_msg: String, + location: Location, + }, +} + +pub type Result = std::result::Result; + +impl ErrorExt for Error { + fn status_code(&self) -> StatusCode { + match self { + Error::IllegalServerState { .. } => StatusCode::Internal, + Error::SerdeJson { .. } | Error::RouteInfoCorrupted { .. 
} => StatusCode::Unexpected, + } + } + + fn as_any(&self) -> &dyn std::any::Any { + self + } +} diff --git a/src/common/meta/src/instruction.rs b/src/common/meta/src/instruction.rs index 6ef85bcd25..8ef3072303 100644 --- a/src/common/meta/src/instruction.rs +++ b/src/common/meta/src/instruction.rs @@ -12,10 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::fmt::{Display, Formatter}; + use serde::{Deserialize, Serialize}; -#[derive(Debug, Serialize, Deserialize)] +use crate::{ClusterId, DatanodeId}; + +#[derive(Eq, Hash, PartialEq, Clone, Debug, Serialize, Deserialize)] pub struct RegionIdent { + pub cluster_id: ClusterId, + pub datanode_id: DatanodeId, pub catalog: String, pub schema: String, pub table: String, @@ -24,12 +30,35 @@ pub struct RegionIdent { pub region_number: u32, } +impl Display for RegionIdent { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "RegionIdent(datanode_id='{}.{}', table_id='{}', table_name='{}.{}.{}', table_engine='{}', region_no='{}')", + self.cluster_id, + self.datanode_id, + self.table_id, + self.catalog, + self.schema, + self.table, + self.engine, + self.region_number + ) + } +} + #[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] pub struct SimpleReply { pub result: bool, pub error: Option, } +impl Display for SimpleReply { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "(result={}, error={:?})", self.result, self.error) + } +} + #[derive(Debug, Serialize, Deserialize)] #[serde(tag = "type", rename_all = "snake_case")] pub enum Instruction { @@ -37,6 +66,15 @@ pub enum Instruction { CloseRegion(RegionIdent), } +impl Display for Instruction { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::OpenRegion(region) => write!(f, "Instruction::OpenRegion({})", region), + Self::CloseRegion(region) => write!(f, "Instruction::CloseRegion({})", region), + } + } +} + #[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] #[serde(tag = "type", rename_all = "snake_case")] pub enum InstructionReply { @@ -44,6 +82,15 @@ pub enum InstructionReply { CloseRegion(SimpleReply), } +impl Display for InstructionReply { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::OpenRegion(reply) => write!(f, "InstructionReply::OpenRegion({})", reply), + Self::CloseRegion(reply) => write!(f, "InstructionReply::CloseRegion({})", reply), + } + } +} + #[cfg(test)] mod tests { use super::*; @@ -51,6 +98,8 @@ mod tests { #[test] fn test_serialize_instruction() { let open_region = Instruction::OpenRegion(RegionIdent { + cluster_id: 1, + datanode_id: 2, catalog: "foo".to_string(), schema: "bar".to_string(), table: "hi".to_string(), @@ -62,11 +111,13 @@ mod tests { let serialized = serde_json::to_string(&open_region).unwrap(); assert_eq!( - r#"{"type":"open_region","catalog":"foo","schema":"bar","table":"hi","table_id":1024,"engine":"mito","region_number":1}"#, + r#"{"type":"open_region","cluster_id":1,"datanode_id":2,"catalog":"foo","schema":"bar","table":"hi","table_id":1024,"engine":"mito","region_number":1}"#, serialized ); let close_region = Instruction::CloseRegion(RegionIdent { + cluster_id: 1, + datanode_id: 2, catalog: "foo".to_string(), schema: "bar".to_string(), table: "hi".to_string(), @@ -78,7 +129,7 @@ mod tests { let serialized = serde_json::to_string(&close_region).unwrap(); assert_eq!( - 
r#"{"type":"close_region","catalog":"foo","schema":"bar","table":"hi","table_id":1024,"engine":"mito","region_number":1}"#, + r#"{"type":"close_region","cluster_id":1,"datanode_id":2,"catalog":"foo","schema":"bar","table":"hi","table_id":1024,"engine":"mito","region_number":1}"#, serialized ); } diff --git a/src/common/meta/src/lib.rs b/src/common/meta/src/lib.rs index d817e21fd7..9c8a5db6f3 100644 --- a/src/common/meta/src/lib.rs +++ b/src/common/meta/src/lib.rs @@ -12,4 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. +pub mod error; pub mod instruction; +pub mod peer; +pub mod router; +pub mod table_name; +pub mod util; + +pub type ClusterId = u64; +pub type DatanodeId = u64; + +pub use instruction::RegionIdent; diff --git a/src/common/meta/src/peer.rs b/src/common/meta/src/peer.rs new file mode 100644 index 0000000000..1f68dcbec1 --- /dev/null +++ b/src/common/meta/src/peer.rs @@ -0,0 +1,49 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use api::v1::meta::Peer as PbPeer; +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Clone, Hash, Eq, PartialEq, Deserialize, Serialize)] +pub struct Peer { + pub id: u64, + pub addr: String, +} + +impl From for Peer { + fn from(p: PbPeer) -> Self { + Self { + id: p.id, + addr: p.addr, + } + } +} + +impl From for PbPeer { + fn from(p: Peer) -> Self { + Self { + id: p.id, + addr: p.addr, + } + } +} + +impl Peer { + pub fn new(id: u64, addr: impl Into) -> Self { + Self { + id, + addr: addr.into(), + } + } +} diff --git a/src/meta-client/src/rpc/router.rs b/src/common/meta/src/router.rs similarity index 64% rename from src/meta-client/src/rpc/router.rs rename to src/common/meta/src/router.rs index 2fdc252331..71332d47f2 100644 --- a/src/meta-client/src/rpc/router.rs +++ b/src/common/meta/src/router.rs @@ -16,16 +16,18 @@ use std::collections::{HashMap, HashSet}; use api::v1::meta::{ CreateRequest as PbCreateRequest, DeleteRequest as PbDeleteRequest, Partition as PbPartition, - Region as PbRegion, RouteRequest as PbRouteRequest, RouteResponse as PbRouteResponse, - Table as PbTable, + Peer as PbPeer, Region as PbRegion, RegionRoute as PbRegionRoute, + RouteRequest as PbRouteRequest, RouteResponse as PbRouteResponse, Table as PbTable, + TableRoute as PbTableRoute, }; use serde::{Deserialize, Serialize, Serializer}; use snafu::{OptionExt, ResultExt}; use table::metadata::RawTableInfo; -use crate::error; -use crate::error::Result; -use crate::rpc::{util, Peer, TableName}; +use crate::error::{self, Result}; +use crate::peer::Peer; +use crate::table_name::TableName; +use crate::util; #[derive(Debug, Clone)] pub struct CreateRequest<'a> { @@ -125,57 +127,124 @@ impl TryFrom for RouteResponse { fn try_from(pb: PbRouteResponse) -> Result { util::check_response_header(pb.header.as_ref())?; - let peers: Vec = pb.peers.into_iter().map(Into::into).collect(); - let get_peer = |index: u64| peers.get(index as usize).map(ToOwned::to_owned); - let mut 
table_routes = Vec::with_capacity(pb.table_routes.len()); - for table_route in pb.table_routes.into_iter() { - let table = table_route - .table - .context(error::RouteInfoCorruptedSnafu { - err_msg: "table required", - })? - .try_into()?; - - let mut region_routes = Vec::with_capacity(table_route.region_routes.len()); - for region_route in table_route.region_routes.into_iter() { - let region = region_route - .region - .context(error::RouteInfoCorruptedSnafu { - err_msg: "'region' not found", - })? - .into(); - - let leader_peer = get_peer(region_route.leader_peer_index); - let follower_peers = region_route - .follower_peer_indexes - .into_iter() - .filter_map(get_peer) - .collect::>(); - - region_routes.push(RegionRoute { - region, - leader_peer, - follower_peers, - }); - } - - table_routes.push(TableRoute { - table, - region_routes, - }); - } - + let table_routes = pb + .table_routes + .into_iter() + .map(|x| TableRoute::try_from_raw(&pb.peers, x)) + .collect::>>()?; Ok(Self { table_routes }) } } -#[derive(Debug, Clone, Deserialize, Serialize)] +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)] pub struct TableRoute { pub table: Table, pub region_routes: Vec, } impl TableRoute { + pub fn try_from_raw(peers: &[PbPeer], table_route: PbTableRoute) -> Result { + let table = table_route + .table + .context(error::RouteInfoCorruptedSnafu { + err_msg: "'table' is empty in table route", + })? + .try_into()?; + + let mut region_routes = Vec::with_capacity(table_route.region_routes.len()); + for region_route in table_route.region_routes.into_iter() { + let region = region_route + .region + .context(error::RouteInfoCorruptedSnafu { + err_msg: "'region' is empty in region route", + })? + .into(); + + let leader_peer = peers + .get(region_route.leader_peer_index as usize) + .cloned() + .map(Into::into); + + let follower_peers = region_route + .follower_peer_indexes + .into_iter() + .filter_map(|x| peers.get(x as usize).cloned().map(Into::into)) + .collect::>(); + + region_routes.push(RegionRoute { + region, + leader_peer, + follower_peers, + }); + } + + Ok(Self { + table, + region_routes, + }) + } + + pub fn try_into_raw(self) -> Result<(Vec, PbTableRoute)> { + let mut peers = HashSet::new(); + self.region_routes + .iter() + .filter_map(|x| x.leader_peer.as_ref()) + .for_each(|p| { + peers.insert(p.clone()); + }); + self.region_routes + .iter() + .flat_map(|x| x.follower_peers.iter()) + .for_each(|p| { + peers.insert(p.clone()); + }); + let mut peers = peers.into_iter().map(Into::into).collect::>(); + peers.sort_by_key(|x| x.id); + + let find_peer = |peer_id: u64| -> u64 { + peers + .iter() + .enumerate() + .find_map(|(i, x)| { + if x.id == peer_id { + Some(i as u64) + } else { + None + } + }) + .unwrap_or_else(|| { + panic!("Peer {peer_id} must be present when collecting all peers.") + }) + }; + + let mut region_routes = Vec::with_capacity(self.region_routes.len()); + for region_route in self.region_routes.into_iter() { + let leader_peer_index = region_route.leader_peer.map(|x| find_peer(x.id)).context( + error::RouteInfoCorruptedSnafu { + err_msg: "'leader_peer' is empty in region route", + }, + )?; + + let follower_peer_indexes = region_route + .follower_peers + .iter() + .map(|x| find_peer(x.id)) + .collect::>(); + + region_routes.push(PbRegionRoute { + region: Some(region_route.region.into()), + leader_peer_index, + follower_peer_indexes, + }); + } + + let table_route = PbTableRoute { + table: Some(self.table.into()), + region_routes, + }; + Ok((peers, table_route)) + } + pub fn 
find_leaders(&self) -> HashSet { self.region_routes .iter() @@ -199,7 +268,7 @@ impl TableRoute { } } -#[derive(Debug, Clone, Deserialize, Serialize)] +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)] pub struct Table { pub id: u64, pub table_name: TableName, @@ -225,14 +294,24 @@ impl TryFrom for Table { } } -#[derive(Debug, Clone, Default, Deserialize, Serialize)] +impl From for PbTable { + fn from(table: Table) -> Self { + PbTable { + id: table.id, + table_name: Some(table.table_name.into()), + table_schema: table.table_schema, + } + } +} + +#[derive(Debug, Clone, Default, Deserialize, Serialize, PartialEq)] pub struct RegionRoute { pub region: Region, pub leader_peer: Option, pub follower_peers: Vec, } -#[derive(Debug, Clone, Default, Deserialize, Serialize)] +#[derive(Debug, Clone, Default, Deserialize, Serialize, PartialEq)] pub struct Region { pub id: u64, pub name: String, @@ -251,7 +330,18 @@ impl From for Region { } } -#[derive(Debug, Clone, Deserialize, Serialize)] +impl From for PbRegion { + fn from(region: Region) -> Self { + Self { + id: region.id, + name: region.name, + partition: region.partition.map(Into::into), + attrs: region.attrs, + } + } +} + +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq)] pub struct Partition { #[serde(serialize_with = "as_utf8_vec")] pub column_list: Vec>, @@ -495,4 +585,97 @@ mod tests { assert_eq!(2, region_route.follower_peers.get(0).unwrap().id); assert_eq!("peer2", region_route.follower_peers.get(0).unwrap().addr); } + + #[test] + fn test_table_route_raw_conversion() { + let raw_peers = vec![ + PbPeer { + id: 1, + addr: "a1".to_string(), + }, + PbPeer { + id: 2, + addr: "a2".to_string(), + }, + PbPeer { + id: 3, + addr: "a3".to_string(), + }, + ]; + + // region distribution: + // region id => leader peer id + [follower peer id] + // 1 => 2 + [1, 3] + // 2 => 1 + [2, 3] + + let raw_table_route = PbTableRoute { + table: Some(PbTable { + id: 1, + table_name: Some(PbTableName { + catalog_name: "c1".to_string(), + schema_name: "s1".to_string(), + table_name: "t1".to_string(), + }), + table_schema: vec![], + }), + region_routes: vec![ + PbRegionRoute { + region: Some(PbRegion { + id: 1, + name: "r1".to_string(), + partition: None, + attrs: HashMap::new(), + }), + leader_peer_index: 1, + follower_peer_indexes: vec![0, 2], + }, + PbRegionRoute { + region: Some(PbRegion { + id: 2, + name: "r2".to_string(), + partition: None, + attrs: HashMap::new(), + }), + leader_peer_index: 0, + follower_peer_indexes: vec![1, 2], + }, + ], + }; + let table_route = TableRoute { + table: Table { + id: 1, + table_name: TableName::new("c1", "s1", "t1"), + table_schema: vec![], + }, + region_routes: vec![ + RegionRoute { + region: Region { + id: 1, + name: "r1".to_string(), + partition: None, + attrs: HashMap::new(), + }, + leader_peer: Some(Peer::new(2, "a2")), + follower_peers: vec![Peer::new(1, "a1"), Peer::new(3, "a3")], + }, + RegionRoute { + region: Region { + id: 2, + name: "r2".to_string(), + partition: None, + attrs: HashMap::new(), + }, + leader_peer: Some(Peer::new(1, "a1")), + follower_peers: vec![Peer::new(2, "a2"), Peer::new(3, "a3")], + }, + ], + }; + + let from_raw = TableRoute::try_from_raw(&raw_peers, raw_table_route.clone()).unwrap(); + assert_eq!(from_raw, table_route); + + let into_raw = table_route.try_into_raw().unwrap(); + assert_eq!(into_raw.0, raw_peers); + assert_eq!(into_raw.1, raw_table_route); + } } diff --git a/src/common/meta/src/table_name.rs b/src/common/meta/src/table_name.rs new file mode 100644 index 
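// Editor's note: the try_from_raw/try_into_raw pair above converts between the protobuf
// table route (peers stored once and referenced by index) and the in-memory TableRoute
// (peers stored inline on each region route). A stripped-down sketch of that index
// scheme, using a hypothetical Peer type rather than the generated protobuf structs:
#[derive(Clone, Debug, PartialEq)]
struct Peer {
    id: u64,
    addr: String,
}

// Decode side: an index points into the shared peer list; out-of-range indexes are
// simply dropped, mirroring the filter_map over follower_peer_indexes.
fn resolve(peers: &[Peer], index: u64) -> Option<Peer> {
    peers.get(index as usize).cloned()
}

// Encode side: a peer appears once in the (id-sorted) list and is referenced by position.
fn index_of(peers: &[Peer], peer_id: u64) -> Option<u64> {
    peers.iter().position(|p| p.id == peer_id).map(|i| i as u64)
}

fn main() {
    let peers = vec![
        Peer { id: 1, addr: "a1".to_string() },
        Peer { id: 2, addr: "a2".to_string() },
    ];
    let leader = resolve(&peers, 1).unwrap();
    assert_eq!(leader.id, 2);
    assert_eq!(index_of(&peers, leader.id), Some(1));
}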
0000000000..78734466a7 --- /dev/null +++ b/src/common/meta/src/table_name.rs @@ -0,0 +1,69 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt::{Display, Formatter}; + +use api::v1::meta::TableName as PbTableName; +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Clone, Hash, Eq, PartialEq, Deserialize, Serialize)] +pub struct TableName { + pub catalog_name: String, + pub schema_name: String, + pub table_name: String, +} + +impl Display for TableName { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.write_str(&common_catalog::format_full_table_name( + &self.catalog_name, + &self.schema_name, + &self.table_name, + )) + } +} + +impl TableName { + pub fn new( + catalog_name: impl Into, + schema_name: impl Into, + table_name: impl Into, + ) -> Self { + Self { + catalog_name: catalog_name.into(), + schema_name: schema_name.into(), + table_name: table_name.into(), + } + } +} + +impl From for PbTableName { + fn from(table_name: TableName) -> Self { + Self { + catalog_name: table_name.catalog_name, + schema_name: table_name.schema_name, + table_name: table_name.table_name, + } + } +} + +impl From for TableName { + fn from(table_name: PbTableName) -> Self { + Self { + catalog_name: table_name.catalog_name, + schema_name: table_name.schema_name, + table_name: table_name.table_name, + } + } +} diff --git a/src/meta-client/src/rpc/util.rs b/src/common/meta/src/util.rs similarity index 93% rename from src/meta-client/src/rpc/util.rs rename to src/common/meta/src/util.rs index 59fde86481..3df5a8630c 100644 --- a/src/meta-client/src/rpc/util.rs +++ b/src/common/meta/src/util.rs @@ -18,7 +18,7 @@ use crate::error; use crate::error::Result; #[inline] -pub(crate) fn check_response_header(header: Option<&ResponseHeader>) -> Result<()> { +pub fn check_response_header(header: Option<&ResponseHeader>) -> Result<()> { if let Some(header) = header { if let Some(error) = &header.error { let code = error.code; diff --git a/src/common/telemetry/src/logging.rs b/src/common/telemetry/src/logging.rs index 96f8908138..ff84a7db0b 100644 --- a/src/common/telemetry/src/logging.rs +++ b/src/common/telemetry/src/logging.rs @@ -136,6 +136,7 @@ pub fn init_global_logging( .with_target("reqwest", Level::WARN) .with_target("sqlparser", Level::WARN) .with_target("h2", Level::INFO) + .with_target("opendal", Level::INFO) .with_default( directives .parse::() diff --git a/src/datanode/src/heartbeat.rs b/src/datanode/src/heartbeat.rs index 4e0f9f6d9b..64c82a930c 100644 --- a/src/datanode/src/heartbeat.rs +++ b/src/datanode/src/heartbeat.rs @@ -81,6 +81,8 @@ impl HeartbeatTask { handler_executor: HeartbeatResponseHandlerExecutorRef, mailbox: MailboxRef, ) -> Result { + let client_id = meta_client.id(); + let (tx, mut rx) = meta_client.heartbeat().await.context(MetaClientInitSnafu)?; common_runtime::spawn_bg(async move { while let Some(res) = match rx.message().await { @@ -90,6 +92,10 @@ impl HeartbeatTask { None } } { + if let Some(msg) = 
res.mailbox_message.as_ref() { + info!("Received mailbox message: {msg:?}, meta_client id: {client_id:?}"); + } + let ctx = HeartbeatResponseHandlerContext::new(mailbox.clone(), res); if let Err(e) = Self::handle_response(ctx, handler_executor.clone()) { error!(e;"Error while handling heartbeat response"); @@ -124,6 +130,8 @@ impl HeartbeatTask { let interval = self.interval; let node_id = self.node_id; let addr = resolve_addr(&self.server_addr, &self.server_hostname); + info!("Starting heartbeat to Metasrv with interval {interval}. My node id is {node_id}, address is {addr}."); + let meta_client = self.meta_client.clone(); let catalog_manager_clone = self.catalog_manager.clone(); diff --git a/src/datanode/src/heartbeat/handler/open_region.rs b/src/datanode/src/heartbeat/handler/open_region.rs index c48a3f8172..a1172b68cd 100644 --- a/src/datanode/src/heartbeat/handler/open_region.rs +++ b/src/datanode/src/heartbeat/handler/open_region.rs @@ -14,6 +14,7 @@ use std::sync::Arc; +use catalog::error::Error as CatalogError; use catalog::{CatalogManagerRef, RegisterTableRequest}; use common_catalog::format_full_table_name; use common_meta::instruction::{Instruction, InstructionReply, RegionIdent, SimpleReply}; @@ -66,6 +67,16 @@ impl HeartbeatResponseHandler for OpenRegionHandler { } impl OpenRegionHandler { + pub fn new( + catalog_manager: CatalogManagerRef, + table_engine_manager: TableEngineManagerRef, + ) -> Self { + Self { + catalog_manager, + table_engine_manager, + } + } + fn map_result(result: Result) -> InstructionReply { result.map_or_else( |error| { @@ -91,6 +102,7 @@ impl OpenRegionHandler { table_id, region_number, engine, + .. } = ident; ( @@ -181,13 +193,13 @@ impl OpenRegionHandler { table_id: request.table_id, table, }; - self.catalog_manager - .register_table(request) - .await - .with_context(|_| error::RegisterTableSnafu { + let result = self.catalog_manager.register_table(request).await; + match result { + Ok(_) | Err(CatalogError::TableExists { .. 
}) => Ok(true), + e => e.with_context(|_| error::RegisterTableSnafu { table_name: format_full_table_name(catalog_name, schema_name, table_name), - })?; - Ok(true) + }), + } } else { // Case 1: // TODO(weny): Fix/Cleanup the broken table manifest diff --git a/src/datanode/src/instance.rs b/src/datanode/src/instance.rs index 2ba9f02eae..c193a0601e 100644 --- a/src/datanode/src/instance.rs +++ b/src/datanode/src/instance.rs @@ -64,6 +64,7 @@ use crate::error::{ NewCatalogSnafu, OpenLogStoreSnafu, RecoverProcedureSnafu, Result, ShutdownInstanceSnafu, StartProcedureManagerSnafu, StopProcedureManagerSnafu, }; +use crate::heartbeat::handler::open_region::OpenRegionHandler; use crate::heartbeat::handler::parse_mailbox_message::ParseMailboxMessageHandler; use crate::heartbeat::handler::HandlerGroupExecutor; use crate::heartbeat::HeartbeatTask; @@ -203,9 +204,6 @@ impl Instance { let factory = QueryEngineFactory::new(catalog_manager.clone(), false); let query_engine = factory.query_engine(); - let handlder_executor = - HandlerGroupExecutor::new(vec![Arc::new(ParseMailboxMessageHandler::default())]); - let heartbeat_task = match opts.mode { Mode::Standalone => None, Mode::Distributed => Some(HeartbeatTask::new( @@ -214,7 +212,13 @@ impl Instance { opts.rpc_hostname.clone(), meta_client.as_ref().unwrap().clone(), catalog_manager.clone(), - Arc::new(handlder_executor), + Arc::new(HandlerGroupExecutor::new(vec![ + Arc::new(ParseMailboxMessageHandler::default()), + Arc::new(OpenRegionHandler::new( + catalog_manager.clone(), + engine_manager.clone(), + )), + ])), )), }; @@ -521,7 +525,7 @@ async fn new_metasrv_client(node_id: u64, meta_config: &MetaClientOptions) -> Re .connect_timeout(Duration::from_millis(meta_config.connect_timeout_millis)) .tcp_nodelay(meta_config.tcp_nodelay); - let mut channel_manager = ChannelManager::with_config(config); + let channel_manager = ChannelManager::with_config(config); channel_manager.start_channel_recycle(); let mut meta_client = MetaClientBuilder::new(cluster_id, member_id, Role::Datanode) diff --git a/src/datanode/src/mock.rs b/src/datanode/src/mock.rs index 9ffd562abb..ed451580e3 100644 --- a/src/datanode/src/mock.rs +++ b/src/datanode/src/mock.rs @@ -40,6 +40,7 @@ async fn mock_meta_client(mock_info: MockInfo, node_id: u64) -> MetaClient { let MockInfo { server_addr, channel_manager, + .. } = mock_info; let id = (1000u64, 2000u64); diff --git a/src/datanode/src/sql.rs b/src/datanode/src/sql.rs index 854b7261c5..f5fd04978e 100644 --- a/src/datanode/src/sql.rs +++ b/src/datanode/src/sql.rs @@ -67,10 +67,6 @@ impl SqlHandler { } } - // TODO(LFC): Refactor consideration: a context awareness "Planner". - // Now we have some query related state (like current using database in session context), maybe - // we could create a new struct called `Planner` that stores context and handle these queries - // there, instead of executing here in a "static" fashion. 
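// Editor's note: OpenRegionHandler above now treats a TableExists error from
// register_table as success, so replaying an OpenRegion instruction stays idempotent.
// A self-contained sketch of that match shape, with stand-in error types instead of the
// real catalog crate:
#[derive(Debug)]
enum CatalogError {
    TableExists,
    Other(String),
}

fn register_table(already_registered: bool) -> Result<(), CatalogError> {
    if already_registered {
        Err(CatalogError::TableExists)
    } else {
        Ok(())
    }
}

// Both a fresh registration and "the table is already there" count as an opened region;
// only genuinely unexpected errors propagate to the caller.
fn open_region(already_registered: bool) -> Result<bool, CatalogError> {
    match register_table(already_registered) {
        Ok(()) | Err(CatalogError::TableExists) => Ok(true),
        Err(e) => Err(e),
    }
}

fn main() {
    assert!(open_region(false).unwrap());
    assert!(open_region(true).unwrap());
}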
pub async fn execute(&self, request: SqlRequest, query_ctx: QueryContextRef) -> Result { let result = match request { SqlRequest::CreateTable(req) => self.create_table(req).await, diff --git a/src/frontend/Cargo.toml b/src/frontend/Cargo.toml index 03f402289d..64351240d3 100644 --- a/src/frontend/Cargo.toml +++ b/src/frontend/Cargo.toml @@ -25,6 +25,7 @@ common-function = { path = "../common/function" } common-grpc = { path = "../common/grpc" } common-grpc-expr = { path = "../common/grpc-expr" } common-query = { path = "../common/query" } +common-meta = { path = "../common/meta" } common-recordbatch = { path = "../common/recordbatch" } common-runtime = { path = "../common/runtime" } common-telemetry = { path = "../common/telemetry" } diff --git a/src/frontend/src/catalog.rs b/src/frontend/src/catalog.rs index d87a0dc919..e90c1832ed 100644 --- a/src/frontend/src/catalog.rs +++ b/src/frontend/src/catalog.rs @@ -34,10 +34,10 @@ use catalog::{ }; use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; use common_error::prelude::BoxedError; +use common_meta::table_name::TableName; use common_telemetry::warn; use futures::StreamExt; use futures_util::TryStreamExt; -use meta_client::rpc::TableName; use partition::manager::PartitionRuleManagerRef; use snafu::prelude::*; use table::table::numbers::NumbersTable; @@ -97,6 +97,7 @@ impl FrontendCatalogManager { #[async_trait::async_trait] impl CatalogManager for FrontendCatalogManager { async fn start(&self) -> catalog::error::Result<()> { + self.datanode_clients.start(); Ok(()) } diff --git a/src/frontend/src/datanode.rs b/src/frontend/src/datanode.rs index 969026017f..0b1c5f7e38 100644 --- a/src/frontend/src/datanode.rs +++ b/src/frontend/src/datanode.rs @@ -12,18 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use std::sync::{Arc, Mutex}; use std::time::Duration; use client::Client; use common_grpc::channel_manager::ChannelManager; +use common_meta::peer::Peer; use common_telemetry::info; -use meta_client::rpc::Peer; use moka::future::{Cache, CacheBuilder}; pub struct DatanodeClients { channel_manager: ChannelManager, clients: Cache, - started: bool, + started: Arc>, } impl Default for DatanodeClients { @@ -34,21 +35,22 @@ impl Default for DatanodeClients { .time_to_live(Duration::from_secs(30 * 60)) .time_to_idle(Duration::from_secs(5 * 60)) .build(), - started: false, + started: Arc::new(Mutex::new(false)), } } } impl DatanodeClients { - pub(crate) fn start(&mut self) { - if self.started { + pub(crate) fn start(&self) { + let mut started = self.started.lock().unwrap(); + if *started { return; } self.channel_manager.start_channel_recycle(); info!("Datanode clients manager is started!"); - self.started = true; + *started = true; } pub(crate) async fn get_client(&self, datanode: &Peer) -> Client { diff --git a/src/frontend/src/instance.rs b/src/frontend/src/instance.rs index e230a5fcd5..741f6ed98b 100644 --- a/src/frontend/src/instance.rs +++ b/src/frontend/src/instance.rs @@ -127,16 +127,22 @@ impl Instance { ) -> Result { let meta_client = Self::create_meta_client(opts).await?; + let datanode_clients = Arc::new(DatanodeClients::default()); + + Self::try_new_distributed_with(meta_client, datanode_clients, plugins).await + } + + pub async fn try_new_distributed_with( + meta_client: Arc, + datanode_clients: Arc, + plugins: Arc, + ) -> Result { let meta_backend = Arc::new(MetaKvBackend { client: meta_client.clone(), }); let table_routes = Arc::new(TableRoutes::new(meta_client.clone())); let partition_manager = Arc::new(PartitionRuleManager::new(table_routes)); - let mut datanode_clients = DatanodeClients::default(); - datanode_clients.start(); - let datanode_clients = Arc::new(datanode_clients); - let mut catalog_manager = FrontendCatalogManager::new(meta_backend, partition_manager, datanode_clients.clone()); @@ -197,7 +203,7 @@ impl Instance { .connect_timeout(Duration::from_millis(meta_config.connect_timeout_millis)) .tcp_nodelay(meta_config.tcp_nodelay); - let mut channel_manager = ChannelManager::with_config(channel_config); + let channel_manager = ChannelManager::with_config(channel_config); channel_manager.start_channel_recycle(); let mut meta_client = MetaClientBuilder::new(0, 0, Role::Frontend) @@ -245,36 +251,6 @@ impl Instance { Ok(()) } - pub async fn new_distributed( - catalog_manager: CatalogManagerRef, - dist_instance: Arc, - ) -> Self { - let query_engine = QueryEngineFactory::new(catalog_manager.clone(), false).query_engine(); - let script_executor = Arc::new( - ScriptExecutor::new(catalog_manager.clone(), query_engine.clone()) - .await - .unwrap(), - ); - - let statement_executor = Arc::new(StatementExecutor::new( - catalog_manager.clone(), - query_engine.clone(), - dist_instance.clone(), - )); - - Instance { - catalog_manager, - script_executor, - statement_executor, - query_engine, - create_expr_factory: Arc::new(DefaultCreateExprFactory), - grpc_query_handler: dist_instance, - plugins: Default::default(), - servers: Arc::new(HashMap::new()), - heartbeat_task: None, - } - } - pub fn catalog_manager(&self) -> &CatalogManagerRef { &self.catalog_manager } diff --git a/src/frontend/src/instance/distributed.rs b/src/frontend/src/instance/distributed.rs index e29aba8570..00e03a5951 100644 --- a/src/frontend/src/instance/distributed.rs +++ b/src/frontend/src/instance/distributed.rs @@ 
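// Editor's note: both ChannelManager::start_channel_recycle and DatanodeClients::start
// above switch from `&mut self` plus a plain bool to `&self` plus a shared, lock-guarded
// flag, so handles held behind Arc (or cloned) can call start without exclusive access
// while only the first call does the work. A minimal sketch of that guard:
use std::sync::{Arc, Mutex};

#[derive(Clone, Default)]
struct Starter {
    started: Arc<Mutex<bool>>,
}

impl Starter {
    // Takes &self: shared owners can call it; the Mutex serializes concurrent callers.
    fn start(&self) {
        let mut started = self.started.lock().unwrap();
        if *started {
            return; // an earlier call already kicked off the background work
        }
        // ... spawn the background task here (e.g. the channel recycle loop) ...
        *started = true;
    }
}

fn main() {
    let s = Starter::default();
    s.start();
    s.clone().start(); // second call is a no-op
}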
-30,6 +30,11 @@ use client::Database; use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; use common_catalog::format_full_table_name; use common_error::prelude::BoxedError; +use common_meta::router::{ + CreateRequest as MetaCreateRequest, DeleteRequest as MetaDeleteRequest, + Partition as MetaPartition, RouteRequest, RouteResponse, +}; +use common_meta::table_name::TableName; use common_query::Output; use common_telemetry::debug; use datanode::instance::sql::table_idents_to_full_name; @@ -37,11 +42,7 @@ use datanode::sql::SqlHandler; use datatypes::prelude::ConcreteDataType; use datatypes::schema::RawSchema; use meta_client::client::MetaClient; -use meta_client::rpc::router::DeleteRequest as MetaDeleteRequest; -use meta_client::rpc::{ - CompareAndPutRequest, CreateRequest as MetaCreateRequest, Partition as MetaPartition, - RouteRequest, RouteResponse, TableName, -}; +use meta_client::rpc::CompareAndPutRequest; use partition::manager::PartitionInfo; use partition::partition::{PartitionBound, PartitionDef}; use query::error::QueryExecutionSnafu; diff --git a/src/frontend/src/table.rs b/src/frontend/src/table.rs index 9e2013419e..8a467a78ec 100644 --- a/src/frontend/src/table.rs +++ b/src/frontend/src/table.rs @@ -22,6 +22,7 @@ use catalog::helper::{TableGlobalKey, TableGlobalValue}; use catalog::remote::KvBackendRef; use client::Database; use common_error::prelude::BoxedError; +use common_meta::table_name::TableName; use common_query::error::Result as QueryResult; use common_query::logical_plan::Expr; use common_query::physical_plan::{PhysicalPlan, PhysicalPlanRef}; @@ -35,7 +36,6 @@ use datafusion::physical_plan::{ }; use datafusion_common::DataFusionError; use datatypes::schema::{ColumnSchema, Schema, SchemaRef}; -use meta_client::rpc::TableName; use partition::manager::PartitionRuleManagerRef; use partition::splitter::WriteSplitter; use snafu::prelude::*; @@ -498,11 +498,10 @@ mod test { use catalog::error::Result; use catalog::remote::{KvBackend, ValueIter}; + use common_meta::router::{Region, RegionRoute, Table, TableRoute}; use datafusion_expr::expr_fn::{and, binary_expr, col, or}; use datafusion_expr::{lit, Operator}; use meta_client::client::MetaClient; - use meta_client::rpc::router::RegionRoute; - use meta_client::rpc::{Region, Table, TableRoute}; use meter_core::collect::Collect; use meter_core::data::{ReadRecord, WriteRecord}; use meter_core::global::global_registry; diff --git a/src/frontend/src/table/delete.rs b/src/frontend/src/table/delete.rs index 481a72cc8e..726090efb7 100644 --- a/src/frontend/src/table/delete.rs +++ b/src/frontend/src/table/delete.rs @@ -13,9 +13,9 @@ // limitations under the License. 
use api::v1::DeleteRequest as GrpcDeleteRequest; +use common_meta::table_name::TableName; use common_query::Output; use futures::future; -use meta_client::rpc::TableName; use snafu::ResultExt; use store_api::storage::RegionNumber; use table::requests::DeleteRequest; diff --git a/src/frontend/src/table/scan.rs b/src/frontend/src/table/scan.rs index b8f2918066..f933890135 100644 --- a/src/frontend/src/table/scan.rs +++ b/src/frontend/src/table/scan.rs @@ -17,12 +17,12 @@ use std::sync::Arc; use api::v1::{DeleteRequest, InsertRequest}; use client::Database; +use common_meta::table_name::TableName; use common_query::prelude::Expr; use common_query::Output; use common_recordbatch::RecordBatches; use datafusion::datasource::DefaultTableSource; use datafusion_expr::{LogicalPlan, LogicalPlanBuilder}; -use meta_client::rpc::TableName; use snafu::ResultExt; use substrait::{DFLogicalSubstraitConvertor, SubstraitPlan}; use table::table::adapter::DfTableProviderAdapter; diff --git a/src/meta-client/Cargo.toml b/src/meta-client/Cargo.toml index fd7ee25e82..3785db611e 100644 --- a/src/meta-client/Cargo.toml +++ b/src/meta-client/Cargo.toml @@ -11,6 +11,7 @@ chrono.workspace = true common-error = { path = "../common/error" } common-grpc = { path = "../common/grpc" } common-telemetry = { path = "../common/telemetry" } +common-meta = { path = "../common/meta" } etcd-client = "0.10" rand.workspace = true serde.workspace = true diff --git a/src/meta-client/examples/meta_client.rs b/src/meta-client/examples/meta_client.rs index f48bab7ea3..c8fbc4b1ac 100644 --- a/src/meta-client/examples/meta_client.rs +++ b/src/meta-client/examples/meta_client.rs @@ -18,12 +18,14 @@ use std::time::Duration; use api::v1::meta::{HeartbeatRequest, Peer, Role}; use chrono::DateTime; use common_grpc::channel_manager::{ChannelConfig, ChannelManager}; +use common_meta::router::{CreateRequest, Partition}; +use common_meta::table_name::TableName; use datatypes::prelude::ConcreteDataType; use datatypes::schema::{ColumnSchema, RawSchema}; use meta_client::client::MetaClientBuilder; use meta_client::rpc::{ - BatchDeleteRequest, BatchGetRequest, BatchPutRequest, CompareAndPutRequest, CreateRequest, - DeleteRangeRequest, Partition, PutRequest, RangeRequest, TableName, + BatchDeleteRequest, BatchGetRequest, BatchPutRequest, CompareAndPutRequest, DeleteRangeRequest, + PutRequest, RangeRequest, }; use table::metadata::{RawTableInfo, RawTableMeta, TableIdent, TableType}; use table::requests::TableOptions; diff --git a/src/meta-client/src/client.rs b/src/meta-client/src/client.rs index c48bc8e885..e978d56405 100644 --- a/src/meta-client/src/client.rs +++ b/src/meta-client/src/client.rs @@ -20,23 +20,23 @@ mod store; use api::v1::meta::Role; use common_grpc::channel_manager::{ChannelConfig, ChannelManager}; +use common_meta::router::{CreateRequest, DeleteRequest, RouteRequest, RouteResponse}; use common_telemetry::info; use heartbeat::Client as HeartbeatClient; use lock::Client as LockClient; use router::Client as RouterClient; -use snafu::OptionExt; +use snafu::{OptionExt, ResultExt}; use store::Client as StoreClient; pub use self::heartbeat::{HeartbeatSender, HeartbeatStream}; use crate::error; -use crate::error::Result; +use crate::error::{ConvertMetaRequestSnafu, ConvertMetaResponseSnafu, Result}; use crate::rpc::lock::{LockRequest, LockResponse, UnlockRequest}; -use crate::rpc::router::DeleteRequest; use crate::rpc::{ BatchDeleteRequest, BatchDeleteResponse, BatchGetRequest, BatchGetResponse, BatchPutRequest, - BatchPutResponse, 
CompareAndPutRequest, CompareAndPutResponse, CreateRequest, - DeleteRangeRequest, DeleteRangeResponse, MoveValueRequest, MoveValueResponse, PutRequest, - PutResponse, RangeRequest, RangeResponse, RouteRequest, RouteResponse, + BatchPutResponse, CompareAndPutRequest, CompareAndPutResponse, DeleteRangeRequest, + DeleteRangeResponse, MoveValueRequest, MoveValueResponse, PutRequest, PutResponse, + RangeRequest, RangeResponse, }; pub type Id = (u64, u64); @@ -203,10 +203,12 @@ impl MetaClient { /// information contained in the request and using some intelligent policies, /// such as load-based. pub async fn create_route(&self, req: CreateRequest<'_>) -> Result { + let req = req.try_into().context(ConvertMetaRequestSnafu)?; self.router_client()? - .create(req.try_into()?) + .create(req) .await? .try_into() + .context(ConvertMetaResponseSnafu) } /// Fetch routing information for tables. The smallest unit is the complete @@ -229,14 +231,22 @@ impl MetaClient { /// ``` /// pub async fn route(&self, req: RouteRequest) -> Result { - self.router_client()?.route(req.into()).await?.try_into() + self.router_client()? + .route(req.into()) + .await? + .try_into() + .context(ConvertMetaResponseSnafu) } /// Can be called repeatedly, the first call will delete and return the /// table of routing information, the nth call can still return the /// deleted route information. pub async fn delete_route(&self, req: DeleteRequest) -> Result { - self.router_client()?.delete(req.into()).await?.try_into() + self.router_client()? + .delete(req.into()) + .await? + .try_into() + .context(ConvertMetaResponseSnafu) } /// Range gets the keys in the range from the key-value store. @@ -350,9 +360,11 @@ mod tests { use api::v1::meta::{HeartbeatRequest, Peer}; use chrono::DateTime; + use common_meta::router::Partition; + use common_meta::table_name::TableName; use datatypes::prelude::ConcreteDataType; use datatypes::schema::{ColumnSchema, RawSchema}; - use meta_srv::metasrv::Context; + use meta_srv::metasrv::SelectorContext; use meta_srv::selector::{Namespace, Selector}; use meta_srv::Result as MetaResult; use table::metadata::{RawTableInfo, RawTableMeta, TableIdent, TableType}; @@ -360,7 +372,6 @@ mod tests { use super::*; use crate::mocks; - use crate::rpc::{Partition, TableName}; const TEST_KEY_PREFIX: &str = "__unit_test__meta__"; @@ -570,7 +581,7 @@ mod tests { #[async_trait::async_trait] impl Selector for MockSelector { - type Context = Context; + type Context = SelectorContext; type Output = Vec; async fn select(&self, _ns: Namespace, _ctx: &Self::Context) -> MetaResult { diff --git a/src/meta-client/src/client/heartbeat.rs b/src/meta-client/src/client/heartbeat.rs index 72129a9c1e..8e13671ee5 100644 --- a/src/meta-client/src/client/heartbeat.rs +++ b/src/meta-client/src/client/heartbeat.rs @@ -18,6 +18,7 @@ use std::sync::Arc; use api::v1::meta::heartbeat_client::HeartbeatClient; use api::v1::meta::{AskLeaderRequest, HeartbeatRequest, HeartbeatResponse, RequestHeader, Role}; use common_grpc::channel_manager::ChannelManager; +use common_meta::util; use common_telemetry::{debug, info}; use snafu::{ensure, OptionExt, ResultExt}; use tokio::sync::{mpsc, RwLock}; @@ -27,8 +28,7 @@ use tonic::Streaming; use crate::client::Id; use crate::error; -use crate::error::Result; -use crate::rpc::util; +use crate::error::{InvalidResponseHeaderSnafu, Result}; pub struct HeartbeatSender { id: Id, @@ -81,7 +81,8 @@ impl HeartbeatStream { pub async fn message(&mut self) -> Result> { let res = 
self.stream.message().await.context(error::TonicStatusSnafu); if let Ok(Some(heartbeat)) = &res { - util::check_response_header(heartbeat.header.as_ref())?; + util::check_response_header(heartbeat.header.as_ref()) + .context(InvalidResponseHeaderSnafu)?; } res } diff --git a/src/meta-client/src/error.rs b/src/meta-client/src/error.rs index ebca90801f..c49dbf72ed 100644 --- a/src/meta-client/src/error.rs +++ b/src/meta-client/src/error.rs @@ -55,20 +55,22 @@ pub enum Error { #[snafu(display("Failed create heartbeat stream to server"))] CreateHeartbeatStream { location: Location }, - #[snafu(display("Route info corrupted: {}", err_msg))] - RouteInfoCorrupted { err_msg: String, location: Location }, - - #[snafu(display("Illegal state from server, code: {}, error: {}", code, err_msg))] - IllegalServerState { - code: i32, - err_msg: String, - location: Location, + #[snafu(display("Invalid response header, source: {}", source))] + InvalidResponseHeader { + #[snafu(backtrace)] + source: common_meta::error::Error, }, - #[snafu(display("Failed to serde json, source: {}", source))] - SerdeJson { - source: serde_json::error::Error, - location: Location, + #[snafu(display("Failed to convert Metasrv request, source: {}", source))] + ConvertMetaRequest { + #[snafu(backtrace)] + source: common_meta::error::Error, + }, + + #[snafu(display("Failed to convert Metasrv response, source: {}", source))] + ConvertMetaResponse { + #[snafu(backtrace)] + source: common_meta::error::Error, }, } @@ -90,10 +92,11 @@ impl ErrorExt for Error { | Error::NotStarted { .. } | Error::SendHeartbeat { .. } | Error::CreateHeartbeatStream { .. } - | Error::CreateChannel { .. } - | Error::IllegalServerState { .. } - | Error::SerdeJson { .. } => StatusCode::Internal, - Error::RouteInfoCorrupted { .. } => StatusCode::Unexpected, + | Error::CreateChannel { .. } => StatusCode::Internal, + + Error::InvalidResponseHeader { source } + | Error::ConvertMetaRequest { source } + | Error::ConvertMetaResponse { source } => source.status_code(), } } } diff --git a/src/meta-client/src/mocks.rs b/src/meta-client/src/mocks.rs index 168c382ce6..943b9d579d 100644 --- a/src/meta-client/src/mocks.rs +++ b/src/meta-client/src/mocks.rs @@ -39,6 +39,7 @@ pub async fn mock_client_by(mock_info: MockInfo) -> MetaClient { let MockInfo { server_addr, channel_manager, + .. } = mock_info; let id = (1000u64, 2000u64); diff --git a/src/meta-client/src/rpc.rs b/src/meta-client/src/rpc.rs index 4335f7217d..1a1935b021 100644 --- a/src/meta-client/src/rpc.rs +++ b/src/meta-client/src/rpc.rs @@ -13,20 +13,9 @@ // limitations under the License. 
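// Editor's note: the meta-client error rework above replaces the locally defined
// RouteInfoCorrupted / IllegalServerState / SerdeJson variants with wrappers around
// common_meta::error::Error and forwards status_code() to the wrapped source. A schematic
// of that delegation with stand-in types (not the real common-error traits):
#[derive(Debug, Clone, Copy, PartialEq)]
enum StatusCode {
    Internal,
    Unexpected,
}

#[derive(Debug)]
enum MetaError {
    RouteInfoCorrupted,
}

impl MetaError {
    fn status_code(&self) -> StatusCode {
        StatusCode::Unexpected
    }
}

#[derive(Debug)]
enum ClientError {
    CreateChannel,
    ConvertMetaResponse { source: MetaError },
}

impl ClientError {
    fn status_code(&self) -> StatusCode {
        match self {
            // Local failures keep their own classification...
            ClientError::CreateChannel => StatusCode::Internal,
            // ...while wrapper variants defer to whatever the common-meta error reports.
            ClientError::ConvertMetaResponse { source } => source.status_code(),
        }
    }
}

fn main() {
    let err = ClientError::ConvertMetaResponse {
        source: MetaError::RouteInfoCorrupted,
    };
    assert_eq!(err.status_code(), StatusCode::Unexpected);
}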
pub mod lock; -pub mod router; mod store; -pub mod util; -use std::fmt::{Display, Formatter}; - -use api::v1::meta::{ - KeyValue as PbKeyValue, Peer as PbPeer, ResponseHeader as PbResponseHeader, - TableName as PbTableName, -}; -pub use router::{ - CreateRequest, Partition, Region, RouteRequest, RouteResponse, Table, TableRoute, -}; -use serde::{Deserialize, Serialize}; +use api::v1::meta::{KeyValue as PbKeyValue, ResponseHeader as PbResponseHeader}; pub use store::{ BatchDeleteRequest, BatchDeleteResponse, BatchGetRequest, BatchGetResponse, BatchPutRequest, BatchPutResponse, CompareAndPutRequest, CompareAndPutResponse, DeleteRangeRequest, @@ -100,81 +89,6 @@ impl KeyValue { } } -#[derive(Debug, Clone, Hash, Eq, PartialEq, Deserialize, Serialize)] -pub struct TableName { - pub catalog_name: String, - pub schema_name: String, - pub table_name: String, -} - -impl Display for TableName { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - write!( - f, - "{}.{}.{}", - self.catalog_name, self.schema_name, self.table_name - ) - } -} - -impl TableName { - pub fn new( - catalog_name: impl Into, - schema_name: impl Into, - table_name: impl Into, - ) -> Self { - Self { - catalog_name: catalog_name.into(), - schema_name: schema_name.into(), - table_name: table_name.into(), - } - } -} - -impl From for PbTableName { - fn from(tb: TableName) -> Self { - Self { - catalog_name: tb.catalog_name, - schema_name: tb.schema_name, - table_name: tb.table_name, - } - } -} - -impl From for TableName { - fn from(tb: PbTableName) -> Self { - Self { - catalog_name: tb.catalog_name, - schema_name: tb.schema_name, - table_name: tb.table_name, - } - } -} - -#[derive(Debug, Clone, Hash, Eq, PartialEq, Deserialize, Serialize)] -pub struct Peer { - pub id: u64, - pub addr: String, -} - -impl From for Peer { - fn from(p: PbPeer) -> Self { - Self { - id: p.id, - addr: p.addr, - } - } -} - -impl Peer { - pub fn new(id: u64, addr: impl Into) -> Self { - Self { - id, - addr: addr.into(), - } - } -} - #[cfg(test)] mod tests { use api::v1::meta::{Error, ResponseHeader as PbResponseHeader}; diff --git a/src/meta-client/src/rpc/store.rs b/src/meta-client/src/rpc/store.rs index f3e8496d98..a69ec24628 100644 --- a/src/meta-client/src/rpc/store.rs +++ b/src/meta-client/src/rpc/store.rs @@ -21,12 +21,14 @@ use api::v1::meta::{ DeleteRangeResponse as PbDeleteRangeResponse, KeyValue as PbKeyValue, MoveValueRequest as PbMoveValueRequest, MoveValueResponse as PbMoveValueResponse, PutRequest as PbPutRequest, PutResponse as PbPutResponse, RangeRequest as PbRangeRequest, - RangeResponse as PbRangeResponse, + RangeResponse as PbRangeResponse, ResponseHeader as PbResponseHeader, }; +use common_meta::util; +use snafu::ResultExt; use crate::error; -use crate::error::Result; -use crate::rpc::{util, KeyValue, ResponseHeader}; +use crate::error::{InvalidResponseHeaderSnafu, Result}; +use crate::rpc::{KeyValue, ResponseHeader}; #[derive(Debug, Clone, Default)] pub struct RangeRequest { @@ -126,7 +128,7 @@ impl TryFrom for RangeResponse { type Error = error::Error; fn try_from(pb: PbRangeResponse) -> Result { - util::check_response_header(pb.header.as_ref())?; + check_response_header(pb.header.as_ref())?; Ok(Self::new(pb)) } @@ -218,7 +220,7 @@ impl TryFrom for PutResponse { type Error = error::Error; fn try_from(pb: PbPutResponse) -> Result { - util::check_response_header(pb.header.as_ref())?; + check_response_header(pb.header.as_ref())?; Ok(Self::new(pb)) } @@ -280,7 +282,7 @@ impl TryFrom for BatchGetResponse { type Error = error::Error; 
fn try_from(pb: PbBatchGetResponse) -> Result { - util::check_response_header(pb.header.as_ref())?; + check_response_header(pb.header.as_ref())?; Ok(Self(pb)) } @@ -355,7 +357,7 @@ impl TryFrom for BatchPutResponse { type Error = error::Error; fn try_from(pb: PbBatchPutResponse) -> Result { - util::check_response_header(pb.header.as_ref())?; + check_response_header(pb.header.as_ref())?; Ok(Self::new(pb)) } @@ -427,7 +429,7 @@ impl TryFrom for BatchDeleteResponse { type Error = error::Error; fn try_from(pb: PbBatchDeleteResponse) -> Result { - util::check_response_header(pb.header.as_ref())?; + check_response_header(pb.header.as_ref())?; Ok(Self::new(pb)) } @@ -511,7 +513,7 @@ impl TryFrom for CompareAndPutResponse { type Error = error::Error; fn try_from(pb: PbCompareAndPutResponse) -> Result { - util::check_response_header(pb.header.as_ref())?; + check_response_header(pb.header.as_ref())?; Ok(Self::new(pb)) } @@ -631,7 +633,7 @@ impl TryFrom for DeleteRangeResponse { type Error = error::Error; fn try_from(pb: PbDeleteRangeResponse) -> Result { - util::check_response_header(pb.header.as_ref())?; + check_response_header(pb.header.as_ref())?; Ok(Self::new(pb)) } @@ -695,7 +697,7 @@ impl TryFrom for MoveValueResponse { type Error = error::Error; fn try_from(pb: PbMoveValueResponse) -> Result { - util::check_response_header(pb.header.as_ref())?; + check_response_header(pb.header.as_ref())?; Ok(Self::new(pb)) } @@ -718,6 +720,10 @@ impl MoveValueResponse { } } +fn check_response_header(header: Option<&PbResponseHeader>) -> Result<()> { + util::check_response_header(header).context(InvalidResponseHeaderSnafu) +} + #[cfg(test)] mod tests { use api::v1::meta::{ diff --git a/src/meta-srv/Cargo.toml b/src/meta-srv/Cargo.toml index 9234367db3..77679b943c 100644 --- a/src/meta-srv/Cargo.toml +++ b/src/meta-srv/Cargo.toml @@ -43,9 +43,13 @@ tokio.workspace = true tokio-stream = { version = "0.1", features = ["net"] } tonic.workspace = true tower = "0.4" +typetag = "0.2" url = "2.3" servers = { path = "../servers" } [dev-dependencies] +chrono.workspace = true +common-procedure-test = { path = "../common/procedure-test" } +datatypes = { path = "../datatypes" } tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["env-filter"] } diff --git a/src/meta-srv/src/bootstrap.rs b/src/meta-srv/src/bootstrap.rs index 0fdf5d54cf..33018f81aa 100644 --- a/src/meta-srv/src/bootstrap.rs +++ b/src/meta-srv/src/bootstrap.rs @@ -181,7 +181,7 @@ pub async fn build_meta_srv(opts: &MetaSrvOptions) -> Result { SelectorType::LeaseBased => Arc::new(LeaseBasedSelector) as SelectorRef, }; - let meta_srv = MetaSrvBuilder::new() + MetaSrvBuilder::new() .options(opts.clone()) .kv_store(kv_store) .in_memory(in_memory) @@ -190,9 +190,7 @@ pub async fn build_meta_srv(opts: &MetaSrvOptions) -> Result { .meta_peer_client(meta_peer_client) .lock(lock) .build() - .await; - - Ok(meta_srv) + .await } pub async fn make_meta_srv(opts: &MetaSrvOptions) -> Result { diff --git a/src/meta-srv/src/error.rs b/src/meta-srv/src/error.rs index 1ea7c584bc..743413041c 100644 --- a/src/meta-srv/src/error.rs +++ b/src/meta-srv/src/error.rs @@ -151,6 +151,13 @@ pub enum Error { #[snafu(display("Table route not found: {}", key))] TableRouteNotFound { key: String, location: Location }, + #[snafu(display("Table route corrupted, key: {}, reason: {}", key, reason))] + CorruptedTableRoute { + key: String, + reason: String, + location: Location, + }, + #[snafu(display("Failed to get sequence: {}", err_msg))] NextSequence { err_msg: String, location: 
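// Editor's note: check_response_header above becomes a thin wrapper that relabels the
// common-meta error as a meta-client error via snafu's ResultExt::context. A minimal
// standalone sketch of that conversion, assuming snafu 0.7's generated *Snafu context
// selectors (the error variants here are simplified stand-ins):
use snafu::{ResultExt, Snafu};

#[derive(Debug, Snafu)]
enum MetaError {
    #[snafu(display("Illegal state from server, code: {}", code))]
    IllegalServerState { code: i32 },
}

#[derive(Debug, Snafu)]
enum ClientError {
    #[snafu(display("Invalid response header, source: {}", source))]
    InvalidResponseHeader { source: MetaError },
}

// The inner check reports failures in common-meta terms...
fn check_inner(code: i32) -> Result<(), MetaError> {
    if code == 0 {
        Ok(())
    } else {
        IllegalServerStateSnafu { code }.fail()
    }
}

// ...and the wrapper converts them, exactly like `.context(InvalidResponseHeaderSnafu)`
// in the hunks above.
fn check_response_header(code: i32) -> Result<(), ClientError> {
    check_inner(code).context(InvalidResponseHeaderSnafu)
}

fn main() {
    assert!(check_response_header(0).is_ok());
    assert!(check_response_header(7).is_err());
}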
Location }, @@ -324,6 +331,43 @@ pub enum Error { #[snafu(display("Missing request header"))] MissingRequestHeader { location: Location }, + + #[snafu(display( + "Failed to register procedure loader, type name: {}, source: {}", + type_name, + source + ))] + RegisterProcedureLoader { + type_name: String, + #[snafu(backtrace)] + source: common_procedure::error::Error, + }, + + #[snafu(display("Failed to find failover candidates for region: {}", failed_region))] + RegionFailoverCandidatesNotFound { + failed_region: String, + location: Location, + }, + + #[snafu(display( + "Received unexpected instruction reply, mailbox message: {}, reason: {}", + mailbox_message, + reason + ))] + UnexpectedInstructionReply { + mailbox_message: String, + reason: String, + location: Location, + }, + + #[snafu(display("Expected to retry later, reason: {}", reason))] + RetryLater { reason: String, location: Location }, + + #[snafu(display("Failed to convert table route, source: {}", source))] + TableRouteConversion { + #[snafu(backtrace)] + source: common_meta::error::Error, + }, } pub type Result = std::result::Result; @@ -370,6 +414,7 @@ impl ErrorExt for Error { | Error::MailboxClosed { .. } | Error::MailboxTimeout { .. } | Error::MailboxReceiver { .. } + | Error::RetryLater { .. } | Error::StartGrpc { .. } => StatusCode::Internal, Error::EmptyKey { .. } | Error::MissingRequiredParameter { .. } @@ -386,12 +431,14 @@ impl ErrorExt for Error { | Error::StatValueFromUtf8 { .. } | Error::UnexceptedSequenceValue { .. } | Error::TableRouteNotFound { .. } + | Error::CorruptedTableRoute { .. } | Error::NextSequence { .. } | Error::SequenceOutOfRange { .. } | Error::MoveValue { .. } | Error::InvalidKvsLength { .. } | Error::InvalidTxnResult { .. } | Error::InvalidUtf8Value { .. } + | Error::UnexpectedInstructionReply { .. } | Error::Unexpected { .. } => StatusCode::Unexpected, Error::TableNotFound { .. } => StatusCode::TableNotFound, Error::InvalidCatalogValue { source, .. } => source.status_code(), @@ -400,6 +447,12 @@ impl ErrorExt for Error { Error::ShutdownServer { source, .. } | Error::StartHttp { source } => { source.status_code() } + + Error::RegionFailoverCandidatesNotFound { .. } => StatusCode::RuntimeResourcesExhausted, + + Error::RegisterProcedureLoader { source, .. 
} => source.status_code(), + + Error::TableRouteConversion { source } => source.status_code(), } } } diff --git a/src/meta-srv/src/handler.rs b/src/meta-srv/src/handler.rs index 534fa408e5..62efd0692f 100644 --- a/src/meta-srv/src/handler.rs +++ b/src/meta-srv/src/handler.rs @@ -16,14 +16,15 @@ use std::collections::BTreeMap; use std::sync::Arc; use std::time::Duration; +use api::v1::meta::mailbox_message::Payload; use api::v1::meta::{ HeartbeatRequest, HeartbeatResponse, MailboxMessage, RequestHeader, ResponseHeader, Role, PROTOCOL_VERSION, }; pub use check_leader_handler::CheckLeaderHandler; pub use collect_stats_handler::CollectStatsHandler; -use common_meta::instruction::Instruction; -use common_telemetry::{info, warn}; +use common_meta::instruction::{Instruction, InstructionReply}; +use common_telemetry::{debug, info, warn}; use dashmap::DashMap; pub use failure_handler::RegionFailureHandler; pub use keep_lease_handler::KeepLeaseHandler; @@ -31,19 +32,19 @@ use metrics::{decrement_gauge, increment_gauge}; pub use on_leader_start::OnLeaderStartHandler; pub use persist_stats_handler::PersistStatsHandler; pub use response_header_handler::ResponseHeaderHandler; -use snafu::OptionExt; +use snafu::{OptionExt, ResultExt}; use tokio::sync::mpsc::Sender; use tokio::sync::{oneshot, Notify, RwLock}; use self::node_stat::Stat; -use crate::error::{self, Result}; +use crate::error::{self, DeserializeFromJsonSnafu, Result, UnexpectedInstructionReplySnafu}; use crate::metasrv::Context; use crate::metrics::METRIC_META_HEARTBEAT_CONNECTION_NUM; use crate::sequence::Sequence; use crate::service::mailbox::{Channel, Mailbox, MailboxReceiver, MailboxRef, MessageId}; mod check_leader_handler; mod collect_stats_handler; -mod failure_handler; +pub(crate) mod failure_handler; mod keep_lease_handler; pub mod mailbox_handler; pub mod node_stat; @@ -112,35 +113,66 @@ impl Pusher { } } +#[derive(Clone, Default)] +pub struct Pushers(Arc>>); + +impl Pushers { + async fn push(&self, pusher_id: &str, mailbox_message: MailboxMessage) -> Result<()> { + let pushers = self.0.read().await; + let pusher = pushers + .get(pusher_id) + .context(error::PusherNotFoundSnafu { pusher_id })?; + pusher + .push(HeartbeatResponse { + header: Some(pusher.header()), + mailbox_message: Some(mailbox_message), + }) + .await + } + + pub(crate) async fn insert(&self, pusher_id: String, pusher: Pusher) -> Option { + self.0.write().await.insert(pusher_id, pusher) + } + + async fn remove(&self, pusher_id: &str) -> Option { + self.0.write().await.remove(pusher_id) + } +} + #[derive(Clone, Default)] pub struct HeartbeatHandlerGroup { handlers: Arc>>>, - pushers: Arc>>, + pushers: Pushers, } impl HeartbeatHandlerGroup { + pub(crate) fn new(pushers: Pushers) -> Self { + Self { + handlers: Arc::new(RwLock::new(vec![])), + pushers, + } + } + pub async fn add_handler(&self, handler: impl HeartbeatHandler + 'static) { let mut handlers = self.handlers.write().await; handlers.push(Box::new(handler)); } pub async fn register(&self, key: impl AsRef, pusher: Pusher) { - let mut pushers = self.pushers.write().await; let key = key.as_ref(); increment_gauge!(METRIC_META_HEARTBEAT_CONNECTION_NUM, 1.0); info!("Pusher register: {}", key); - pushers.insert(key.into(), pusher); + let _ = self.pushers.insert(key.to_string(), pusher).await; } pub async fn unregister(&self, key: impl AsRef) -> Option { - let mut pushers = self.pushers.write().await; let key = key.as_ref(); decrement_gauge!(METRIC_META_HEARTBEAT_CONNECTION_NUM, 1.0); info!("Pusher unregister: {}", 
key); - pushers.remove(key) + self.pushers.remove(key).await } - pub fn pushers(&self) -> Arc>> { + pub fn pushers(&self) -> Pushers { self.pushers.clone() } @@ -178,7 +210,7 @@ impl HeartbeatHandlerGroup { } pub struct HeartbeatMailbox { - pushers: Arc>>, + pushers: Pushers, sequence: Sequence, senders: DashMap>>, timeouts: DashMap, @@ -186,10 +218,18 @@ pub struct HeartbeatMailbox { } impl HeartbeatMailbox { - pub fn create( - pushers: Arc>>, - sequence: Sequence, - ) -> MailboxRef { + pub(crate) fn json_reply(msg: &MailboxMessage) -> Result { + let Payload::Json(payload) = + msg.payload + .as_ref() + .with_context(|| UnexpectedInstructionReplySnafu { + mailbox_message: msg.to_string(), + reason: format!("empty payload, msg: {msg:?}"), + })?; + serde_json::from_str(payload).context(DeserializeFromJsonSnafu { input: payload }) + } + + pub fn create(pushers: Pushers, sequence: Sequence) -> MailboxRef { let mailbox = Arc::new(Self::new(pushers, sequence)); let timeout_checker = mailbox.clone(); @@ -200,7 +240,7 @@ impl HeartbeatMailbox { mailbox } - fn new(pushers: Arc>>, sequence: Sequence) -> Self { + fn new(pushers: Pushers, sequence: Sequence) -> Self { Self { pushers, sequence, @@ -264,15 +304,10 @@ impl Mailbox for HeartbeatMailbox { timeout: Duration, ) -> Result { let message_id = self.next_message_id().await?; + msg.id = message_id; - let pusher_id = match ch { - Channel::Datanode(id) => format!("{}-{}", Role::Datanode as i32, id), - Channel::Frontend(id) => format!("{}-{}", Role::Frontend as i32, id), - }; - let pushers = self.pushers.read().await; - let pusher = pushers - .get(&pusher_id) - .context(error::PusherNotFoundSnafu { pusher_id })?; + let pusher_id = ch.pusher_id(); + debug!("Sending mailbox message {msg:?} to {pusher_id}"); let (tx, rx) = oneshot::channel(); self.senders.insert(message_id, tx); @@ -281,19 +316,14 @@ impl Mailbox for HeartbeatMailbox { self.timeouts.insert(message_id, deadline); self.timeout_notify.notify_one(); - let header = pusher.header(); - msg.id = message_id; - let res = HeartbeatResponse { - header: Some(header), - mailbox_message: Some(msg), - }; - - pusher.push(res).await?; + self.pushers.push(&pusher_id, msg).await?; Ok(MailboxReceiver::new(message_id, rx)) } async fn on_recv(&self, id: MessageId, maybe_msg: Result) -> Result<()> { + debug!("Received mailbox message {maybe_msg:?}"); + self.timeouts.remove(&id); if let Some((_, tx)) = self.senders.remove(&id) { diff --git a/src/meta-srv/src/handler/failure_handler.rs b/src/meta-srv/src/handler/failure_handler.rs index 67b05ad62a..f163714534 100644 --- a/src/meta-srv/src/handler/failure_handler.rs +++ b/src/meta-srv/src/handler/failure_handler.rs @@ -14,28 +14,20 @@ mod runner; +use std::sync::Arc; + use api::v1::meta::{HeartbeatRequest, Role}; use async_trait::async_trait; +use common_catalog::consts::MITO_ENGINE; +use common_meta::RegionIdent; use crate::error::Result; use crate::handler::failure_handler::runner::{FailureDetectControl, FailureDetectRunner}; use crate::handler::{HeartbeatAccumulator, HeartbeatHandler}; use crate::metasrv::{Context, ElectionRef}; +use crate::procedure::region_failover::RegionFailoverManager; -#[derive(Eq, Hash, PartialEq, Clone)] -pub(crate) struct RegionIdent { - catalog: String, - schema: String, - table: String, - region_id: u64, -} - -// TODO(LFC): TBC pub(crate) struct DatanodeHeartbeat { - #[allow(dead_code)] - cluster_id: u64, - #[allow(dead_code)] - node_id: u64, region_idents: Vec, heartbeat_time: i64, } @@ -45,14 +37,18 @@ pub struct 
RegionFailureHandler { } impl RegionFailureHandler { - pub fn new(election: Option) -> Self { - Self { - failure_detect_runner: FailureDetectRunner::new(election), - } - } + pub(crate) async fn try_new( + election: Option, + region_failover_manager: Arc, + ) -> Result { + region_failover_manager.try_start()?; - pub async fn start(&mut self) { - self.failure_detect_runner.start().await; + let mut failure_detect_runner = FailureDetectRunner::new(election, region_failover_manager); + failure_detect_runner.start().await; + + Ok(Self { + failure_detect_runner, + }) } } @@ -76,9 +72,13 @@ impl HeartbeatHandler for RegionFailureHandler { let Some(stat) = acc.stat.as_ref() else { return Ok(()) }; + // TODO(LFC): Filter out the stalled heartbeats: + // After the region failover is done, the distribution of region is changed. + // We can compare the heartbeat info here with the global region placement metadata, + // and remove the incorrect region ident keys in failure detect runner + // (by sending a control message). + let heartbeat = DatanodeHeartbeat { - cluster_id: stat.cluster_id, - node_id: stat.id, region_idents: stat .region_stats .iter() @@ -86,7 +86,13 @@ impl HeartbeatHandler for RegionFailureHandler { catalog: x.catalog.clone(), schema: x.schema.clone(), table: x.table.clone(), - region_id: x.id, + cluster_id: stat.cluster_id, + datanode_id: stat.id, + // TODO(#1566): Use the real table id. + table_id: 0, + // TODO(#1583): Use the actual table engine. + engine: MITO_ENGINE.to_string(), + region_number: x.id as u32, }) .collect(), heartbeat_time: stat.timestamp_millis, @@ -102,16 +108,19 @@ mod tests { use super::*; use crate::handler::node_stat::{RegionStat, Stat}; use crate::metasrv::builder::MetaSrvBuilder; + use crate::test_util::create_region_failover_manager; #[tokio::test(flavor = "multi_thread")] async fn test_handle_heartbeat() { - let mut handler = RegionFailureHandler::new(None); - handler.start().await; + let region_failover_manager = create_region_failover_manager(); + let handler = RegionFailureHandler::try_new(None, region_failover_manager) + .await + .unwrap(); let req = &HeartbeatRequest::default(); let builder = MetaSrvBuilder::new(); - let metasrv = builder.build().await; + let metasrv = builder.build().await.unwrap(); let mut ctx = metasrv.new_ctx(); ctx.is_infancy = false; diff --git a/src/meta-srv/src/handler/failure_handler/runner.rs b/src/meta-srv/src/handler/failure_handler/runner.rs index 74922fa765..34c764465c 100644 --- a/src/meta-srv/src/handler/failure_handler/runner.rs +++ b/src/meta-srv/src/handler/failure_handler/runner.rs @@ -16,7 +16,8 @@ use std::ops::DerefMut; use std::sync::Arc; use std::time::{Duration, Instant}; -use common_telemetry::{error, warn}; +use common_meta::RegionIdent; +use common_telemetry::{error, info, warn}; use common_time::util::current_time_millis; use dashmap::mapref::multiple::RefMulti; use dashmap::DashMap; @@ -25,8 +26,9 @@ use tokio::sync::mpsc::{Receiver, Sender}; use tokio::task::JoinHandle; use crate::failure_detector::PhiAccrualFailureDetector; -use crate::handler::failure_handler::{DatanodeHeartbeat, RegionIdent}; +use crate::handler::failure_handler::DatanodeHeartbeat; use crate::metasrv::ElectionRef; +use crate::procedure::region_failover::RegionFailoverManager; pub(crate) enum FailureDetectControl { Purge, @@ -37,6 +39,7 @@ pub(crate) enum FailureDetectControl { pub(crate) struct FailureDetectRunner { election: Option, + region_failover_manager: Arc, heartbeat_tx: Sender, heartbeat_rx: Option>, @@ -49,11 +52,15 @@ 
pub(crate) struct FailureDetectRunner { } impl FailureDetectRunner { - pub(crate) fn new(election: Option) -> Self { + pub(super) fn new( + election: Option, + region_failover_manager: Arc, + ) -> Self { let (heartbeat_tx, heartbeat_rx) = mpsc::channel::(1024); let (control_tx, control_rx) = mpsc::channel::(1024); Self { election, + region_failover_manager, heartbeat_tx, heartbeat_rx: Some(heartbeat_rx), control_tx, @@ -121,16 +128,38 @@ impl FailureDetectRunner { self.receiver_handle = Some(receiver_handle); let election = self.election.clone(); + let region_failover_manager = self.region_failover_manager.clone(); let runner_handle = common_runtime::spawn_bg(async move { loop { let start = Instant::now(); let is_leader = election.as_ref().map(|x| x.is_leader()).unwrap_or(true); if is_leader { - for e in failure_detectors.iter() { - if e.failure_detector().is_available(current_time_millis()) { - // TODO(LFC): TBC - } + let failed_regions = failure_detectors + .iter() + .filter_map(|e| { + // Intentionally not hoisting `current_time_millis()` out of the iteration: + // the failure detection determination should happen "just in time", + // i.e., whether a region has failed has to be compared with the most recent "now". + // Besides, it might reduce the false positives of failure detection, + // because during the iteration, heartbeats are coming in as usual, + // and the `phi`s are still updating. + if !e.failure_detector().is_available(current_time_millis()) { + Some(e.region_ident().clone()) + } else { + None + } + }) + .collect::>(); + + for r in failed_regions { + // Now that we know the region has failed, remove it from the failure + // detectors, to avoid triggering the failover procedure again. + // If the region comes back alive (the failover procedure runs successfully), + // it will be added back to the failure detectors again.
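The loop above is what actually triggers failovers: each entry's Phi-accrual detector is consulted against the current wall clock, and regions judged unavailable are collected and handed to the failover manager. The detector internals are not part of this diff, so the following is only a minimal, self-contained sketch of the Phi-accrual idea behind the `is_available`/`phi`/`threshold` calls seen here, using the common exponential approximation; the real `PhiAccrualFailureDetector` may estimate the heartbeat-interval distribution differently.

```rust
// Illustrative only: a tiny Phi-accrual style detector (exponential model).
// Names mirror the calls in this diff, not the real struct's fields.
struct PhiAccrualSketch {
    mean_interval_ms: f64, // estimated from recent heartbeat intervals
    last_heartbeat_ms: i64,
    threshold: f64, // higher => fewer false positives, slower detection
}

impl PhiAccrualSketch {
    // phi(now) = -log10(P(a heartbeat still arrives after `elapsed`)),
    // with P approximated as exp(-elapsed / mean_interval).
    fn phi(&self, now_ms: i64) -> f64 {
        let elapsed = (now_ms - self.last_heartbeat_ms).max(0) as f64;
        let p_later = (-elapsed / self.mean_interval_ms.max(1.0)).exp();
        -p_later.max(f64::MIN_POSITIVE).log10()
    }

    // Mirrors `is_available(current_time_millis())` in the runner loop above.
    fn is_available(&self, now_ms: i64) -> bool {
        self.phi(now_ms) < self.threshold
    }
}

fn main() {
    let fd = PhiAccrualSketch {
        mean_interval_ms: 1000.0,
        last_heartbeat_ms: 0,
        threshold: 8.0,
    };
    // Shortly after a heartbeat the node is healthy; ~20 s later it is suspected failed.
    assert!(fd.is_available(1_500));
    assert!(!fd.is_available(20_000));
}
```

Under this exponential model, with a 1-second mean interval and `threshold = 8.0`, `phi` crosses the threshold roughly 8 * ln(10) ≈ 18.4 seconds after the last heartbeat.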
+ failure_detectors.remove(&r); + + region_failover_manager.fire_region_failover(r) } } @@ -143,15 +172,6 @@ impl FailureDetectRunner { self.runner_handle = Some(runner_handle); } - #[cfg(test)] - fn abort(&mut self) { - let Some(handle) = self.receiver_handle.take() else { return }; - handle.abort(); - - let Some(handle) = self.runner_handle.take() else { return }; - handle.abort(); - } - #[cfg(test)] pub(crate) async fn dump(&self) -> FailureDetectorContainer { let (tx, rx) = tokio::sync::oneshot::channel(); @@ -160,11 +180,29 @@ impl FailureDetectRunner { } } +impl Drop for FailureDetectRunner { + fn drop(&mut self) { + if let Some(handle) = self.receiver_handle.take() { + handle.abort(); + info!("Heartbeat receiver in FailureDetectRunner is stopped."); + } + + if let Some(handle) = self.runner_handle.take() { + handle.abort(); + info!("Failure detector in FailureDetectRunner is stopped."); + } + } +} + pub(crate) struct FailureDetectorEntry<'a> { e: RefMulti<'a, RegionIdent, PhiAccrualFailureDetector>, } impl FailureDetectorEntry<'_> { + fn region_ident(&self) -> &RegionIdent { + self.e.key() + } + fn failure_detector(&self) -> &PhiAccrualFailureDetector { self.e.value() } @@ -186,6 +224,10 @@ impl FailureDetectorContainer { Box::new(self.0.iter().map(move |e| FailureDetectorEntry { e })) as _ } + fn remove(&self, ident: &RegionIdent) { + let _ = self.0.remove(ident); + } + fn clear(&self) { self.0.clear() } @@ -200,9 +242,11 @@ impl FailureDetectorContainer { #[cfg(test)] mod tests { + use common_catalog::consts::MITO_ENGINE; use rand::Rng; use super::*; + use crate::test_util::create_region_failover_manager; #[test] fn test_default_failure_detector_container() { @@ -211,7 +255,11 @@ mod tests { catalog: "a".to_string(), schema: "b".to_string(), table: "c".to_string(), - region_id: 1, + cluster_id: 3, + datanode_id: 2, + table_id: 1, + engine: MITO_ENGINE.to_string(), + region_number: 1, }; let _ = container.get_failure_detector(ident.clone()); assert!(container.0.contains_key(&ident)); @@ -234,11 +282,16 @@ mod tests { catalog: "a".to_string(), schema: "b".to_string(), table: "c".to_string(), - region_id: 1, + cluster_id: 3, + datanode_id: 2, + table_id: 1, + engine: MITO_ENGINE.to_string(), + region_number: 1, }; container.get_failure_detector(ident.clone()); - let mut runner = FailureDetectRunner::new(None); + let region_failover_manager = create_region_failover_manager(); + let mut runner = FailureDetectRunner::new(None, region_failover_manager); runner.start_with(Arc::new(container)).await; let dump = runner.dump().await; @@ -248,30 +301,31 @@ mod tests { let dump = runner.dump().await; assert_eq!(dump.iter().collect::>().len(), 0); - - runner.abort(); } #[tokio::test(flavor = "multi_thread")] async fn test_heartbeat() { - let mut runner = FailureDetectRunner::new(None); + let region_failover_manager = create_region_failover_manager(); + let mut runner = FailureDetectRunner::new(None, region_failover_manager); runner.start().await; // Generate 2000 heartbeats start from now. Heartbeat interval is one second, plus some random millis. 
- fn generate_heartbeats(node_id: u64, region_ids: Vec) -> Vec { + fn generate_heartbeats(datanode_id: u64, region_ids: Vec) -> Vec { let mut rng = rand::thread_rng(); let start = current_time_millis(); (0..2000) .map(|i| DatanodeHeartbeat { - cluster_id: 1, - node_id, region_idents: region_ids .iter() - .map(|®ion_id| RegionIdent { + .map(|®ion_number| RegionIdent { catalog: "a".to_string(), schema: "b".to_string(), table: "c".to_string(), - region_id, + cluster_id: 1, + datanode_id, + table_id: 0, + engine: MITO_ENGINE.to_string(), + region_number, }) .collect(), heartbeat_time: start + i * 1000 + rng.gen_range(0..100), @@ -307,7 +361,5 @@ mod tests { let now = start + acceptable_heartbeat_pause_millis + 2000; assert!(fd.phi(now) > fd.threshold() as _); }); - - runner.abort(); } } diff --git a/src/meta-srv/src/handler/persist_stats_handler.rs b/src/meta-srv/src/handler/persist_stats_handler.rs index 7cc30363d0..09751c32ee 100644 --- a/src/meta-srv/src/handler/persist_stats_handler.rs +++ b/src/meta-srv/src/handler/persist_stats_handler.rs @@ -77,7 +77,7 @@ mod tests { use api::v1::meta::RangeRequest; use super::*; - use crate::handler::HeartbeatMailbox; + use crate::handler::{HeartbeatMailbox, Pushers}; use crate::keys::StatKey; use crate::sequence::Sequence; use crate::service::store::memory::MemStore; @@ -87,18 +87,14 @@ mod tests { let in_memory = Arc::new(MemStore::new()); let kv_store = Arc::new(MemStore::new()); let seq = Sequence::new("test_seq", 0, 10, kv_store.clone()); - let mailbox = HeartbeatMailbox::create(Arc::new(Default::default()), seq); + let mailbox = HeartbeatMailbox::create(Pushers::default(), seq); let mut ctx = Context { - datanode_lease_secs: 30, server_addr: "127.0.0.1:0000".to_string(), in_memory, kv_store, mailbox, election: None, skip_all: Arc::new(AtomicBool::new(false)), - catalog: None, - schema: None, - table: None, is_infancy: false, }; diff --git a/src/meta-srv/src/handler/response_header_handler.rs b/src/meta-srv/src/handler/response_header_handler.rs index 1325cba36b..305e40a5e2 100644 --- a/src/meta-srv/src/handler/response_header_handler.rs +++ b/src/meta-srv/src/handler/response_header_handler.rs @@ -53,7 +53,7 @@ mod tests { use api::v1::meta::{HeartbeatResponse, RequestHeader}; use super::*; - use crate::handler::{Context, HeartbeatMailbox}; + use crate::handler::{Context, HeartbeatMailbox, Pushers}; use crate::sequence::Sequence; use crate::service::store::memory::MemStore; @@ -62,18 +62,14 @@ mod tests { let in_memory = Arc::new(MemStore::new()); let kv_store = Arc::new(MemStore::new()); let seq = Sequence::new("test_seq", 0, 10, kv_store.clone()); - let mailbox = HeartbeatMailbox::create(Arc::new(Default::default()), seq); + let mailbox = HeartbeatMailbox::create(Pushers::default(), seq); let mut ctx = Context { - datanode_lease_secs: 30, server_addr: "127.0.0.1:0000".to_string(), in_memory, kv_store, mailbox, election: None, skip_all: Arc::new(AtomicBool::new(false)), - catalog: None, - schema: None, - table: None, is_infancy: false, }; diff --git a/src/meta-srv/src/lib.rs b/src/meta-srv/src/lib.rs index 50d8a3bd14..01648f3429 100644 --- a/src/meta-srv/src/lib.rs +++ b/src/meta-srv/src/lib.rs @@ -29,10 +29,15 @@ pub mod metasrv; mod metrics; #[cfg(feature = "mock")] pub mod mocks; -mod procedure; +pub mod procedure; pub mod selector; mod sequence; pub mod service; +pub mod table_routes; + +#[cfg(test)] +mod test_util; + pub mod util; pub use crate::error::Result; diff --git a/src/meta-srv/src/metasrv.rs b/src/meta-srv/src/metasrv.rs index 
4c9fe18d3f..66635b27f8 100644 --- a/src/meta-srv/src/metasrv.rs +++ b/src/meta-srv/src/metasrv.rs @@ -70,16 +70,12 @@ impl Default for MetaSrvOptions { #[derive(Clone)] pub struct Context { - pub datanode_lease_secs: i64, pub server_addr: String, pub in_memory: ResettableKvStoreRef, pub kv_store: KvStoreRef, pub mailbox: MailboxRef, pub election: Option, pub skip_all: Arc, - pub catalog: Option, - pub schema: Option, - pub table: Option, pub is_infancy: bool, } @@ -99,7 +95,16 @@ impl Context { pub struct LeaderValue(pub String); -pub type SelectorRef = Arc>>; +#[derive(Clone)] +pub struct SelectorContext { + pub datanode_lease_secs: i64, + pub server_addr: String, + pub kv_store: KvStoreRef, + pub catalog: Option, + pub schema: Option, +} + +pub type SelectorRef = Arc>>; pub type ElectionRef = Arc>; #[derive(Clone)] @@ -248,9 +253,12 @@ impl MetaSrv { self.mailbox.clone() } + pub fn procedure_manager(&self) -> &ProcedureManagerRef { + &self.procedure_manager + } + #[inline] pub fn new_ctx(&self) -> Context { - let datanode_lease_secs = self.options().datanode_lease_secs; let server_addr = self.options().server_addr.clone(); let in_memory = self.in_memory(); let kv_store = self.kv_store(); @@ -258,16 +266,12 @@ impl MetaSrv { let election = self.election(); let skip_all = Arc::new(AtomicBool::new(false)); Context { - datanode_lease_secs, server_addr, in_memory, kv_store, mailbox, election, skip_all, - catalog: None, - schema: None, - table: None, is_infancy: false, } } diff --git a/src/meta-srv/src/metasrv/builder.rs b/src/meta-srv/src/metasrv/builder.rs index 8e84b3d023..ac41100545 100644 --- a/src/meta-srv/src/metasrv/builder.rs +++ b/src/meta-srv/src/metasrv/builder.rs @@ -18,15 +18,19 @@ use std::sync::Arc; use common_procedure::local::{LocalManager, ManagerConfig}; use crate::cluster::MetaPeerClient; +use crate::error::Result; use crate::handler::mailbox_handler::MailboxHandler; use crate::handler::{ CheckLeaderHandler, CollectStatsHandler, HeartbeatHandlerGroup, HeartbeatMailbox, - KeepLeaseHandler, OnLeaderStartHandler, PersistStatsHandler, RegionFailureHandler, + KeepLeaseHandler, OnLeaderStartHandler, PersistStatsHandler, Pushers, RegionFailureHandler, ResponseHeaderHandler, }; use crate::lock::DistLockRef; use crate::metadata_service::{DefaultMetadataService, MetadataServiceRef}; -use crate::metasrv::{ElectionRef, MetaSrv, MetaSrvOptions, SelectorRef, TABLE_ID_SEQ}; +use crate::metasrv::{ + ElectionRef, MetaSrv, MetaSrvOptions, SelectorContext, SelectorRef, TABLE_ID_SEQ, +}; +use crate::procedure::region_failover::RegionFailoverManager; use crate::procedure::state_store::MetaStateStore; use crate::selector::lease_based::LeaseBasedSelector; use crate::sequence::Sequence; @@ -106,7 +110,7 @@ impl MetaSrvBuilder { self } - pub async fn build(self) -> MetaSrv { + pub async fn build(self) -> Result { let started = Arc::new(AtomicBool::new(false)); let MetaSrvBuilder { @@ -129,13 +133,34 @@ impl MetaSrvBuilder { let selector = selector.unwrap_or_else(|| Arc::new(LeaseBasedSelector)); + let pushers = Pushers::default(); + let mailbox_sequence = Sequence::new("heartbeat_mailbox", 1, 100, kv_store.clone()); + let mailbox = HeartbeatMailbox::create(pushers.clone(), mailbox_sequence); + + let state_store = Arc::new(MetaStateStore::new(kv_store.clone())); + let procedure_manager = Arc::new(LocalManager::new(ManagerConfig::default(), state_store)); + let handler_group = match handler_group { Some(handler_group) => handler_group, None => { - let mut region_failure_handler = 
RegionFailureHandler::new(election.clone()); - region_failure_handler.start().await; + let region_failover_manager = Arc::new(RegionFailoverManager::new( + mailbox.clone(), + procedure_manager.clone(), + selector.clone(), + SelectorContext { + server_addr: options.server_addr.clone(), + datanode_lease_secs: options.datanode_lease_secs, + kv_store: kv_store.clone(), + catalog: None, + schema: None, + }, + )); - let group = HeartbeatHandlerGroup::default(); + let region_failure_handler = + RegionFailureHandler::try_new(election.clone(), region_failover_manager) + .await?; + + let group = HeartbeatHandlerGroup::new(pushers); let keep_lease_handler = KeepLeaseHandler::new(kv_store.clone()); group.add_handler(ResponseHeaderHandler::default()).await; // `KeepLeaseHandler` should preferably be in front of `CheckLeaderHandler`, @@ -154,17 +179,10 @@ impl MetaSrvBuilder { let table_id_sequence = Arc::new(Sequence::new(TABLE_ID_SEQ, 1024, 10, kv_store.clone())); - let config = ManagerConfig::default(); - let state_store = Arc::new(MetaStateStore::new(kv_store.clone())); - let procedure_manager = Arc::new(LocalManager::new(config, state_store)); - let metadata_service = metadata_service .unwrap_or_else(|| Arc::new(DefaultMetadataService::new(kv_store.clone()))); - let mailbox_sequence = Sequence::new("heartbeat_mailbox", 1, 100, kv_store.clone()); - let mailbox = HeartbeatMailbox::create(handler_group.pushers(), mailbox_sequence); - - MetaSrv { + Ok(MetaSrv { started, options, in_memory, @@ -178,7 +196,7 @@ impl MetaSrvBuilder { procedure_manager, metadata_service, mailbox, - } + }) } } diff --git a/src/meta-srv/src/mocks.rs b/src/meta-srv/src/mocks.rs index 0381d29032..2b3ee8012e 100644 --- a/src/meta-srv/src/mocks.rs +++ b/src/meta-srv/src/mocks.rs @@ -24,7 +24,7 @@ use tower::service_fn; use crate::metadata_service::{DefaultMetadataService, MetadataService}; use crate::metasrv::builder::MetaSrvBuilder; -use crate::metasrv::{MetaSrvOptions, SelectorRef}; +use crate::metasrv::{MetaSrv, MetaSrvOptions, SelectorRef}; use crate::service::store::etcd::EtcdStore; use crate::service::store::kv::KvStoreRef; use crate::service::store::memory::MemStore; @@ -33,6 +33,7 @@ use crate::service::store::memory::MemStore; pub struct MockInfo { pub server_addr: String, pub channel_manager: ChannelManager, + pub meta_srv: MetaSrv, } pub async fn mock_with_memstore() -> MockInfo { @@ -71,14 +72,16 @@ pub async fn mock( None => builder, }; - let meta_srv = builder.build().await; + let meta_srv = builder.build().await.unwrap(); + meta_srv.try_start().await.unwrap(); let (client, server) = tokio::io::duplex(1024); + let service = meta_srv.clone(); tokio::spawn(async move { tonic::transport::Server::builder() - .add_service(HeartbeatServer::new(meta_srv.clone())) - .add_service(RouterServer::new(meta_srv.clone())) - .add_service(StoreServer::new(meta_srv.clone())) + .add_service(HeartbeatServer::new(service.clone())) + .add_service(RouterServer::new(service.clone())) + .add_service(StoreServer::new(service.clone())) .serve_with_incoming(futures::stream::iter(vec![Ok::<_, std::io::Error>(server)])) .await }); @@ -114,5 +117,6 @@ pub async fn mock( MockInfo { server_addr, channel_manager, + meta_srv, } } diff --git a/src/meta-srv/src/procedure.rs b/src/meta-srv/src/procedure.rs index 0439dacd75..a7c4bc73b2 100644 --- a/src/meta-srv/src/procedure.rs +++ b/src/meta-srv/src/procedure.rs @@ -12,4 +12,5 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+pub mod region_failover; pub(crate) mod state_store; diff --git a/src/meta-srv/src/procedure/region_failover.rs b/src/meta-srv/src/procedure/region_failover.rs new file mode 100644 index 0000000000..49fb17c9ec --- /dev/null +++ b/src/meta-srv/src/procedure/region_failover.rs @@ -0,0 +1,622 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +mod activate_region; +mod deactivate_region; +mod failover_end; +mod failover_start; +mod update_metadata; + +use std::collections::HashSet; +use std::fmt::Debug; +use std::sync::{Arc, Mutex}; +use std::time::Duration; + +use async_trait::async_trait; +use common_meta::RegionIdent; +use common_procedure::error::{ + Error as ProcedureError, FromJsonSnafu, Result as ProcedureResult, ToJsonSnafu, +}; +use common_procedure::{ + watcher, Context as ProcedureContext, LockKey, Procedure, ProcedureManagerRef, ProcedureWithId, + Status, +}; +use common_telemetry::{error, info, warn}; +use failover_start::RegionFailoverStart; +use serde::{Deserialize, Serialize}; +use snafu::ResultExt; + +use crate::error::{Error, RegisterProcedureLoaderSnafu, Result}; +use crate::metasrv::{SelectorContext, SelectorRef}; +use crate::service::mailbox::MailboxRef; + +const OPEN_REGION_MESSAGE_TIMEOUT: Duration = Duration::from_secs(30); +const CLOSE_REGION_MESSAGE_TIMEOUT: Duration = Duration::from_secs(2); + +pub(crate) struct RegionFailoverManager { + mailbox: MailboxRef, + procedure_manager: ProcedureManagerRef, + selector: SelectorRef, + selector_ctx: SelectorContext, + running_procedures: Arc>>, +} + +struct FailoverProcedureGuard<'a> { + running_procedures: Arc>>, + failed_region: &'a RegionIdent, +} + +impl Drop for FailoverProcedureGuard<'_> { + fn drop(&mut self) { + self.running_procedures + .lock() + .unwrap() + .remove(self.failed_region); + } +} + +impl RegionFailoverManager { + pub(crate) fn new( + mailbox: MailboxRef, + procedure_manager: ProcedureManagerRef, + selector: SelectorRef, + selector_ctx: SelectorContext, + ) -> Self { + Self { + mailbox, + procedure_manager, + selector, + selector_ctx, + running_procedures: Arc::new(Mutex::new(HashSet::new())), + } + } + + pub(crate) fn try_start(&self) -> Result<()> { + let mailbox = self.mailbox.clone(); + let selector = self.selector.clone(); + let selector_ctx = self.selector_ctx.clone(); + self.procedure_manager + .register_loader( + RegionFailoverProcedure::TYPE_NAME, + Box::new(move |json| { + RegionFailoverProcedure::from_json( + json, + RegionFailoverContext { + mailbox: mailbox.clone(), + selector: selector.clone(), + selector_ctx: selector_ctx.clone(), + }, + ) + .map(|p| Box::new(p) as _) + }), + ) + .context(RegisterProcedureLoaderSnafu { + type_name: RegionFailoverProcedure::TYPE_NAME, + }) + } + + fn insert_running_procedures(&self, failed_region: &RegionIdent) -> bool { + let mut procedures = self.running_procedures.lock().unwrap(); + if procedures.contains(failed_region) { + return false; + } + procedures.insert(failed_region.clone()) + } + + pub(crate) fn 
fire_region_failover(&self, failed_region: RegionIdent) { + if !self.insert_running_procedures(&failed_region) { + warn!("Region failover procedure for region {failed_region} is already running!"); + return; + } + + let procedure = RegionFailoverProcedure::new( + failed_region.clone(), + RegionFailoverContext { + mailbox: self.mailbox.clone(), + selector: self.selector.clone(), + selector_ctx: self.selector_ctx.clone(), + }, + ); + let procedure_with_id = ProcedureWithId::with_random_id(Box::new(procedure)); + let procedure_id = procedure_with_id.id; + info!("Starting region failover procedure {procedure_id} for region {failed_region:?}"); + + let procedure_manager = self.procedure_manager.clone(); + let running_procedures = self.running_procedures.clone(); + common_runtime::spawn_bg(async move { + let _guard = FailoverProcedureGuard { + running_procedures, + failed_region: &failed_region, + }; + + let watcher = &mut match procedure_manager.submit(procedure_with_id).await { + Ok(watcher) => watcher, + Err(e) => { + error!(e; "Failed to submit region failover procedure {procedure_id} for region {failed_region}"); + return; + } + }; + + if let Err(e) = watcher::wait(watcher).await { + error!(e; "Failed to wait region failover procedure {procedure_id} for region {failed_region}"); + return; + } + + info!("Region failover procedure {procedure_id} for region {failed_region} is finished successfully!"); + }); + } +} + +/// A "Node" in the state machine of region failover procedure. +/// Contains the current state and the data. +#[derive(Serialize, Deserialize, Debug)] +struct Node { + failed_region: RegionIdent, + state: Option>, +} + +/// The "Context" of region failover procedure state machine. +#[derive(Clone)] +pub struct RegionFailoverContext { + pub mailbox: MailboxRef, + pub selector: SelectorRef, + pub selector_ctx: SelectorContext, +} + +/// The state machine of region failover procedure. Driven by the call to `next`. 
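The `State` trait defined just below is serialized with typetag, which is how a `Box<dyn State>` inside `Node` survives a Metasrv restart: each concrete state registers itself under its type name, and the JSON carries that name in the `region_failover_state` tag (the shape asserted later in `test_state_serde`). A minimal sketch of the same mechanism, with hypothetical `DemoState`/`DemoEnd` types that are not part of this patch:

```rust
use serde::{Deserialize, Serialize};

// The tag field name matches the one used by the State trait below.
#[typetag::serde(tag = "region_failover_state")]
trait DemoState: std::fmt::Debug {}

#[derive(Serialize, Deserialize, Debug)]
struct DemoEnd;

#[typetag::serde]
impl DemoState for DemoEnd {}

fn main() -> Result<(), serde_json::Error> {
    // Serializing the trait object records the concrete type under the tag...
    let state: Box<dyn DemoState> = Box::new(DemoEnd);
    let json = serde_json::to_string(&state)?;
    assert_eq!(json, r#"{"region_failover_state":"DemoEnd"}"#);

    // ...and deserializing dispatches back to the registered impl.
    let restored: Box<dyn DemoState> = serde_json::from_str(&json)?;
    println!("{restored:?}");
    Ok(())
}
```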
+#[async_trait] +#[typetag::serde(tag = "region_failover_state")] +trait State: Sync + Send + Debug { + async fn next( + mut self: Box, + ctx: &RegionFailoverContext, + failed_region: &RegionIdent, + ) -> Result>; + + fn status(&self) -> Status { + Status::executing(true) + } +} + +/// The states transition of region failover procedure: +/// +/// ```text +/// ┌───────────────────┐ +/// │RegionFailoverStart│ +/// └─────────┬─────────┘ +/// │ +/// │ Selects a candidate(Datanode) +/// ┌─────────┐ │ to place the failed region +/// │ │ │ +/// If replied with │ ┌───▼────▼───────┐ +/// "Close region │ │DeactivateRegion│ +/// failed" │ └───┬────┬───────┘ +/// │ │ │ +/// └─────────┘ │ Sends "Close Region" request +/// │ to the failed Datanode, and +/// ┌─────────┐ │ wait for 2 seconds +/// │ │ │ +/// │ ┌──▼────▼──────┐ +/// Wait candidate │ │ActivateRegion◄───────────────────────┐ +/// response timeout │ └──┬────┬──────┘ │ +/// │ │ │ │ +/// └─────────┘ │ Sends "Open Region" request │ +/// │ to the candidate Datanode, │ +/// │ and wait for 30 seconds │ +/// │ │ +/// │ Check Datanode returns │ +/// │ │ +/// success ├──────────────────────────────┘ +/// │ failed +/// ┌─────────▼──────────┐ +/// │UpdateRegionMetadata│ +/// └─────────┬──────────┘ +/// │ +/// │ Updates the Region +/// │ placement metadata +/// │ +/// ┌────────▼────────┐ +/// │RegionFailoverEnd│ +/// └─────────────────┘ +/// ``` +pub struct RegionFailoverProcedure { + node: Node, + context: RegionFailoverContext, +} + +impl RegionFailoverProcedure { + const TYPE_NAME: &'static str = "metasrv-procedure::RegionFailover"; + + pub fn new(failed_region: RegionIdent, context: RegionFailoverContext) -> Self { + let state = RegionFailoverStart::new(); + let node = Node { + failed_region, + state: Some(Box::new(state)), + }; + Self { node, context } + } + + fn from_json(json: &str, context: RegionFailoverContext) -> ProcedureResult { + let node: Node = serde_json::from_str(json).context(FromJsonSnafu)?; + Ok(Self { node, context }) + } +} + +#[async_trait] +impl Procedure for RegionFailoverProcedure { + fn type_name(&self) -> &str { + Self::TYPE_NAME + } + + async fn execute(&mut self, _ctx: &ProcedureContext) -> ProcedureResult { + if let Some(state) = self.node.state.take() { + let next_state = state + .next(&self.context, &self.node.failed_region) + .await + .map_err(|e| { + if matches!(e, Error::RetryLater { .. 
}) { + ProcedureError::retry_later(e) + } else { + ProcedureError::external(e) + } + })?; + self.node.state = Some(next_state); + } + Ok(self + .node + .state + .as_ref() + .map(|s| s.status()) + .unwrap_or(Status::Done)) + } + + fn dump(&self) -> ProcedureResult { + serde_json::to_string(&self.node).context(ToJsonSnafu) + } + + fn lock_key(&self) -> LockKey { + let region_ident = &self.node.failed_region; + let key = format!( + "{}/region-{}", + common_catalog::format_full_table_name( + ®ion_ident.catalog, + ®ion_ident.schema, + ®ion_ident.table + ), + region_ident.region_number + ); + LockKey::single(key) + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use api::v1::meta::mailbox_message::Payload; + use api::v1::meta::{HeartbeatResponse, MailboxMessage, Peer, RequestHeader}; + use catalog::helper::TableGlobalKey; + use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, MITO_ENGINE}; + use common_meta::instruction::{Instruction, InstructionReply, SimpleReply}; + use common_meta::DatanodeId; + use common_procedure::BoxedProcedure; + use rand::prelude::SliceRandom; + use tokio::sync::mpsc::Receiver; + + use super::*; + use crate::handler::{HeartbeatMailbox, Pusher, Pushers}; + use crate::selector::{Namespace, Selector}; + use crate::sequence::Sequence; + use crate::service::mailbox::Channel; + use crate::service::store::memory::MemStore; + use crate::table_routes; + + struct RandomNodeSelector { + nodes: Vec, + } + + #[async_trait] + impl Selector for RandomNodeSelector { + type Context = SelectorContext; + type Output = Vec; + + async fn select(&self, _ns: Namespace, _ctx: &Self::Context) -> Result { + let mut rng = rand::thread_rng(); + let mut nodes = self.nodes.clone(); + nodes.shuffle(&mut rng); + Ok(nodes) + } + } + + // The "foreign" means the Datanode is not containing any regions to the table before. 
+ pub struct ForeignNodeSelector { + pub foreign: Peer, + } + + #[async_trait] + impl Selector for ForeignNodeSelector { + type Context = SelectorContext; + type Output = Vec; + + async fn select(&self, _ns: Namespace, _ctx: &Self::Context) -> Result { + Ok(vec![self.foreign.clone()]) + } + } + + pub struct TestingEnv { + pub context: RegionFailoverContext, + pub failed_region: RegionIdent, + pub heartbeat_receivers: HashMap>>, + } + + pub struct TestingEnvBuilder { + selector: Option, + failed_region: Option, + } + + impl TestingEnvBuilder { + pub fn new() -> Self { + Self { + selector: None, + failed_region: None, + } + } + + #[allow(unused)] + pub fn with_selector(mut self, selector: SelectorRef) -> Self { + self.selector = Some(selector); + self + } + + pub fn with_failed_region(mut self, failed_region: u32) -> Self { + self.failed_region = Some(failed_region); + self + } + + pub async fn build(self) -> TestingEnv { + let kv_store = Arc::new(MemStore::new()) as _; + + let table = "my_table"; + let (_, table_global_value) = + table_routes::tests::prepare_table_global_value(&kv_store, table).await; + + table_routes::tests::prepare_table_route_value(&kv_store, table).await; + + let pushers = Pushers::default(); + let mut heartbeat_receivers = HashMap::with_capacity(3); + for datanode_id in 1..=3 { + let (tx, rx) = tokio::sync::mpsc::channel(1); + + let pusher_id = Channel::Datanode(datanode_id).pusher_id(); + let pusher = Pusher::new(tx, &RequestHeader::default()); + let _ = pushers.insert(pusher_id, pusher).await; + + heartbeat_receivers.insert(datanode_id, rx); + } + + let mailbox_sequence = + Sequence::new("test_heartbeat_mailbox", 0, 100, kv_store.clone()); + let mailbox = HeartbeatMailbox::create(pushers, mailbox_sequence); + + let failed_region = self.failed_region.unwrap_or(1); + let failed_datanode = table_global_value + .regions_id_map + .iter() + .find_map(|(datanode_id, regions)| { + if regions.contains(&failed_region) { + Some(*datanode_id) + } else { + None + } + }) + .unwrap(); + let failed_region = RegionIdent { + cluster_id: 0, + datanode_id: failed_datanode, + table_id: 1, + engine: MITO_ENGINE.to_string(), + region_number: failed_region, + catalog: DEFAULT_CATALOG_NAME.to_string(), + schema: DEFAULT_SCHEMA_NAME.to_string(), + table: table.to_string(), + }; + + let selector = self.selector.unwrap_or_else(|| { + let nodes = (1..=table_global_value.regions_id_map.len()) + .map(|id| Peer { + id: id as u64, + addr: "".to_string(), + }) + .collect(); + Arc::new(RandomNodeSelector { nodes }) + }); + let selector_ctx = SelectorContext { + datanode_lease_secs: 10, + server_addr: "127.0.0.1:3002".to_string(), + kv_store, + catalog: None, + schema: None, + }; + + TestingEnv { + context: RegionFailoverContext { + mailbox, + selector, + selector_ctx, + }, + failed_region, + heartbeat_receivers, + } + } + } + + #[tokio::test] + async fn test_region_failover_procedure() { + common_telemetry::init_default_ut_logging(); + + let TestingEnv { + context, + failed_region, + mut heartbeat_receivers, + } = TestingEnvBuilder::new().build().await; + + let mut procedure = Box::new(RegionFailoverProcedure::new( + failed_region.clone(), + context.clone(), + )) as BoxedProcedure; + + let mut failed_datanode = heartbeat_receivers + .remove(&failed_region.datanode_id) + .unwrap(); + let mailbox_clone = context.mailbox.clone(); + let failed_region_clone = failed_region.clone(); + common_runtime::spawn_bg(async move { + let resp = failed_datanode.recv().await.unwrap().unwrap(); + let received = 
&resp.mailbox_message.unwrap(); + assert_eq!( + received.payload, + Some(Payload::Json( + serde_json::to_string(&Instruction::CloseRegion(failed_region_clone.clone())) + .unwrap(), + )) + ); + + // simulating response from Datanode + mailbox_clone + .on_recv( + 1, + Ok(MailboxMessage { + id: 1, + subject: "Deactivate Region".to_string(), + from: format!("Datanode-{}", failed_region.datanode_id), + to: "Metasrv".to_string(), + timestamp_millis: common_time::util::current_time_millis(), + payload: Some(Payload::Json( + serde_json::to_string(&InstructionReply::CloseRegion(SimpleReply { + result: true, + error: None, + })) + .unwrap(), + )), + }), + ) + .await + .unwrap(); + }); + + let (candidate_tx, mut candidate_rx) = tokio::sync::mpsc::channel(1); + for (datanode_id, mut recv) in heartbeat_receivers.into_iter() { + let mailbox_clone = context.mailbox.clone(); + let failed_region_clone = failed_region.clone(); + let candidate_tx = candidate_tx.clone(); + common_runtime::spawn_bg(async move { + let resp = recv.recv().await.unwrap().unwrap(); + let received = &resp.mailbox_message.unwrap(); + assert_eq!( + received.payload, + Some(Payload::Json( + serde_json::to_string(&Instruction::OpenRegion( + failed_region_clone.clone() + )) + .unwrap(), + )) + ); + + candidate_tx.send(datanode_id).await.unwrap(); + + // simulating response from Datanode + mailbox_clone + .on_recv( + // Very tricky here: + // the procedure only sends two messages in sequence, the second one is + // "Activate Region", and its message id is 2. + 2, + Ok(MailboxMessage { + id: 2, + subject: "Activate Region".to_string(), + from: format!("Datanode-{datanode_id}"), + to: "Metasrv".to_string(), + timestamp_millis: common_time::util::current_time_millis(), + payload: Some(Payload::Json( + serde_json::to_string(&InstructionReply::OpenRegion(SimpleReply { + result: true, + error: None, + })) + .unwrap(), + )), + }), + ) + .await + .unwrap(); + }); + } + + common_procedure_test::execute_procedure_until_done(&mut procedure).await; + + assert_eq!( + procedure.dump().unwrap(), + r#"{"failed_region":{"cluster_id":0,"datanode_id":1,"catalog":"greptime","schema":"public","table":"my_table","table_id":1,"engine":"mito","region_number":1},"state":{"region_failover_state":"RegionFailoverEnd"}}"# + ); + + // Verifies that the failed region (region 1) is moved from failed datanode (datanode 1) to the candidate datanode. 
+ let key = TableGlobalKey { + catalog_name: failed_region.catalog.clone(), + schema_name: failed_region.schema.clone(), + table_name: failed_region.table.clone(), + }; + let value = table_routes::get_table_global_value(&context.selector_ctx.kv_store, &key) + .await + .unwrap() + .unwrap(); + assert_eq!( + value + .regions_id_map + .get(&failed_region.datanode_id) + .unwrap(), + &vec![2] + ); + assert!(value + .regions_id_map + .get(&candidate_rx.recv().await.unwrap()) + .unwrap() + .contains(&1)); + } + + #[tokio::test] + async fn test_state_serde() { + let TestingEnv { + context, + failed_region, + heartbeat_receivers: _, + } = TestingEnvBuilder::new().build().await; + + let state = RegionFailoverStart::new(); + let node = Node { + failed_region, + state: Some(Box::new(state)), + }; + let procedure = RegionFailoverProcedure { node, context }; + + let s = procedure.dump().unwrap(); + assert_eq!( + s, + r#"{"failed_region":{"cluster_id":0,"datanode_id":1,"catalog":"greptime","schema":"public","table":"my_table","table_id":1,"engine":"mito","region_number":1},"state":{"region_failover_state":"RegionFailoverStart","failover_candidate":null}}"# + ); + let n: Node = serde_json::from_str(&s).unwrap(); + assert_eq!( + format!("{n:?}"), + r#"Node { failed_region: RegionIdent { cluster_id: 0, datanode_id: 1, catalog: "greptime", schema: "public", table: "my_table", table_id: 1, engine: "mito", region_number: 1 }, state: Some(RegionFailoverStart { failover_candidate: None }) }"# + ); + } +} diff --git a/src/meta-srv/src/procedure/region_failover/activate_region.rs b/src/meta-srv/src/procedure/region_failover/activate_region.rs new file mode 100644 index 0000000000..761a174918 --- /dev/null +++ b/src/meta-srv/src/procedure/region_failover/activate_region.rs @@ -0,0 +1,241 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
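The `ActivateRegion` state whose diff follows drives the mailbox round trip: it wraps an `Instruction::OpenRegion` into a JSON mailbox payload, and later decodes the Datanode's `InstructionReply` via `HeartbeatMailbox::json_reply`. The real `Instruction`/`InstructionReply` enums live in `common_meta` and are not shown in this patch, so the sketch below uses hypothetical stand-in types purely to illustrate the serialize/act/reply/deserialize cycle:

```rust
use serde::{Deserialize, Serialize};

// Hypothetical stand-ins; the real enums in common_meta may have a different
// serde layout and carry a full RegionIdent rather than a bare region number.
#[derive(Serialize, Deserialize, Debug, PartialEq)]
enum DemoInstruction {
    OpenRegion { region_number: u32 },
    CloseRegion { region_number: u32 },
}

#[derive(Serialize, Deserialize, Debug, PartialEq)]
struct DemoReply {
    result: bool,
    error: Option<String>,
}

fn main() -> Result<(), serde_json::Error> {
    // Metasrv side: the instruction becomes the Payload::Json of a MailboxMessage.
    let payload = serde_json::to_string(&DemoInstruction::OpenRegion { region_number: 1 })?;

    // Datanode side: decode the instruction, act on it, reply with its own JSON payload.
    let instruction: DemoInstruction = serde_json::from_str(&payload)?;
    assert_eq!(instruction, DemoInstruction::OpenRegion { region_number: 1 });
    let reply = serde_json::to_string(&DemoReply { result: true, error: None })?;

    // Metasrv side again: HeartbeatMailbox::json_reply performs the equivalent decode.
    let decoded: DemoReply = serde_json::from_str(&reply)?;
    assert!(decoded.result);
    Ok(())
}
```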
+ +use std::time::Duration; + +use api::v1::meta::MailboxMessage; +use async_trait::async_trait; +use common_meta::instruction::{Instruction, InstructionReply, SimpleReply}; +use common_meta::peer::Peer; +use common_meta::RegionIdent; +use common_telemetry::debug; +use serde::{Deserialize, Serialize}; +use snafu::ResultExt; + +use super::update_metadata::UpdateRegionMetadata; +use super::{RegionFailoverContext, State}; +use crate::error::{ + Error, Result, RetryLaterSnafu, SerializeToJsonSnafu, UnexpectedInstructionReplySnafu, +}; +use crate::handler::HeartbeatMailbox; +use crate::procedure::region_failover::OPEN_REGION_MESSAGE_TIMEOUT; +use crate::service::mailbox::{Channel, MailboxReceiver}; + +#[derive(Serialize, Deserialize, Debug)] +pub(super) struct ActivateRegion { + candidate: Peer, +} + +impl ActivateRegion { + pub(super) fn new(candidate: Peer) -> Self { + Self { candidate } + } + + async fn send_open_region_message( + &self, + ctx: &RegionFailoverContext, + failed_region: &RegionIdent, + timeout: Duration, + ) -> Result { + let instruction = Instruction::OpenRegion(failed_region.clone()); + + let msg = MailboxMessage::json_message( + "Activate Region", + &format!("Metasrv@{}", ctx.selector_ctx.server_addr), + &format!( + "Datanode-(id={}, addr={})", + self.candidate.id, self.candidate.addr + ), + common_time::util::current_time_millis(), + &instruction, + ) + .with_context(|_| SerializeToJsonSnafu { + input: instruction.to_string(), + })?; + + let ch = Channel::Datanode(self.candidate.id); + ctx.mailbox.send(&ch, msg, timeout).await + } + + async fn handle_response( + self, + mailbox_receiver: MailboxReceiver, + failed_region: &RegionIdent, + ) -> Result> { + match mailbox_receiver.await? { + Ok(msg) => { + debug!("Received activate region reply: {msg:?}"); + + let reply = HeartbeatMailbox::json_reply(&msg)?; + let InstructionReply::OpenRegion(SimpleReply { result, error }) = reply else { + return UnexpectedInstructionReplySnafu { + mailbox_message: msg.to_string(), + reason: "expect open region reply", + }.fail(); + }; + if result { + Ok(Box::new(UpdateRegionMetadata::new(self.candidate))) + } else { + // It could be that the region simply cannot be opened by the candidate, so + // retrying would be in vain. Then why not just end the failover procedure? Because + // we currently lack the methods or maintenance tools to manage running procedures, + // it is easier to let the procedure keep running (and retrying). + let reason = format!( + "Region {failed_region:?} is not opened by Datanode {:?}, error: {error:?}", + self.candidate, + ); + RetryLaterSnafu { reason }.fail() + } + } + Err(e) if matches!(e, Error::MailboxTimeout { ..
}) => { + let reason = format!( + "Mailbox received timeout for activate failed region {failed_region:?} on Datanode {:?}", + self.candidate, + ); + RetryLaterSnafu { reason }.fail() + } + Err(e) => Err(e), + } + } +} + +#[async_trait] +#[typetag::serde] +impl State for ActivateRegion { + async fn next( + mut self: Box, + ctx: &RegionFailoverContext, + failed_region: &RegionIdent, + ) -> Result> { + let mailbox_receiver = self + .send_open_region_message(ctx, failed_region, OPEN_REGION_MESSAGE_TIMEOUT) + .await?; + + self.handle_response(mailbox_receiver, failed_region).await + } +} + +#[cfg(test)] +mod tests { + use api::v1::meta::mailbox_message::Payload; + use common_meta::instruction::SimpleReply; + + use super::super::tests::{TestingEnv, TestingEnvBuilder}; + use super::*; + + #[tokio::test] + async fn test_activate_region_success() { + common_telemetry::init_default_ut_logging(); + + let TestingEnv { + context, + failed_region, + mut heartbeat_receivers, + } = TestingEnvBuilder::new().build().await; + + let candidate = 2; + let state = ActivateRegion::new(Peer::new(candidate, "")); + let mailbox_receiver = state + .send_open_region_message(&context, &failed_region, Duration::from_millis(100)) + .await + .unwrap(); + + let message_id = mailbox_receiver.message_id(); + + // verify that the open region message is sent + let rx = heartbeat_receivers.get_mut(&candidate).unwrap(); + let resp = rx.recv().await.unwrap().unwrap(); + let received = &resp.mailbox_message.unwrap(); + assert_eq!(received.id, message_id); + assert_eq!(received.subject, "Activate Region"); + assert_eq!(received.from, "Metasrv@127.0.0.1:3002"); + assert_eq!(received.to, "Datanode-(id=2, addr=)"); + assert_eq!( + received.payload, + Some(Payload::Json( + serde_json::to_string(&Instruction::OpenRegion(failed_region.clone())).unwrap(), + )) + ); + + // simulating response from Datanode + context + .mailbox + .on_recv( + message_id, + Ok(MailboxMessage { + id: message_id, + subject: "Activate Region".to_string(), + from: "Datanode-2".to_string(), + to: "Metasrv".to_string(), + timestamp_millis: common_time::util::current_time_millis(), + payload: Some(Payload::Json( + serde_json::to_string(&InstructionReply::OpenRegion(SimpleReply { + result: true, + error: None, + })) + .unwrap(), + )), + }), + ) + .await + .unwrap(); + + let next_state = state + .handle_response(mailbox_receiver, &failed_region) + .await + .unwrap(); + assert_eq!( + format!("{next_state:?}"), + r#"UpdateRegionMetadata { candidate: Peer { id: 2, addr: "" } }"# + ); + } + + #[tokio::test] + async fn test_activate_region_timeout() { + common_telemetry::init_default_ut_logging(); + + let TestingEnv { + context, + failed_region, + mut heartbeat_receivers, + } = TestingEnvBuilder::new().build().await; + + let candidate = 2; + let state = ActivateRegion::new(Peer::new(candidate, "")); + let mailbox_receiver = state + .send_open_region_message(&context, &failed_region, Duration::from_millis(100)) + .await + .unwrap(); + + // verify that the open region message is sent + let rx = heartbeat_receivers.get_mut(&candidate).unwrap(); + let resp = rx.recv().await.unwrap().unwrap(); + let received = &resp.mailbox_message.unwrap(); + assert_eq!(received.id, mailbox_receiver.message_id()); + assert_eq!(received.subject, "Activate Region"); + assert_eq!(received.from, "Metasrv@127.0.0.1:3002"); + assert_eq!(received.to, "Datanode-(id=2, addr=)"); + assert_eq!( + received.payload, + Some(Payload::Json( + 
serde_json::to_string(&Instruction::OpenRegion(failed_region.clone())).unwrap() + )) + ); + + let result = state + .handle_response(mailbox_receiver, &failed_region) + .await; + assert!(matches!(result, Err(Error::RetryLater { .. }))); + } +} diff --git a/src/meta-srv/src/procedure/region_failover/deactivate_region.rs b/src/meta-srv/src/procedure/region_failover/deactivate_region.rs new file mode 100644 index 0000000000..49b1095764 --- /dev/null +++ b/src/meta-srv/src/procedure/region_failover/deactivate_region.rs @@ -0,0 +1,253 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::time::Duration; + +use api::v1::meta::MailboxMessage; +use async_trait::async_trait; +use common_meta::instruction::{Instruction, InstructionReply, SimpleReply}; +use common_meta::peer::Peer; +use common_meta::RegionIdent; +use common_telemetry::debug; +use serde::{Deserialize, Serialize}; +use snafu::ResultExt; + +use super::activate_region::ActivateRegion; +use super::{RegionFailoverContext, State}; +use crate::error::{ + Error, Result, RetryLaterSnafu, SerializeToJsonSnafu, UnexpectedInstructionReplySnafu, +}; +use crate::handler::HeartbeatMailbox; +use crate::procedure::region_failover::CLOSE_REGION_MESSAGE_TIMEOUT; +use crate::service::mailbox::{Channel, MailboxReceiver}; + +#[derive(Serialize, Deserialize, Debug)] +pub(super) struct DeactivateRegion { + candidate: Peer, +} + +impl DeactivateRegion { + pub(super) fn new(candidate: Peer) -> Self { + Self { candidate } + } + + async fn send_close_region_message( + &self, + ctx: &RegionFailoverContext, + failed_region: &RegionIdent, + timeout: Duration, + ) -> Result { + let instruction = Instruction::CloseRegion(failed_region.clone()); + + let msg = MailboxMessage::json_message( + "Deactivate Region", + &format!("Metasrv@{}", ctx.selector_ctx.server_addr), + &format!("Datanode-{}", failed_region.datanode_id), + common_time::util::current_time_millis(), + &instruction, + ) + .with_context(|_| SerializeToJsonSnafu { + input: instruction.to_string(), + })?; + + let ch = Channel::Datanode(failed_region.datanode_id); + ctx.mailbox.send(&ch, msg, timeout).await + } + + async fn handle_response( + self, + mailbox_receiver: MailboxReceiver, + failed_region: &RegionIdent, + ) -> Result> { + match mailbox_receiver.await? { + Ok(msg) => { + debug!("Received deactivate region reply: {msg:?}"); + + let reply = HeartbeatMailbox::json_reply(&msg)?; + let InstructionReply::CloseRegion(SimpleReply { result, error }) = reply else { + return UnexpectedInstructionReplySnafu { + mailbox_message: msg.to_string(), + reason: "expect close region reply" + }.fail(); + }; + if result { + Ok(Box::new(ActivateRegion::new(self.candidate))) + } else { + // Under rare circumstances would a Datanode fail to close a Region. + // So simply retry. 
+ let reason = format!( + "Region {failed_region:?} is not closed by Datanode {}, error: {error:?}", + failed_region.datanode_id, + ); + RetryLaterSnafu { reason }.fail() + } + } + Err(e) if matches!(e, Error::MailboxTimeout { .. }) => { + // Since we are in a region failover situation, the Datanode on which the failed region + // resides might be unreachable. So region deactivation happens on a "best + // effort" basis: do not retry if the mailbox receive times out. + // However, if the region failover procedure is also used in a planned maintenance + // situation in the future, a proper retry is a must. + Ok(Box::new(ActivateRegion::new(self.candidate))) + } + Err(e) => Err(e), + } + } +} + +#[async_trait] +#[typetag::serde] +impl State for DeactivateRegion { + async fn next( + mut self: Box, + ctx: &RegionFailoverContext, + failed_region: &RegionIdent, + ) -> Result> { + let result = self + .send_close_region_message(ctx, failed_region, CLOSE_REGION_MESSAGE_TIMEOUT) + .await; + let mailbox_receiver = match result { + Ok(mailbox_receiver) => mailbox_receiver, + Err(e) if matches!(e, Error::PusherNotFound { .. }) => { + // The Datanode could be unreachable and deregistered from pushers, + // so simply advance to the next state here. + return Ok(Box::new(ActivateRegion::new(self.candidate))); + } + Err(e) => return Err(e), + }; + + self.handle_response(mailbox_receiver, failed_region).await + } +} + +#[cfg(test)] +mod tests { + use api::v1::meta::mailbox_message::Payload; + use common_meta::instruction::SimpleReply; + + use super::super::tests::{TestingEnv, TestingEnvBuilder}; + use super::*; + + #[tokio::test] + async fn test_deactivate_region_success() { + common_telemetry::init_default_ut_logging(); + + let TestingEnv { + context, + failed_region, + mut heartbeat_receivers, + } = TestingEnvBuilder::new().build().await; + + let state = DeactivateRegion::new(Peer::new(2, "")); + let mailbox_receiver = state + .send_close_region_message(&context, &failed_region, Duration::from_millis(100)) + .await + .unwrap(); + + let message_id = mailbox_receiver.message_id(); + + // verify that the close region message is sent + let rx = heartbeat_receivers + .get_mut(&failed_region.datanode_id) + .unwrap(); + let resp = rx.recv().await.unwrap().unwrap(); + let received = &resp.mailbox_message.unwrap(); + assert_eq!(received.id, message_id); + assert_eq!(received.subject, "Deactivate Region"); + assert_eq!(received.from, "Metasrv@127.0.0.1:3002"); + assert_eq!(received.to, "Datanode-1"); + assert_eq!( + received.payload, + Some(Payload::Json( + serde_json::to_string(&Instruction::CloseRegion(failed_region.clone())).unwrap(), + )) + ); + + // simulating response from Datanode + context + .mailbox + .on_recv( + message_id, + Ok(MailboxMessage { + id: message_id, + subject: "Deactivate Region".to_string(), + from: "Datanode-1".to_string(), + to: "Metasrv".to_string(), + timestamp_millis: common_time::util::current_time_millis(), + payload: Some(Payload::Json( + serde_json::to_string(&InstructionReply::CloseRegion(SimpleReply { + result: true, + error: None, + })) + .unwrap(), + )), + }), + ) + .await + .unwrap(); + + let next_state = state + .handle_response(mailbox_receiver, &failed_region) + .await + .unwrap(); + assert_eq!( + format!("{next_state:?}"), + r#"ActivateRegion { candidate: Peer { id: 2, addr: "" } }"# + ); + } + + #[tokio::test] + async fn test_deactivate_region_timeout() { + common_telemetry::init_default_ut_logging(); + + let TestingEnv { + context, + failed_region, + mut
heartbeat_receivers, + } = TestingEnvBuilder::new().build().await; + + let state = DeactivateRegion::new(Peer::new(2, "")); + let mailbox_receiver = state + .send_close_region_message(&context, &failed_region, Duration::from_millis(100)) + .await + .unwrap(); + + // verify that the open region message is sent + let rx = heartbeat_receivers + .get_mut(&failed_region.datanode_id) + .unwrap(); + let resp = rx.recv().await.unwrap().unwrap(); + let received = &resp.mailbox_message.unwrap(); + assert_eq!(received.id, mailbox_receiver.message_id()); + assert_eq!(received.subject, "Deactivate Region"); + assert_eq!(received.from, "Metasrv@127.0.0.1:3002"); + assert_eq!(received.to, "Datanode-1"); + assert_eq!( + received.payload, + Some(Payload::Json( + serde_json::to_string(&Instruction::CloseRegion(failed_region.clone())).unwrap(), + )) + ); + + let next_state = state + .handle_response(mailbox_receiver, &failed_region) + .await + .unwrap(); + // Timeout or not, proceed to `ActivateRegion`. + assert_eq!( + format!("{next_state:?}"), + r#"ActivateRegion { candidate: Peer { id: 2, addr: "" } }"# + ); + } +} diff --git a/src/meta-srv/src/procedure/region_failover/failover_end.rs b/src/meta-srv/src/procedure/region_failover/failover_end.rs new file mode 100644 index 0000000000..221d6b0c78 --- /dev/null +++ b/src/meta-srv/src/procedure/region_failover/failover_end.rs @@ -0,0 +1,40 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use async_trait::async_trait; +use common_meta::RegionIdent; +use common_procedure::Status; +use serde::{Deserialize, Serialize}; + +use super::{RegionFailoverContext, State}; +use crate::error::Result; + +#[derive(Serialize, Deserialize, Debug)] +pub(super) struct RegionFailoverEnd; + +#[async_trait] +#[typetag::serde] +impl State for RegionFailoverEnd { + async fn next( + mut self: Box, + _: &RegionFailoverContext, + _: &RegionIdent, + ) -> Result> { + Ok(self) + } + + fn status(&self) -> Status { + Status::Done + } +} diff --git a/src/meta-srv/src/procedure/region_failover/failover_start.rs b/src/meta-srv/src/procedure/region_failover/failover_start.rs new file mode 100644 index 0000000000..f9ecde0e29 --- /dev/null +++ b/src/meta-srv/src/procedure/region_failover/failover_start.rs @@ -0,0 +1,132 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
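(Aside, not part of the patch: the failover steps above all follow the same shape: a state is a trait object that, when asked, returns its successor, and the procedure keeps stepping until a state reports `Status::Done`. The self-contained sketch below illustrates only that control flow; the names `SimpleState`, `Deactivate`, `Activate`, `Done` and `run_to_completion` are invented for the example. The real code uses the async `State` trait with `RegionFailoverContext`, and registers every state with `#[typetag::serde]` so the current state can be serialized as part of the procedure's persisted data.)

use std::fmt::Debug;

// A simplified, synchronous stand-in for the `State` trait: a state hands back its
// successor and says whether the procedure has finished.
trait SimpleState: Debug {
    fn next(self: Box<Self>) -> Box<dyn SimpleState>;
    fn is_done(&self) -> bool {
        false
    }
}

#[derive(Debug)]
struct Deactivate;
#[derive(Debug)]
struct Activate;
#[derive(Debug)]
struct Done;

impl SimpleState for Deactivate {
    fn next(self: Box<Self>) -> Box<dyn SimpleState> {
        // Close the region on the failed Datanode first...
        Box::new(Activate)
    }
}

impl SimpleState for Activate {
    fn next(self: Box<Self>) -> Box<dyn SimpleState> {
        // ...then open it on the candidate and finish.
        Box::new(Done)
    }
}

impl SimpleState for Done {
    fn next(self: Box<Self>) -> Box<dyn SimpleState> {
        self
    }
    fn is_done(&self) -> bool {
        true
    }
}

// The driver loop: keep asking the current state for its successor until done.
fn run_to_completion(mut state: Box<dyn SimpleState>) {
    while !state.is_done() {
        state = state.next();
    }
    println!("failover finished in state {state:?}");
}

fn main() {
    run_to_completion(Box::new(Deactivate));
}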
+ +use async_trait::async_trait; +use common_error::prelude::{ErrorExt, StatusCode}; +use common_meta::peer::Peer; +use common_meta::RegionIdent; +use common_telemetry::info; +use serde::{Deserialize, Serialize}; +use snafu::ensure; + +use super::deactivate_region::DeactivateRegion; +use super::{RegionFailoverContext, State}; +use crate::error::{RegionFailoverCandidatesNotFoundSnafu, Result, RetryLaterSnafu}; + +#[derive(Serialize, Deserialize, Debug)] +pub(super) struct RegionFailoverStart { + failover_candidate: Option, +} + +impl RegionFailoverStart { + pub(super) fn new() -> Self { + Self { + failover_candidate: None, + } + } + + async fn choose_candidate( + &mut self, + ctx: &RegionFailoverContext, + failed_region: &RegionIdent, + ) -> Result { + if let Some(candidate) = self.failover_candidate.clone() { + return Ok(candidate); + } + + let cluster_id = failed_region.cluster_id; + let candidates = ctx + .selector + .select(cluster_id, &ctx.selector_ctx) + .await? + .iter() + .filter_map(|p| { + if p.id != failed_region.datanode_id { + Some(p.clone().into()) + } else { + None + } + }) + .collect::>(); + ensure!( + !candidates.is_empty(), + RegionFailoverCandidatesNotFoundSnafu { + failed_region: format!("{failed_region:?}"), + } + ); + + // Safety: indexing is guarded by the "ensure!" above. + let candidate = &candidates[0]; + self.failover_candidate = Some(candidate.clone()); + info!("Choose failover candidate datanode {candidate:?} for region: {failed_region}"); + Ok(candidate.clone()) + } +} + +#[async_trait] +#[typetag::serde] +impl State for RegionFailoverStart { + async fn next( + mut self: Box, + ctx: &RegionFailoverContext, + failed_region: &RegionIdent, + ) -> Result> { + let candidate = self + .choose_candidate(ctx, failed_region) + .await + .map_err(|e| { + if e.status_code() == StatusCode::RuntimeResourcesExhausted { + RetryLaterSnafu { + reason: format!("{e}"), + } + .build() + } else { + e + } + })?; + return Ok(Box::new(DeactivateRegion::new(candidate))); + } +} + +#[cfg(test)] +mod tests { + use super::super::tests::{TestingEnv, TestingEnvBuilder}; + use super::*; + + #[tokio::test] + async fn test_choose_failover_candidate() { + common_telemetry::init_default_ut_logging(); + + let TestingEnv { + context, + failed_region, + heartbeat_receivers: _, + } = TestingEnvBuilder::new().build().await; + + let mut state = RegionFailoverStart::new(); + assert!(state.failover_candidate.is_none()); + + let candidate = state + .choose_candidate(&context, &failed_region) + .await + .unwrap(); + assert_ne!(candidate.id, failed_region.datanode_id); + + let candidate_again = state + .choose_candidate(&context, &failed_region) + .await + .unwrap(); + assert_eq!(candidate, candidate_again); + } +} diff --git a/src/meta-srv/src/procedure/region_failover/update_metadata.rs b/src/meta-srv/src/procedure/region_failover/update_metadata.rs new file mode 100644 index 0000000000..36570cf5fe --- /dev/null +++ b/src/meta-srv/src/procedure/region_failover/update_metadata.rs @@ -0,0 +1,415 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use api::v1::meta::{TableName, TableRouteValue}; +use async_trait::async_trait; +use catalog::helper::TableGlobalKey; +use common_meta::peer::Peer; +use common_meta::router::TableRoute; +use common_meta::RegionIdent; +use common_telemetry::info; +use serde::{Deserialize, Serialize}; +use snafu::{OptionExt, ResultExt}; + +use super::failover_end::RegionFailoverEnd; +use super::{RegionFailoverContext, State}; +use crate::error::{ + CorruptedTableRouteSnafu, Result, RetryLaterSnafu, TableNotFoundSnafu, + TableRouteConversionSnafu, +}; +use crate::keys::TableRouteKey; +use crate::table_routes; + +#[derive(Serialize, Deserialize, Debug)] +pub(super) struct UpdateRegionMetadata { + candidate: Peer, +} + +impl UpdateRegionMetadata { + pub(super) fn new(candidate: Peer) -> Self { + Self { candidate } + } + + async fn update_meta( + &self, + ctx: &RegionFailoverContext, + failed_region: &RegionIdent, + ) -> Result<()> { + self.update_table_global_value(ctx, failed_region).await?; + self.update_table_route(ctx, failed_region).await?; + Ok(()) + } + + async fn update_table_global_value( + &self, + ctx: &RegionFailoverContext, + failed_region: &RegionIdent, + ) -> Result<()> { + let key = TableGlobalKey { + catalog_name: failed_region.catalog.clone(), + schema_name: failed_region.schema.clone(), + table_name: failed_region.table.clone(), + }; + let mut value = table_routes::get_table_global_value(&ctx.selector_ctx.kv_store, &key) + .await? + .with_context(|| TableNotFoundSnafu { + name: common_catalog::format_full_table_name( + &key.catalog_name, + &key.schema_name, + &key.table_name, + ), + })?; + + if let Some(mut region_numbers) = value.regions_id_map.remove(&failed_region.datanode_id) { + region_numbers.retain(|x| *x != failed_region.region_number); + + if !region_numbers.is_empty() { + value + .regions_id_map + .insert(failed_region.datanode_id, region_numbers); + } + } + + let region_numbers = value + .regions_id_map + .entry(self.candidate.id) + .or_insert_with(Vec::new); + region_numbers.push(failed_region.region_number); + + table_routes::put_table_global_value(&ctx.selector_ctx.kv_store, &key, &value).await?; + info!( + "Region mappings in table global value (key = '{key}') are updated to {:?}. 
\ + Failed region {} was on Datanode {}.", + value.regions_id_map, failed_region.region_number, failed_region.datanode_id, + ); + Ok(()) + } + + async fn update_table_route( + &self, + ctx: &RegionFailoverContext, + failed_region: &RegionIdent, + ) -> Result<()> { + let table_name = TableName { + catalog_name: failed_region.catalog.clone(), + schema_name: failed_region.schema.clone(), + table_name: failed_region.table.clone(), + }; + let key = TableRouteKey::with_table_name(failed_region.table_id as _, &table_name); + let value = table_routes::get_table_route_value(&ctx.selector_ctx.kv_store, &key).await?; + + let table_route = value + .table_route + .with_context(|| CorruptedTableRouteSnafu { + key: key.key(), + reason: "'table_route' is empty", + })?; + let mut table_route = TableRoute::try_from_raw(&value.peers, table_route) + .context(TableRouteConversionSnafu)?; + + for region_route in table_route.region_routes.iter_mut() { + if region_route.region.id == failed_region.region_number as u64 { + region_route.leader_peer = Some(self.candidate.clone()); + break; + } + } + + pretty_log_table_route_change(&key, &table_route, failed_region); + + let (peers, table_route) = table_route + .try_into_raw() + .context(TableRouteConversionSnafu)?; + + let value = TableRouteValue { + peers, + table_route: Some(table_route), + }; + table_routes::put_table_route_value(&ctx.selector_ctx.kv_store, &key, value).await?; + Ok(()) + } +} + +fn pretty_log_table_route_change( + key: &TableRouteKey, + table_route: &TableRoute, + failed_region: &RegionIdent, +) { + let region_routes = table_route + .region_routes + .iter() + .map(|x| { + format!( + "{{region: {}, leader: {}, followers: [{}]}}", + x.region.id, + x.leader_peer + .as_ref() + .map(|p| p.id.to_string()) + .unwrap_or_else(|| "?".to_string()), + x.follower_peers + .iter() + .map(|p| p.id.to_string()) + .collect::>() + .join(","), + ) + }) + .collect::>(); + + info!( + "Updating region routes in table route value (key = '{}') to [{}]. 
\ + Failed region {} was on Datanode {}.", + key.key(), + region_routes.join(", "), + failed_region.region_number, + failed_region.datanode_id, + ); +} + +#[async_trait] +#[typetag::serde] +impl State for UpdateRegionMetadata { + async fn next( + mut self: Box, + ctx: &RegionFailoverContext, + failed_region: &RegionIdent, + ) -> Result> { + self.update_meta(ctx, failed_region).await.map_err(|e| { + RetryLaterSnafu { + reason: format!( + "Failed to update metadata for failed region: {}, error: {}", + failed_region, e + ), + } + .build() + })?; + Ok(Box::new(RegionFailoverEnd)) + } +} + +#[cfg(test)] +mod tests { + use api::v1::meta::TableRouteValue; + use catalog::helper::TableGlobalValue; + + use super::super::tests::{TestingEnv, TestingEnvBuilder}; + use super::*; + use crate::table_routes::tests::new_region_route; + + #[tokio::test] + async fn test_update_table_global_value() { + common_telemetry::init_default_ut_logging(); + + async fn test(env: TestingEnv, candidate: u64) -> TableGlobalValue { + let TestingEnv { + context, + failed_region, + heartbeat_receivers: _, + } = env; + + let key = TableGlobalKey { + catalog_name: failed_region.catalog.clone(), + schema_name: failed_region.schema.clone(), + table_name: failed_region.table.clone(), + }; + + let original = + table_routes::get_table_global_value(&context.selector_ctx.kv_store, &key) + .await + .unwrap() + .unwrap(); + + let state = UpdateRegionMetadata::new(Peer::new(candidate, "")); + state + .update_table_global_value(&context, &failed_region) + .await + .unwrap(); + + let updated = + table_routes::get_table_global_value(&context.selector_ctx.kv_store, &key) + .await + .unwrap() + .unwrap(); + + // verifies that other data stay untouched + assert_eq!(original.node_id, updated.node_id); + assert_eq!(original.table_info, updated.table_info); + updated + } + + // Region distribution: + // Datanode => Regions + // 1 => 1, 2 + // 2 => 3 + // 3 => 4 + + // Testing failed region 1 moves to Datanode 2. + let env = TestingEnvBuilder::new().with_failed_region(1).build().await; + let updated = test(env, 2).await; + + let new_region_id_map = updated.regions_id_map; + assert_eq!(new_region_id_map.len(), 3); + assert_eq!(new_region_id_map.get(&1), Some(&vec![2])); + assert_eq!(new_region_id_map.get(&2), Some(&vec![3, 1])); + assert_eq!(new_region_id_map.get(&3), Some(&vec![4])); + + // Testing failed region 3 moves to Datanode 3. + let env = TestingEnvBuilder::new().with_failed_region(3).build().await; + let updated = test(env, 3).await; + + let new_region_id_map = updated.regions_id_map; + assert_eq!(new_region_id_map.len(), 2); + assert_eq!(new_region_id_map.get(&1), Some(&vec![1, 2])); + assert_eq!(new_region_id_map.get(&3), Some(&vec![4, 3])); + + // Testing failed region 1 moves to a new Datanode, 4. + let env = TestingEnvBuilder::new().with_failed_region(1).build().await; + let updated = test(env, 4).await; + + let new_region_id_map = updated.regions_id_map; + assert_eq!(new_region_id_map.len(), 4); + assert_eq!(new_region_id_map.get(&1), Some(&vec![2])); + assert_eq!(new_region_id_map.get(&2), Some(&vec![3])); + assert_eq!(new_region_id_map.get(&3), Some(&vec![4])); + assert_eq!(new_region_id_map.get(&4), Some(&vec![1])); + + // Testing failed region 3 moves to a new Datanode, 4. 
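+ // Datanode 2 held only the failed region 3, so its entry should disappear from the map entirely, while a new entry for Datanode 4 is created.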
+ let env = TestingEnvBuilder::new().with_failed_region(3).build().await; + let updated = test(env, 4).await; + + let new_region_id_map = updated.regions_id_map; + assert_eq!(new_region_id_map.len(), 3); + assert_eq!(new_region_id_map.get(&1), Some(&vec![1, 2])); + assert_eq!(new_region_id_map.get(&3), Some(&vec![4])); + assert_eq!(new_region_id_map.get(&4), Some(&vec![3])); + } + + #[tokio::test] + async fn test_update_table_route() { + common_telemetry::init_default_ut_logging(); + + async fn test(env: TestingEnv, candidate: u64) -> TableRouteValue { + let TestingEnv { + context, + failed_region, + heartbeat_receivers: _, + } = env; + + let state = UpdateRegionMetadata::new(Peer::new(candidate, "")); + state + .update_table_route(&context, &failed_region) + .await + .unwrap(); + + let key = TableRouteKey { + table_id: failed_region.table_id as u64, + catalog_name: &failed_region.catalog, + schema_name: &failed_region.schema, + table_name: &failed_region.table, + }; + table_routes::get_table_route_value(&context.selector_ctx.kv_store, &key) + .await + .unwrap() + } + + // Original region routes: + // region number => leader node + // 1 => 1 + // 2 => 1 + // 3 => 2 + // 4 => 3 + + // Testing failed region 1 moves to Datanode 2. + let env = TestingEnvBuilder::new().with_failed_region(1).build().await; + let updated = test(env, 2).await; + let actual = &updated.table_route.as_ref().unwrap().region_routes; + + // Expected region routes: + // region number => leader node + // 1 => 2 + // 2 => 1 + // 3 => 2 + // 4 => 3 + let peers = &updated.peers; + assert_eq!(peers.len(), 3); + let expected = vec![ + new_region_route(1, peers, 2), + new_region_route(2, peers, 1), + new_region_route(3, peers, 2), + new_region_route(4, peers, 3), + ]; + assert_eq!(actual, &expected); + + // Testing failed region 3 moves to Datanode 3. + let env = TestingEnvBuilder::new().with_failed_region(3).build().await; + let updated = test(env, 3).await; + let actual = &updated.table_route.as_ref().unwrap().region_routes; + + // Expected region routes: + // region number => leader node + // 1 => 1 + // 2 => 1 + // 3 => 3 + // 4 => 3 + let peers = &updated.peers; + assert_eq!(peers.len(), 2); + let expected = vec![ + new_region_route(1, peers, 1), + new_region_route(2, peers, 1), + new_region_route(3, peers, 3), + new_region_route(4, peers, 3), + ]; + assert_eq!(actual, &expected); + + // Testing failed region 1 moves to a new Datanode, 4. + let env = TestingEnvBuilder::new().with_failed_region(1).build().await; + let updated = test(env, 4).await; + let actual = &updated.table_route.as_ref().unwrap().region_routes; + + // Expected region routes: + // region number => leader node + // 1 => 4 + // 2 => 1 + // 3 => 2 + // 4 => 3 + let peers = &updated.peers; + assert_eq!(peers.len(), 4); + let expected = vec![ + new_region_route(1, peers, 4), + new_region_route(2, peers, 1), + new_region_route(3, peers, 2), + new_region_route(4, peers, 3), + ]; + assert_eq!(actual, &expected); + + // Testing failed region 3 moves to a new Datanode, 4. 
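+ // Datanode 2 led only region 3, so after the move it should drop out of the peers list, leaving three peers.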
+ let env = TestingEnvBuilder::new().with_failed_region(3).build().await; + let updated = test(env, 4).await; + let actual = &updated.table_route.as_ref().unwrap().region_routes; + + // Expected region routes: + // region number => leader node + // 1 => 1 + // 2 => 1 + // 3 => 4 + // 4 => 3 + let peers = &updated.peers; + assert_eq!(peers.len(), 3); + let expected = vec![ + new_region_route(1, peers, 1), + new_region_route(2, peers, 1), + new_region_route(3, peers, 4), + new_region_route(4, peers, 3), + ]; + assert_eq!(actual, &expected); + } +} diff --git a/src/meta-srv/src/selector/lease_based.rs b/src/meta-srv/src/selector/lease_based.rs index 061bcd9f02..13c1c036b8 100644 --- a/src/meta-srv/src/selector/lease_based.rs +++ b/src/meta-srv/src/selector/lease_based.rs @@ -18,14 +18,14 @@ use common_time::util as time_util; use crate::error::Result; use crate::keys::{LeaseKey, LeaseValue}; use crate::lease; -use crate::metasrv::Context; +use crate::metasrv::SelectorContext; use crate::selector::{Namespace, Selector}; pub struct LeaseBasedSelector; #[async_trait::async_trait] impl Selector for LeaseBasedSelector { - type Context = Context; + type Context = SelectorContext; type Output = Vec; async fn select(&self, ns: Namespace, ctx: &Self::Context) -> Result { diff --git a/src/meta-srv/src/selector/load_based.rs b/src/meta-srv/src/selector/load_based.rs index d8de0ab84f..6a12f4b1a1 100644 --- a/src/meta-srv/src/selector/load_based.rs +++ b/src/meta-srv/src/selector/load_based.rs @@ -22,7 +22,7 @@ use crate::cluster::MetaPeerClient; use crate::error::Result; use crate::keys::{LeaseKey, LeaseValue, StatKey}; use crate::lease; -use crate::metasrv::Context; +use crate::metasrv::SelectorContext; use crate::selector::{Namespace, Selector}; const MAX_REGION_NUMBER: u64 = u64::MAX; @@ -33,7 +33,7 @@ pub struct LoadBasedSelector { #[async_trait::async_trait] impl Selector for LoadBasedSelector { - type Context = Context; + type Context = SelectorContext; type Output = Vec; async fn select(&self, ns: Namespace, ctx: &Self::Context) -> Result { diff --git a/src/meta-srv/src/service/heartbeat.rs b/src/meta-srv/src/service/heartbeat.rs index d82ba22bef..1d0d995033 100644 --- a/src/meta-srv/src/service/heartbeat.rs +++ b/src/meta-srv/src/service/heartbeat.rs @@ -175,7 +175,11 @@ mod tests { async fn test_ask_leader() { let kv_store = Arc::new(MemStore::new()); - let meta_srv = MetaSrvBuilder::new().kv_store(kv_store).build().await; + let meta_srv = MetaSrvBuilder::new() + .kv_store(kv_store) + .build() + .await + .unwrap(); let req = AskLeaderRequest { header: Some(RequestHeader::new((1, 1), Role::Datanode)), diff --git a/src/meta-srv/src/service/mailbox.rs b/src/meta-srv/src/service/mailbox.rs index 32535e5947..538da3e552 100644 --- a/src/meta-srv/src/service/mailbox.rs +++ b/src/meta-srv/src/service/mailbox.rs @@ -17,7 +17,7 @@ use std::sync::Arc; use std::task::{Context, Poll}; use std::time::Duration; -use api::v1::meta::MailboxMessage; +use api::v1::meta::{MailboxMessage, Role}; use futures::Future; use tokio::sync::oneshot; @@ -32,6 +32,15 @@ pub enum Channel { Frontend(u64), } +impl Channel { + pub(crate) fn pusher_id(&self) -> String { + match self { + Channel::Datanode(id) => format!("{}-{}", Role::Datanode as i32, id), + Channel::Frontend(id) => format!("{}-{}", Role::Frontend as i32, id), + } + } +} + pub struct MailboxReceiver { message_id: MessageId, rx: oneshot::Receiver>, @@ -74,3 +83,14 @@ pub trait Mailbox: Send + Sync { async fn on_recv(&self, id: MessageId, maybe_msg: Result) -> 
Result<()>; } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_channel_pusher_id() { + assert_eq!(Channel::Datanode(42).pusher_id(), "0-42"); + assert_eq!(Channel::Frontend(42).pusher_id(), "1-42"); + } +} diff --git a/src/meta-srv/src/service/router.rs b/src/meta-srv/src/service/router.rs index bc4385b60e..f0a95cd3cc 100644 --- a/src/meta-srv/src/service/router.rs +++ b/src/meta-srv/src/service/router.rs @@ -17,7 +17,7 @@ use std::collections::HashMap; use api::v1::meta::{ router_server, BatchPutRequest, CreateRequest, DeleteRequest, Error, KeyValue, MoveValueRequest, Peer, PeerDict, Region, RegionRoute, ResponseHeader, RouteRequest, - RouteResponse, Table, TableName, TableRoute, TableRouteValue, + RouteResponse, Table, TableRoute, TableRouteValue, }; use catalog::helper::{TableGlobalKey, TableGlobalValue}; use common_telemetry::{timer, warn}; @@ -28,12 +28,12 @@ use tonic::{Request, Response}; use crate::error; use crate::error::Result; use crate::keys::TableRouteKey; -use crate::metasrv::{Context, MetaSrv, SelectorRef}; +use crate::metasrv::{Context, MetaSrv, SelectorContext, SelectorRef}; use crate::metrics::METRIC_META_ROUTE_REQUEST; use crate::sequence::SequenceRef; -use crate::service::store::ext::KvStoreExt; use crate::service::store::kv::KvStoreRef; use crate::service::GrpcResult; +use crate::table_routes::{get_table_global_value, get_table_route_value}; #[async_trait::async_trait] impl router_server::Router for MetaSrv { @@ -51,7 +51,13 @@ impl router_server::Router for MetaSrv { ); let table_name = table_name.clone().context(error::EmptyTableNameSnafu)?; - let ctx = self.create_ctx(table_name); + let ctx = SelectorContext { + datanode_lease_secs: self.options().datanode_lease_secs, + server_addr: self.options().server_addr.clone(), + kv_store: self.kv_store(), + catalog: Some(table_name.catalog_name), + schema: Some(table_name.schema_name), + }; let selector = self.selector(); let table_id_sequence = self.table_id_sequence(); @@ -92,24 +98,9 @@ impl router_server::Router for MetaSrv { } } -impl MetaSrv { - fn create_ctx(&self, table_name: TableName) -> Context { - let mut ctx = self.new_ctx(); - let TableName { - catalog_name, - schema_name, - table_name, - } = table_name; - ctx.catalog = Some(catalog_name); - ctx.schema = Some(schema_name); - ctx.table = Some(table_name); - ctx - } -} - async fn handle_create( req: CreateRequest, - ctx: Context, + ctx: SelectorContext, selector: SelectorRef, table_id_sequence: SequenceRef, ) -> Result { @@ -374,23 +365,6 @@ async fn fetch_tables( Ok(tables) } -async fn get_table_route_value( - kv_store: &KvStoreRef, - key: &TableRouteKey<'_>, -) -> Result { - let trkv = kv_store - .get(key.key().into_bytes()) - .await? 
- .context(error::TableRouteNotFoundSnafu { key: key.key() })?; - let trv: TableRouteValue = trkv - .value - .as_slice() - .try_into() - .context(error::DecodeTableRouteSnafu)?; - - Ok(trv) -} - async fn remove_table_route_value( kv_store: &KvStoreRef, key: &TableRouteKey<'_>, @@ -422,22 +396,6 @@ async fn remove_table_global_value( Ok((kv.0, value)) } -async fn get_table_global_value( - kv_store: &KvStoreRef, - key: &TableGlobalKey, -) -> Result> { - let tg_key = format!("{key}").into_bytes(); - let tkv = kv_store.get(tg_key).await?; - match tkv { - Some(tkv) => { - let tv = - TableGlobalValue::from_bytes(tkv.value).context(error::InvalidCatalogValueSnafu)?; - Ok(Some(tv)) - } - None => Ok(None), - } -} - async fn move_value( kv_store: &KvStoreRef, from_key: impl Into>, diff --git a/src/meta-srv/src/service/store.rs b/src/meta-srv/src/service/store.rs index bbc6f248e8..926bd2f663 100644 --- a/src/meta-srv/src/service/store.rs +++ b/src/meta-srv/src/service/store.rs @@ -104,13 +104,20 @@ mod tests { use tonic::IntoRequest; use crate::metasrv::builder::MetaSrvBuilder; + use crate::metasrv::MetaSrv; use crate::service::store::memory::MemStore; + async fn new_meta_srv() -> MetaSrv { + MetaSrvBuilder::new() + .kv_store(Arc::new(MemStore::new())) + .build() + .await + .unwrap() + } + #[tokio::test] async fn test_range() { - let kv_store = Arc::new(MemStore::new()); - - let meta_srv = MetaSrvBuilder::new().kv_store(kv_store).build().await; + let meta_srv = new_meta_srv().await; let req = RangeRequest::default(); let res = meta_srv.range(req.into_request()).await; @@ -120,9 +127,7 @@ mod tests { #[tokio::test] async fn test_put() { - let kv_store = Arc::new(MemStore::new()); - - let meta_srv = MetaSrvBuilder::new().kv_store(kv_store).build().await; + let meta_srv = new_meta_srv().await; let req = PutRequest::default(); let res = meta_srv.put(req.into_request()).await; @@ -132,9 +137,7 @@ mod tests { #[tokio::test] async fn test_batch_get() { - let kv_store = Arc::new(MemStore::new()); - - let meta_srv = MetaSrvBuilder::new().kv_store(kv_store).build().await; + let meta_srv = new_meta_srv().await; let req = BatchGetRequest::default(); let res = meta_srv.batch_get(req.into_request()).await; @@ -144,9 +147,7 @@ mod tests { #[tokio::test] async fn test_batch_put() { - let kv_store = Arc::new(MemStore::new()); - - let meta_srv = MetaSrvBuilder::new().kv_store(kv_store).build().await; + let meta_srv = new_meta_srv().await; let req = BatchPutRequest::default(); let res = meta_srv.batch_put(req.into_request()).await; @@ -156,9 +157,7 @@ mod tests { #[tokio::test] async fn test_batch_delete() { - let kv_store = Arc::new(MemStore::new()); - - let meta_srv = MetaSrvBuilder::new().kv_store(kv_store).build().await; + let meta_srv = new_meta_srv().await; let req = BatchDeleteRequest::default(); let res = meta_srv.batch_delete(req.into_request()).await; @@ -168,9 +167,7 @@ mod tests { #[tokio::test] async fn test_compare_and_put() { - let kv_store = Arc::new(MemStore::new()); - - let meta_srv = MetaSrvBuilder::new().kv_store(kv_store).build().await; + let meta_srv = new_meta_srv().await; let req = CompareAndPutRequest::default(); let res = meta_srv.compare_and_put(req.into_request()).await; @@ -180,9 +177,7 @@ mod tests { #[tokio::test] async fn test_delete_range() { - let kv_store = Arc::new(MemStore::new()); - - let meta_srv = MetaSrvBuilder::new().kv_store(kv_store).build().await; + let meta_srv = new_meta_srv().await; let req = DeleteRangeRequest::default(); let res = 
meta_srv.delete_range(req.into_request()).await; @@ -192,9 +187,7 @@ mod tests { #[tokio::test] async fn test_move_value() { - let kv_store = Arc::new(MemStore::new()); - - let meta_srv = MetaSrvBuilder::new().kv_store(kv_store).build().await; + let meta_srv = new_meta_srv().await; let req = MoveValueRequest::default(); let res = meta_srv.move_value(req.into_request()).await; diff --git a/src/meta-srv/src/table_routes.rs b/src/meta-srv/src/table_routes.rs new file mode 100644 index 0000000000..12fe8fa7b5 --- /dev/null +++ b/src/meta-srv/src/table_routes.rs @@ -0,0 +1,268 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use api::v1::meta::{PutRequest, TableRouteValue}; +use catalog::helper::{TableGlobalKey, TableGlobalValue}; +use snafu::{OptionExt, ResultExt}; + +use crate::error::{ + DecodeTableRouteSnafu, InvalidCatalogValueSnafu, Result, TableRouteNotFoundSnafu, +}; +use crate::keys::TableRouteKey; +use crate::service::store::ext::KvStoreExt; +use crate::service::store::kv::KvStoreRef; + +pub async fn get_table_global_value( + kv_store: &KvStoreRef, + key: &TableGlobalKey, +) -> Result> { + let key = key.to_string().into_bytes(); + let kv = kv_store.get(key).await?; + kv.map(|kv| TableGlobalValue::from_bytes(kv.value).context(InvalidCatalogValueSnafu)) + .transpose() +} + +pub(crate) async fn put_table_global_value( + kv_store: &KvStoreRef, + key: &TableGlobalKey, + value: &TableGlobalValue, +) -> Result<()> { + let req = PutRequest { + header: None, + key: key.to_string().into_bytes(), + value: value.as_bytes().context(InvalidCatalogValueSnafu)?, + prev_kv: false, + }; + let _ = kv_store.put(req).await; + Ok(()) +} + +pub(crate) async fn get_table_route_value( + kv_store: &KvStoreRef, + key: &TableRouteKey<'_>, +) -> Result { + let kv = kv_store + .get(key.key().into_bytes()) + .await? 
+ .with_context(|| TableRouteNotFoundSnafu { key: key.key() })?; + kv.value + .as_slice() + .try_into() + .context(DecodeTableRouteSnafu) +} + +pub(crate) async fn put_table_route_value( + kv_store: &KvStoreRef, + key: &TableRouteKey<'_>, + value: TableRouteValue, +) -> Result<()> { + let req = PutRequest { + header: None, + key: key.key().into_bytes(), + value: value.into(), + prev_kv: false, + }; + let _ = kv_store.put(req).await?; + Ok(()) +} + +#[cfg(test)] +pub(crate) mod tests { + use std::collections::HashMap; + use std::sync::Arc; + + use api::v1::meta::{Peer, Region, RegionRoute, Table, TableName, TableRoute}; + use chrono::DateTime; + use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, MITO_ENGINE}; + use datatypes::data_type::ConcreteDataType; + use datatypes::schema::{ColumnSchema, RawSchema}; + use table::metadata::{RawTableInfo, RawTableMeta, TableIdent, TableType}; + use table::requests::TableOptions; + + use super::*; + use crate::error; + use crate::service::store::memory::MemStore; + + pub(crate) async fn prepare_table_global_value( + kv_store: &KvStoreRef, + table: &str, + ) -> (TableGlobalKey, TableGlobalValue) { + // Region distribution: + // Datanode => Regions + // 1 => 1, 2 + // 2 => 3 + // 3 => 4 + let regions_id_map = HashMap::from([(1, vec![1, 2]), (2, vec![3]), (3, vec![4])]); + + let key = TableGlobalKey { + catalog_name: DEFAULT_CATALOG_NAME.to_string(), + schema_name: DEFAULT_SCHEMA_NAME.to_string(), + table_name: table.to_string(), + }; + let value = TableGlobalValue { + node_id: 1, + regions_id_map, + table_info: RawTableInfo { + ident: TableIdent::new(1), + name: table.to_string(), + desc: None, + catalog_name: DEFAULT_CATALOG_NAME.to_string(), + schema_name: DEFAULT_SCHEMA_NAME.to_string(), + meta: RawTableMeta { + schema: RawSchema::new(vec![ColumnSchema::new( + "a", + ConcreteDataType::string_datatype(), + true, + )]), + primary_key_indices: vec![], + value_indices: vec![], + engine: MITO_ENGINE.to_string(), + next_column_id: 1, + region_numbers: vec![1, 2, 3, 4], + engine_options: HashMap::new(), + options: TableOptions::default(), + created_on: DateTime::default(), + }, + table_type: TableType::Base, + }, + }; + put_table_global_value(kv_store, &key, &value) + .await + .unwrap(); + (key, value) + } + + pub(crate) async fn prepare_table_route_value<'a>( + kv_store: &'a KvStoreRef, + table: &'a str, + ) -> (TableRouteKey<'a>, TableRouteValue) { + let key = TableRouteKey { + table_id: 1, + catalog_name: DEFAULT_CATALOG_NAME, + schema_name: DEFAULT_SCHEMA_NAME, + table_name: table, + }; + + let peers = (1..=3) + .map(|id| Peer { + id, + addr: "".to_string(), + }) + .collect::>(); + + // region routes: + // region number => leader node + // 1 => 1 + // 2 => 1 + // 3 => 2 + // 4 => 3 + let region_routes = vec![ + new_region_route(1, &peers, 1), + new_region_route(2, &peers, 1), + new_region_route(3, &peers, 2), + new_region_route(4, &peers, 3), + ]; + let table_route = TableRoute { + table: Some(Table { + id: 1, + table_name: Some(TableName { + catalog_name: DEFAULT_CATALOG_NAME.to_string(), + schema_name: DEFAULT_SCHEMA_NAME.to_string(), + table_name: table.to_string(), + }), + table_schema: vec![], + }), + region_routes, + }; + let value = TableRouteValue { + peers, + table_route: Some(table_route), + }; + put_table_route_value(kv_store, &key, value.clone()) + .await + .unwrap(); + (key, value) + } + + pub(crate) fn new_region_route( + region_number: u64, + peers: &[Peer], + leader_node: u64, + ) -> RegionRoute { + let region = Region 
{ + id: region_number, + name: "".to_string(), + partition: None, + attrs: HashMap::new(), + }; + let leader_peer_index = peers + .iter() + .enumerate() + .find_map(|(i, peer)| { + if peer.id == leader_node { + Some(i as u64) + } else { + None + } + }) + .unwrap(); + RegionRoute { + region: Some(region), + leader_peer_index, + follower_peer_indexes: vec![], + } + } + + #[tokio::test] + async fn test_put_and_get_table_global_value() { + let kv_store = Arc::new(MemStore::new()) as _; + + let key = TableGlobalKey { + catalog_name: "not_exist_catalog".to_string(), + schema_name: "not_exist_schema".to_string(), + table_name: "not_exist_table".to_string(), + }; + assert!(get_table_global_value(&kv_store, &key) + .await + .unwrap() + .is_none()); + + let (key, value) = prepare_table_global_value(&kv_store, "my_table").await; + let actual = get_table_global_value(&kv_store, &key) + .await + .unwrap() + .unwrap(); + assert_eq!(actual, value); + } + + #[tokio::test] + async fn test_put_and_get_table_route_value() { + let kv_store = Arc::new(MemStore::new()) as _; + + let key = TableRouteKey { + table_id: 1, + catalog_name: "not_exist_catalog", + schema_name: "not_exist_schema", + table_name: "not_exist_table", + }; + assert!(matches!( + get_table_route_value(&kv_store, &key).await.unwrap_err(), + error::Error::TableRouteNotFound { .. } + )); + + let (key, value) = prepare_table_route_value(&kv_store, "my_table").await; + let actual = get_table_route_value(&kv_store, &key).await.unwrap(); + assert_eq!(actual, value); + } +} diff --git a/src/meta-srv/src/test_util.rs b/src/meta-srv/src/test_util.rs new file mode 100644 index 0000000000..c3a3a575d2 --- /dev/null +++ b/src/meta-srv/src/test_util.rs @@ -0,0 +1,52 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
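(Aside, not part of the patch: the `regions_id_map` bookkeeping in `update_table_global_value` earlier in this diff boils down to moving one region number from the failed Datanode's entry to the candidate's entry, dropping the source entry when it becomes empty. A minimal sketch of that manipulation with plain std collections; the helper name `move_region` is invented here:)

use std::collections::HashMap;

// Move `failed_region` from `failed_datanode`'s region list to `candidate`'s list.
fn move_region(
    regions_id_map: &mut HashMap<u64, Vec<u32>>,
    failed_datanode: u64,
    candidate: u64,
    failed_region: u32,
) {
    if let Some(mut regions) = regions_id_map.remove(&failed_datanode) {
        regions.retain(|r| *r != failed_region);
        // Keep the entry only if the failed Datanode still holds other regions.
        if !regions.is_empty() {
            regions_id_map.insert(failed_datanode, regions);
        }
    }
    regions_id_map
        .entry(candidate)
        .or_insert_with(Vec::new)
        .push(failed_region);
}

fn main() {
    // Same starting distribution as the test fixtures above: 1 => [1, 2], 2 => [3], 3 => [4].
    let mut map = HashMap::from([(1, vec![1, 2]), (2, vec![3]), (3, vec![4])]);
    move_region(&mut map, 2, 4, 3);
    assert_eq!(map.get(&2), None); // Datanode 2 held nothing else, so its entry is gone.
    assert_eq!(map.get(&4), Some(&vec![3]));
}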
+ +use std::sync::Arc; + +use common_procedure::local::{LocalManager, ManagerConfig}; + +use crate::handler::{HeartbeatMailbox, Pushers}; +use crate::metasrv::SelectorContext; +use crate::procedure::region_failover::RegionFailoverManager; +use crate::procedure::state_store::MetaStateStore; +use crate::selector::lease_based::LeaseBasedSelector; +use crate::sequence::Sequence; +use crate::service::store::memory::MemStore; + +pub(crate) fn create_region_failover_manager() -> Arc { + let kv_store = Arc::new(MemStore::new()); + + let pushers = Pushers::default(); + let mailbox_sequence = Sequence::new("test_heartbeat_mailbox", 0, 100, kv_store.clone()); + let mailbox = HeartbeatMailbox::create(pushers, mailbox_sequence); + + let state_store = Arc::new(MetaStateStore::new(kv_store.clone())); + let procedure_manager = Arc::new(LocalManager::new(ManagerConfig::default(), state_store)); + + let selector = Arc::new(LeaseBasedSelector); + let selector_ctx = SelectorContext { + datanode_lease_secs: 10, + server_addr: "127.0.0.1:3002".to_string(), + kv_store, + catalog: None, + schema: None, + }; + + Arc::new(RegionFailoverManager::new( + mailbox, + procedure_manager, + selector, + selector_ctx, + )) +} diff --git a/src/partition/Cargo.toml b/src/partition/Cargo.toml index d47c85b0bb..fc8839e45f 100644 --- a/src/partition/Cargo.toml +++ b/src/partition/Cargo.toml @@ -10,6 +10,7 @@ license.workspace = true common-catalog = { path = "../common/catalog" } common-error = { path = "../common/error" } common-query = { path = "../common/query" } +common-meta = { path = "../common/meta" } datafusion-common.workspace = true datafusion-expr.workspace = true datafusion.workspace = true diff --git a/src/partition/src/manager.rs b/src/partition/src/manager.rs index 3e3fcd64c2..3ceeb61f20 100644 --- a/src/partition/src/manager.rs +++ b/src/partition/src/manager.rs @@ -15,11 +15,13 @@ use std::collections::{HashMap, HashSet}; use std::sync::Arc; +use common_meta::peer::Peer; +use common_meta::router::TableRoute; +use common_meta::table_name::TableName; use common_query::prelude::Expr; use datafusion_expr::{BinaryExpr, Expr as DfExpr, Operator}; use datatypes::prelude::Value; use datatypes::schema::Schema; -use meta_client::rpc::{Peer, TableName, TableRoute}; use snafu::{ensure, OptionExt, ResultExt}; use store_api::storage::{RegionId, RegionNumber}; use table::requests::InsertRequest; diff --git a/src/partition/src/partition.rs b/src/partition/src/partition.rs index 0b107fa6b7..583fb87de2 100644 --- a/src/partition/src/partition.rs +++ b/src/partition/src/partition.rs @@ -16,9 +16,9 @@ use std::any::Any; use std::fmt::Debug; use std::sync::Arc; +use common_meta::router::Partition as MetaPartition; use datafusion_expr::Operator; use datatypes::prelude::Value; -use meta_client::rpc::Partition as MetaPartition; use serde::{Deserialize, Serialize}; use snafu::ResultExt; use store_api::storage::RegionNumber; diff --git a/src/partition/src/route.rs b/src/partition/src/route.rs index 0c91ecba5f..5dcbc2e730 100644 --- a/src/partition/src/route.rs +++ b/src/partition/src/route.rs @@ -15,8 +15,9 @@ use std::sync::Arc; use std::time::Duration; +use common_meta::router::{RouteRequest, TableRoute}; +use common_meta::table_name::TableName; use meta_client::client::MetaClient; -use meta_client::rpc::{RouteRequest, TableName, TableRoute}; use moka::future::{Cache, CacheBuilder}; use snafu::{ensure, ResultExt}; diff --git a/tests-integration/Cargo.toml b/tests-integration/Cargo.toml index 6ba5644f4c..1d6a4739f1 100644 --- 
a/tests-integration/Cargo.toml +++ b/tests-integration/Cargo.toml @@ -13,10 +13,13 @@ axum = "0.6" axum-test-helper = { git = "https://github.com/sunng87/axum-test-helper.git", branch = "patch-1" } catalog = { path = "../src/catalog" } client = { path = "../src/client" } +common-base = { path = "../src/common/base" } common-catalog = { path = "../src/common/catalog" } common-error = { path = "../src/common/error" } common-grpc = { path = "../src/common/grpc" } +common-meta = { path = "../src/common/meta" } common-query = { path = "../src/common/query" } +common-recordbatch = { path = "../src/common/recordbatch" } common-runtime = { path = "../src/common/runtime" } common-telemetry = { path = "../src/common/telemetry" } common-test-util = { path = "../src/common/test-util" } @@ -24,38 +27,37 @@ datanode = { path = "../src/datanode" } datatypes = { path = "../src/datatypes" } dotenv = "0.15" frontend = { path = "../src/frontend", features = ["testing"] } +futures.workspace = true +meta-client = { path = "../src/meta-client" } +meta-srv = { path = "../src/meta-srv" } mito = { path = "../src/mito", features = ["test"] } object-store = { path = "../src/object-store" } once_cell = "1.16" +query = { path = "../src/query" } rand.workspace = true +rstest = "0.17" +rstest_reuse = "0.5" secrecy = "0.8" serde.workspace = true serde_json = "1.0" servers = { path = "../src/servers" } +session = { path = "../src/session" } snafu.workspace = true sql = { path = "../src/sql" } table = { path = "../src/table" } tempfile.workspace = true tokio.workspace = true +tonic.workspace = true +tower = "0.4" uuid.workspace = true [dev-dependencies] -common-base = { path = "../src/common/base" } -common-recordbatch = { path = "../src/common/recordbatch" } +common-procedure = { path = "../src/common/procedure" } datafusion.workspace = true datafusion-expr.workspace = true -futures.workspace = true itertools = "0.10" -meta-client = { path = "../src/meta-client" } -meta-srv = { path = "../src/meta-srv" } partition = { path = "../src/partition" } paste.workspace = true prost.workspace = true -query = { path = "../src/query" } script = { path = "../src/script" } -session = { path = "../src/session" } store-api = { path = "../src/store-api" } -tonic.workspace = true -tower = "0.4" -rstest = "0.17" -rstest_reuse = "0.5" diff --git a/tests-integration/src/catalog.rs b/tests-integration/src/catalog.rs index 5e3187ac2c..b05e1045a5 100644 --- a/tests-integration/src/catalog.rs +++ b/tests-integration/src/catalog.rs @@ -14,7 +14,7 @@ #[cfg(test)] mod tests { - use catalog::{CatalogManager, RegisterSystemTableRequest}; + use catalog::RegisterSystemTableRequest; use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, MITO_ENGINE}; use script::table::{build_scripts_schema, SCRIPTS_TABLE_NAME}; use table::requests::{CreateTableRequest, TableOptions}; @@ -42,7 +42,8 @@ mod tests { }; let result = instance - .catalog_manager + .frontend() + .catalog_manager() .register_system_table(RegisterSystemTableRequest { create_table_request: request, open_hook: None, @@ -52,7 +53,8 @@ mod tests { assert!( instance - .catalog_manager + .frontend() + .catalog_manager() .table(catalog_name, schema_name, table_name) .await .unwrap() @@ -61,7 +63,7 @@ mod tests { ); let mut actually_created_table_in_datanode = 0; - for datanode in instance.datanodes.values() { + for datanode in instance.datanodes().values() { if datanode .catalog_manager() .table(catalog_name, schema_name, table_name) diff --git a/tests-integration/src/cluster.rs 
b/tests-integration/src/cluster.rs new file mode 100644 index 0000000000..3ce62dacf2 --- /dev/null +++ b/tests-integration/src/cluster.rs @@ -0,0 +1,297 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::sync::Arc; +use std::time::Duration; + +use api::v1::meta::Role; +use catalog::remote::RemoteCatalogManager; +use client::Client; +use common_base::Plugins; +use common_grpc::channel_manager::ChannelManager; +use common_meta::peer::Peer; +use common_meta::DatanodeId; +use common_runtime::Builder as RuntimeBuilder; +use common_test_util::temp_dir::create_temp_dir; +use datanode::datanode::{DatanodeOptions, ObjectStoreConfig}; +use datanode::instance::Instance as DatanodeInstance; +use frontend::datanode::DatanodeClients; +use frontend::instance::Instance as FrontendInstance; +use meta_client::client::MetaClientBuilder; +use meta_srv::metasrv::{MetaSrv, MetaSrvOptions}; +use meta_srv::mocks::MockInfo; +use meta_srv::service::store::kv::KvStoreRef; +use meta_srv::service::store::memory::MemStore; +use servers::grpc::GrpcServer; +use servers::query_handler::grpc::ServerGrpcQueryHandlerAdaptor; +use servers::Mode; +use tonic::transport::Server; +use tower::service_fn; + +use crate::test_util::{ + create_datanode_opts, create_tmp_dir_and_datanode_opts, StorageGuard, StorageType, WalGuard, +}; + +pub struct GreptimeDbCluster { + pub storage_guards: Vec, + _wal_guards: Vec, + + pub datanode_instances: HashMap>, + pub kv_store: KvStoreRef, + pub meta_srv: MetaSrv, + pub frontend: Arc, +} + +pub struct GreptimeDbClusterBuilder { + cluster_name: String, + kv_store: KvStoreRef, + store_config: Option, + datanodes: Option, +} + +impl GreptimeDbClusterBuilder { + pub fn new(cluster_name: &str) -> Self { + Self { + cluster_name: cluster_name.to_string(), + kv_store: Arc::new(MemStore::default()), + store_config: None, + datanodes: None, + } + } + + pub fn with_store_config(mut self, store_config: ObjectStoreConfig) -> Self { + self.store_config = Some(store_config); + self + } + + pub fn with_datanodes(mut self, datanodes: u32) -> Self { + self.datanodes = Some(datanodes); + self + } + + pub async fn build(self) -> GreptimeDbCluster { + let datanodes = self.datanodes.unwrap_or(4); + + let meta_srv = self.build_metasrv().await; + + let (datanode_instances, storage_guards, wal_guards) = + self.build_datanodes(meta_srv.clone(), datanodes).await; + + let datanode_clients = build_datanode_clients(&datanode_instances, datanodes).await; + + self.wait_datanodes_alive(datanodes).await; + + let frontend = self + .build_frontend(meta_srv.clone(), datanode_clients) + .await; + + GreptimeDbCluster { + storage_guards, + _wal_guards: wal_guards, + datanode_instances, + kv_store: self.kv_store.clone(), + meta_srv: meta_srv.meta_srv, + frontend, + } + } + + async fn build_metasrv(&self) -> MockInfo { + meta_srv::mocks::mock(MetaSrvOptions::default(), self.kv_store.clone(), None).await + } + + async fn build_datanodes( + &self, + 
meta_srv: MockInfo, + datanodes: u32, + ) -> ( + HashMap>, + Vec, + Vec, + ) { + let mut instances = HashMap::with_capacity(datanodes as usize); + let mut storage_guards = Vec::with_capacity(datanodes as usize); + let mut wal_guards = Vec::with_capacity(datanodes as usize); + + for i in 0..datanodes { + let datanode_id = i as u64 + 1; + + let mut opts = if let Some(store_config) = &self.store_config { + let wal_tmp_dir = create_temp_dir(&format!("gt_wal_{}", &self.cluster_name)); + let wal_dir = wal_tmp_dir.path().to_str().unwrap().to_string(); + wal_guards.push(WalGuard(wal_tmp_dir)); + + create_datanode_opts(store_config.clone(), wal_dir) + } else { + let (opts, guard) = create_tmp_dir_and_datanode_opts( + StorageType::File, + &format!("{}-dn-{}", self.cluster_name, datanode_id), + ); + + storage_guards.push(guard.storage_guard); + wal_guards.push(guard.wal_guard); + + opts + }; + opts.node_id = Some(datanode_id); + opts.mode = Mode::Distributed; + + let dn_instance = self.create_datanode(&opts, meta_srv.clone()).await; + + instances.insert(datanode_id, dn_instance.clone()); + } + (instances, storage_guards, wal_guards) + } + + async fn wait_datanodes_alive(&self, expected_datanodes: u32) { + let kv_store = self.kv_store(); + for _ in 0..10 { + let alive_datanodes = meta_srv::lease::alive_datanodes(1000, &kv_store, |_, _| true) + .await + .unwrap() + .len() as u32; + if alive_datanodes == expected_datanodes { + return; + } + tokio::time::sleep(Duration::from_secs(1)).await + } + panic!("Some Datanodes are not alive in 10 seconds!") + } + + async fn create_datanode( + &self, + opts: &DatanodeOptions, + meta_srv: MockInfo, + ) -> Arc { + let instance = Arc::new( + DatanodeInstance::with_mock_meta_server(opts, meta_srv) + .await + .unwrap(), + ); + instance.start().await.unwrap(); + + // create another catalog and schema for testing + let _ = instance + .catalog_manager() + .as_any() + .downcast_ref::() + .unwrap() + .create_catalog_and_schema("another_catalog", "another_schema") + .await + .unwrap(); + instance + } + + async fn build_frontend( + &self, + meta_srv: MockInfo, + datanode_clients: Arc, + ) -> Arc { + let mut meta_client = MetaClientBuilder::new(1000, 0, Role::Frontend) + .enable_router() + .enable_store() + .channel_manager(meta_srv.channel_manager) + .build(); + meta_client.start(&[&meta_srv.server_addr]).await.unwrap(); + let meta_client = Arc::new(meta_client); + + Arc::new( + FrontendInstance::try_new_distributed_with( + meta_client, + datanode_clients, + Arc::new(Plugins::default()), + ) + .await + .unwrap(), + ) + } + + fn kv_store(&self) -> KvStoreRef { + self.kv_store.clone() + } +} + +async fn build_datanode_clients( + instances: &HashMap>, + datanodes: u32, +) -> Arc { + let clients = Arc::new(DatanodeClients::default()); + for i in 0..datanodes { + let datanode_id = i as u64 + 1; + let instance = instances.get(&datanode_id).cloned().unwrap(); + let (addr, client) = create_datanode_client(instance).await; + clients + .insert_client(Peer::new(datanode_id, addr), client) + .await; + } + clients +} + +async fn create_datanode_client(datanode_instance: Arc) -> (String, Client) { + let (client, server) = tokio::io::duplex(1024); + + let runtime = Arc::new( + RuntimeBuilder::default() + .worker_threads(2) + .thread_name("grpc-handlers") + .build() + .unwrap(), + ); + + // create a mock datanode grpc service, see example here: + // https://github.com/hyperium/tonic/blob/master/examples/src/mock/mock.rs + let grpc_server = GrpcServer::new( + 
ServerGrpcQueryHandlerAdaptor::arc(datanode_instance), + None, + None, + runtime, + ); + tokio::spawn(async move { + Server::builder() + .add_service(grpc_server.create_flight_service()) + .add_service(grpc_server.create_database_service()) + .serve_with_incoming(futures::stream::iter(vec![Ok::<_, std::io::Error>(server)])) + .await + }); + + // Move client to an option so we can _move_ the inner value + // on the first attempt to connect. All other attempts will fail. + let mut client = Some(client); + // "127.0.0.1:3001" is just a placeholder, does not actually connect to it. + let addr = "127.0.0.1:3001"; + let channel_manager = ChannelManager::new(); + channel_manager + .reset_with_connector( + addr, + service_fn(move |_| { + let client = client.take(); + + async move { + if let Some(client) = client { + Ok(client) + } else { + Err(std::io::Error::new( + std::io::ErrorKind::Other, + "Client already taken", + )) + } + } + }), + ) + .unwrap(); + ( + addr.to_string(), + Client::with_manager_and_urls(channel_manager, vec![addr]), + ) +} diff --git a/tests-integration/src/grpc.rs b/tests-integration/src/grpc.rs index 01252ec733..1fd6a49476 100644 --- a/tests-integration/src/grpc.rs +++ b/tests-integration/src/grpc.rs @@ -43,9 +43,8 @@ mod test { async fn test_distributed_handle_ddl_request() { let instance = tests::create_distributed_instance("test_distributed_handle_ddl_request").await; - let frontend = &instance.frontend; - test_handle_ddl_request(frontend.as_ref()).await; + test_handle_ddl_request(instance.frontend().as_ref()).await; verify_table_is_dropped(&instance).await; } @@ -158,7 +157,7 @@ mod test { } async fn verify_table_is_dropped(instance: &MockDistributedInstance) { - for (_, dn) in instance.datanodes.iter() { + for (_, dn) in instance.datanodes().iter() { assert!(dn .catalog_manager() .table( @@ -178,7 +177,8 @@ mod test { let instance = tests::create_distributed_instance("test_distributed_insert_delete_and_query").await; - let frontend = instance.frontend.as_ref(); + let frontend = instance.frontend(); + let frontend = frontend.as_ref(); let table_name = "my_dist_table"; let sql = format!( @@ -296,7 +296,8 @@ CREATE TABLE {table_name} ( let instance = tests::create_distributed_instance("test_distributed_flush_table").await; let data_tmp_dirs = instance.data_tmp_dirs(); - let frontend = instance.frontend.as_ref(); + let frontend = instance.frontend(); + let frontend = frontend.as_ref(); let table_name = "my_dist_table"; let sql = format!( @@ -320,8 +321,7 @@ CREATE TABLE {table_name} ( // Wait for previous task finished flush_table(frontend, "greptime", "public", table_name, None).await; - let table = instance - .frontend + let table = frontend .catalog_manager() .table("greptime", "public", table_name) .await @@ -593,7 +593,7 @@ CREATE TABLE {table_name} ( expected_distribution: HashMap, ) { let table = instance - .frontend + .frontend() .catalog_manager() .table("greptime", "public", table_name) .await @@ -621,7 +621,7 @@ CREATE TABLE {table_name} ( "SELECT ts, a, b FROM {table_name} ORDER BY ts" )) .unwrap(); - let dn = instance.datanodes.get(dn).unwrap(); + let dn = instance.datanodes().get(dn).unwrap(); let engine = dn.query_engine(); let plan = engine .planner() diff --git a/tests-integration/src/influxdb.rs b/tests-integration/src/influxdb.rs index e77880897f..72860fd6df 100644 --- a/tests-integration/src/influxdb.rs +++ b/tests-integration/src/influxdb.rs @@ -39,9 +39,7 @@ mod test { async fn test_distributed_put_influxdb_lines() { let instance = 
tests::create_distributed_instance("test_distributed_put_influxdb_lines").await; - let instance = &instance.frontend; - - test_put_influxdb_lines(instance).await; + test_put_influxdb_lines(&instance.frontend()).await; } async fn test_put_influxdb_lines(instance: &Arc) { diff --git a/tests-integration/src/instance.rs b/tests-integration/src/instance.rs index d4876db6d7..2fd3c3b6d3 100644 --- a/tests-integration/src/instance.rs +++ b/tests-integration/src/instance.rs @@ -60,7 +60,8 @@ mod tests { #[tokio::test(flavor = "multi_thread")] async fn test_distributed_exec_sql() { let distributed = tests::create_distributed_instance("test_distributed_exec_sql").await; - let instance = distributed.frontend.as_ref(); + let frontend = distributed.frontend(); + let instance = frontend.as_ref(); let sql = r#" CREATE TABLE demo( @@ -177,7 +178,7 @@ mod tests { expected_distribution: HashMap, ) { let table = instance - .frontend + .frontend() .catalog_manager() .table("greptime", "public", "demo") .await @@ -202,7 +203,7 @@ mod tests { let stmt = QueryLanguageParser::parse_sql("SELECT ts, host FROM demo ORDER BY ts").unwrap(); for (region, dn) in region_to_dn_map.iter() { - let dn = instance.datanodes.get(dn).unwrap(); + let dn = instance.datanodes().get(dn).unwrap(); let engine = dn.query_engine(); let plan = engine .planner() @@ -227,7 +228,7 @@ mod tests { } async fn verify_table_is_dropped(instance: &MockDistributedInstance) { - for (_, dn) in instance.datanodes.iter() { + for (_, dn) in instance.datanodes().iter() { assert!(dn .catalog_manager() .table("greptime", "public", "demo") diff --git a/tests-integration/src/lib.rs b/tests-integration/src/lib.rs index 3a3642b49a..c3c3880902 100644 --- a/tests-integration/src/lib.rs +++ b/tests-integration/src/lib.rs @@ -13,6 +13,7 @@ // limitations under the License. mod catalog; +pub mod cluster; mod grpc; mod influxdb; mod instance; @@ -21,6 +22,7 @@ mod prometheus; mod table; pub mod test_util; +// TODO(LFC): Refactor: move instance structs out of mod "tests", like the `GreptimeDbCluster`. 
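(Aside, not part of the patch: a sketch of how an integration test might drive the new `GreptimeDbClusterBuilder` from cluster.rs above. The test name and the three-datanode count are arbitrary, and the `tests_integration::cluster` import path assumes the test lives outside that module; when no count is given the builder defaults to four datanodes.)

use tests_integration::cluster::GreptimeDbClusterBuilder;

#[tokio::test(flavor = "multi_thread")]
async fn builds_a_small_cluster() {
    let cluster = GreptimeDbClusterBuilder::new("builds_a_small_cluster")
        .with_datanodes(3)
        .build()
        .await;

    // One datanode instance per requested datanode, keyed by its id (1..=3).
    assert_eq!(cluster.datanode_instances.len(), 3);
    assert!(cluster.datanode_instances.contains_key(&1));
}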
#[cfg(test)] mod tests; diff --git a/tests-integration/src/opentsdb.rs b/tests-integration/src/opentsdb.rs index b61fd88ce1..6114e19926 100644 --- a/tests-integration/src/opentsdb.rs +++ b/tests-integration/src/opentsdb.rs @@ -38,9 +38,7 @@ mod tests { #[tokio::test(flavor = "multi_thread")] async fn test_distributed_exec() { let distributed = tests::create_distributed_instance("test_distributed_exec").await; - let instance = &distributed.frontend; - - test_exec(instance).await; + test_exec(&distributed.frontend()).await; } async fn test_exec(instance: &Arc) { diff --git a/tests-integration/src/prometheus.rs b/tests-integration/src/prometheus.rs index 6bd20c70fc..3c53d80767 100644 --- a/tests-integration/src/prometheus.rs +++ b/tests-integration/src/prometheus.rs @@ -43,9 +43,7 @@ mod tests { async fn test_distributed_prometheus_remote_rw() { let distributed = tests::create_distributed_instance("test_distributed_prometheus_remote_rw").await; - let instance = &distributed.frontend; - - test_prometheus_remote_rw(instance).await; + test_prometheus_remote_rw(&distributed.frontend()).await; } async fn test_prometheus_remote_rw(instance: &Arc) { diff --git a/tests-integration/src/table.rs b/tests-integration/src/table.rs index 42e25ec9a4..395f5ef5cd 100644 --- a/tests-integration/src/table.rs +++ b/tests-integration/src/table.rs @@ -19,6 +19,7 @@ mod test { use api::v1::column::SemanticType; use api::v1::{column, Column, ColumnDataType, InsertRequest as GrpcInsertRequest}; + use common_meta::table_name::TableName; use common_query::logical_plan::Expr; use common_query::physical_plan::DfPhysicalPlanAdapter; use common_query::DfPhysicalPlan; @@ -28,20 +29,17 @@ mod test { use datafusion::physical_plan::expressions::{col as physical_col, PhysicalSortExpr}; use datafusion::physical_plan::sorts::sort::SortExec; use datafusion::prelude::SessionContext; - use datafusion::sql::sqlparser; use datafusion_expr::expr_fn::{and, binary_expr, col}; use datafusion_expr::{lit, Operator}; use datanode::instance::Instance; use datatypes::arrow::compute::SortOptions; use datatypes::prelude::ConcreteDataType; use datatypes::schema::{ColumnSchema, Schema}; - use frontend::expr_factory; + use frontend::catalog::FrontendCatalogManager; use frontend::table::DistTable; use itertools::Itertools; - use meta_client::rpc::TableName; + use servers::query_handler::sql::SqlQueryHandler; use session::context::QueryContext; - use sql::parser::ParserContext; - use sql::statements::statement::Statement; use store_api::storage::RegionNumber; use table::metadata::{TableInfoBuilder, TableMetaBuilder}; use table::TableRef; @@ -215,10 +213,12 @@ mod test { let schema = Arc::new(Schema::new(column_schemas.clone())); let instance = crate::tests::create_distributed_instance(test_name).await; - let dist_instance = &instance.dist_instance; - let datanode_instances = instance.datanodes; - - let catalog_manager = dist_instance.catalog_manager(); + let frontend = instance.frontend(); + let catalog_manager = frontend + .catalog_manager() + .as_any() + .downcast_ref::() + .unwrap(); let partition_manager = catalog_manager.partition_manager(); let datanode_clients = catalog_manager.datanode_clients(); @@ -239,20 +239,10 @@ mod test { ) ENGINE=mito"; - let create_table = - match ParserContext::create_with_dialect(sql, &sqlparser::dialect::GenericDialect {}) - .unwrap() - .pop() - .unwrap() - { - Statement::CreateTable(c) => c, - _ => unreachable!(), - }; - - let mut expr = expr_factory::create_to_expr(&create_table, QueryContext::arc()).unwrap(); - 
let _result = dist_instance - .create_table(&mut expr, create_table.partitions) + let _result = frontend + .do_query(sql, QueryContext::arc()) .await + .remove(0) .unwrap(); let table_route = partition_manager @@ -276,7 +266,7 @@ mod test { ]; for (region_number, numbers) in regional_numbers { let datanode_id = *region_to_datanode_mapping.get(®ion_number).unwrap(); - let instance = datanode_instances.get(&datanode_id).unwrap().clone(); + let instance = instance.datanodes().get(&datanode_id).unwrap().clone(); let start_ts = global_start_ts; global_start_ts += numbers.len() as i64; diff --git a/tests-integration/src/test_util.rs b/tests-integration/src/test_util.rs index a63c2fb7aa..05bd12931e 100644 --- a/tests-integration/src/test_util.rs +++ b/tests-integration/src/test_util.rs @@ -104,10 +104,10 @@ fn s3_test_config() -> S3Config { } } -fn get_test_store_config( +pub fn get_test_store_config( store_type: &StorageType, name: &str, -) -> (ObjectStoreConfig, Option) { +) -> (ObjectStoreConfig, TempDirGuard) { let _ = dotenv::dotenv(); match store_type { @@ -133,10 +133,7 @@ fn get_test_store_config( let store = ObjectStore::new(builder).unwrap().finish(); - ( - config, - Some(TempDirGuard::Oss(TempFolder::new(&store, "/"))), - ) + (config, TempDirGuard::Oss(TempFolder::new(&store, "/"))) } StorageType::S3 | StorageType::S3WithCache => { let mut s3_config = s3_test_config(); @@ -163,7 +160,7 @@ fn get_test_store_config( let store = ObjectStore::new(builder).unwrap().finish(); - (config, Some(TempDirGuard::S3(TempFolder::new(&store, "/")))) + (config, TempDirGuard::S3(TempFolder::new(&store, "/"))) } StorageType::File => { let data_tmp_dir = create_temp_dir(&format!("gt_data_{name}")); @@ -172,32 +169,31 @@ fn get_test_store_config( ObjectStoreConfig::File(FileConfig { data_dir: data_tmp_dir.path().to_str().unwrap().to_string(), }), - Some(TempDirGuard::File(data_tmp_dir)), + TempDirGuard::File(data_tmp_dir), ) } } } -enum TempDirGuard { +pub enum TempDirGuard { File(TempDir), S3(TempFolder), Oss(TempFolder), } -/// Create a tmp dir(will be deleted once it goes out of scope.) and a default `DatanodeOptions`, -/// Only for test. 
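Note on the table.rs change above: the test now goes through the trait object returned by `frontend.catalog_manager()` and recovers the concrete manager (the imported `FrontendCatalogManager`) through the usual `as_any` downcast idiom. A minimal, self-contained sketch of that idiom follows; the trait and struct names here are illustrative stand-ins, not the project's real catalog types.

use std::any::Any;
use std::sync::Arc;

trait CatalogManager: Send + Sync {
    fn as_any(&self) -> &dyn Any;
}

struct FrontendCatalogManagerStub {
    datanode_count: usize,
}

impl CatalogManager for FrontendCatalogManagerStub {
    fn as_any(&self) -> &dyn Any {
        self
    }
}

fn main() {
    let manager: Arc<dyn CatalogManager> =
        Arc::new(FrontendCatalogManagerStub { datanode_count: 4 });
    // Same shape as the test: go through the trait object, then downcast to the
    // concrete type to reach fields the trait does not expose.
    let concrete = manager
        .as_any()
        .downcast_ref::<FrontendCatalogManagerStub>()
        .expect("not a FrontendCatalogManagerStub");
    assert_eq!(concrete.datanode_count, 4);
}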
pub struct TestGuard { - _wal_tmp_dir: TempDir, - data_tmp_dir: Option, + pub wal_guard: WalGuard, + pub storage_guard: StorageGuard, } +pub struct WalGuard(pub TempDir); + +pub struct StorageGuard(pub TempDirGuard); + impl TestGuard { pub async fn remove_all(&mut self) { - if let Some(TempDirGuard::S3(mut guard)) = self.data_tmp_dir.take() { - guard.remove_all().await.unwrap(); - } - if let Some(TempDirGuard::Oss(mut guard)) = self.data_tmp_dir.take() { - guard.remove_all().await.unwrap(); + if let TempDirGuard::S3(guard) | TempDirGuard::Oss(guard) = &mut self.storage_guard.0 { + guard.remove_all().await.unwrap() } } } @@ -207,12 +203,24 @@ pub fn create_tmp_dir_and_datanode_opts( name: &str, ) -> (DatanodeOptions, TestGuard) { let wal_tmp_dir = create_temp_dir(&format!("gt_wal_{name}")); + let wal_dir = wal_tmp_dir.path().to_str().unwrap().to_string(); let (store, data_tmp_dir) = get_test_store_config(&store_type, name); + let opts = create_datanode_opts(store, wal_dir); - let opts = DatanodeOptions { + ( + opts, + TestGuard { + wal_guard: WalGuard(wal_tmp_dir), + storage_guard: StorageGuard(data_tmp_dir), + }, + ) +} + +pub fn create_datanode_opts(store: ObjectStoreConfig, wal_dir: String) -> DatanodeOptions { + DatanodeOptions { wal: WalConfig { - dir: wal_tmp_dir.path().to_str().unwrap().to_string(), + dir: wal_dir, ..Default::default() }, storage: StorageConfig { @@ -222,14 +230,7 @@ pub fn create_tmp_dir_and_datanode_opts( mode: Mode::Standalone, procedure: ProcedureConfig::default(), ..Default::default() - }; - ( - opts, - TestGuard { - _wal_tmp_dir: wal_tmp_dir, - data_tmp_dir, - }, - ) + } } pub async fn create_test_table( diff --git a/tests-integration/src/tests.rs b/tests-integration/src/tests.rs index f58cfca5bb..41bec41949 100644 --- a/tests-integration/src/tests.rs +++ b/tests-integration/src/tests.rs @@ -18,72 +18,53 @@ mod test_util; use std::collections::HashMap; use std::sync::Arc; -use std::time::Duration; -use api::v1::meta::Role; use catalog::local::{MemoryCatalogProvider, MemorySchemaProvider}; -use catalog::remote::{MetaKvBackend, RemoteCatalogManager}; -use client::Client; -use common_grpc::channel_manager::ChannelManager; -use common_runtime::Builder as RuntimeBuilder; -use common_test_util::temp_dir::{create_temp_dir, TempDir}; -use datanode::datanode::{ - DatanodeOptions, FileConfig, ObjectStoreConfig, ProcedureConfig, StorageConfig, WalConfig, -}; +use common_test_util::temp_dir::TempDir; use datanode::instance::Instance as DatanodeInstance; -use frontend::catalog::FrontendCatalogManager; -use frontend::datanode::DatanodeClients; -use frontend::instance::distributed::DistInstance; use frontend::instance::Instance; -use meta_client::client::MetaClientBuilder; -use meta_client::rpc::Peer; -use meta_srv::metasrv::MetaSrvOptions; -use meta_srv::mocks::MockInfo; -use meta_srv::service::store::kv::KvStoreRef; -use meta_srv::service::store::memory::MemStore; -use partition::manager::PartitionRuleManager; -use partition::route::TableRoutes; -use servers::grpc::GrpcServer; -use servers::query_handler::grpc::ServerGrpcQueryHandlerAdaptor; -use servers::Mode; use table::engine::{region_name, table_dir}; -use tonic::transport::Server; -use tower::service_fn; -/// Guard against the `TempDir`s that used in unit tests. -/// (The `TempDir` will be deleted once it goes out of scope.) 
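Note on the rewritten `remove_all` above: it leans on an or-pattern in `if let`; the S3 and Oss variants both carry a `TempFolder`, so a single binding serves both, and the File variant simply falls through to be cleaned up by `TempDir`'s own drop. A small sketch of the same shape, assuming the tokio runtime these tests already use; `TempFolder` here is a stand-in, not the real object-store helper.

struct TempFolder {
    path: String,
}

impl TempFolder {
    async fn remove_all(&mut self) -> std::io::Result<()> {
        // A real implementation would delete the remote objects under `path`.
        println!("removing everything under {}", self.path);
        Ok(())
    }
}

enum TempDirGuardSketch {
    File(std::path::PathBuf),
    S3(TempFolder),
    Oss(TempFolder),
}

async fn cleanup(guard: &mut TempDirGuardSketch) {
    // One arm covers both remote variants; local temp dirs need no explicit cleanup.
    if let TempDirGuardSketch::S3(folder) | TempDirGuardSketch::Oss(folder) = guard {
        folder.remove_all().await.unwrap();
    }
}

#[tokio::main]
async fn main() {
    let mut remote = TempDirGuardSketch::Oss(TempFolder {
        path: "/test-bucket/tmp".to_string(),
    });
    cleanup(&mut remote).await;

    let mut local = TempDirGuardSketch::File(std::path::PathBuf::from("/tmp/gt_data_test"));
    cleanup(&mut local).await; // no-op for file-backed storage
}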
-pub struct TestGuard { - _wal_tmp_dir: TempDir, - _data_tmp_dir: TempDir, -} +use crate::cluster::{GreptimeDbCluster, GreptimeDbClusterBuilder}; +use crate::test_util::{create_tmp_dir_and_datanode_opts, StorageType, TempDirGuard, TestGuard}; -pub(crate) struct MockDistributedInstance { - pub(crate) frontend: Arc, - pub(crate) dist_instance: Arc, - pub(crate) datanodes: HashMap>, - pub(crate) catalog_manager: Arc, - _guards: Vec, -} +pub struct MockDistributedInstance(GreptimeDbCluster); impl MockDistributedInstance { pub fn data_tmp_dirs(&self) -> Vec<&TempDir> { - self._guards.iter().map(|g| &g._data_tmp_dir).collect() + self.0 + .storage_guards + .iter() + .map(|g| { + let TempDirGuard::File(dir) = &g.0 else { unreachable!() }; + dir + }) + .collect() + } + + pub fn frontend(&self) -> Arc { + self.0.frontend.clone() + } + + pub fn datanodes(&self) -> &HashMap> { + &self.0.datanode_instances } } -pub(crate) struct MockStandaloneInstance { - pub(crate) instance: Arc, +pub struct MockStandaloneInstance { + pub instance: Arc, _guard: TestGuard, } impl MockStandaloneInstance { pub fn data_tmp_dir(&self) -> &TempDir { - &self._guard._data_tmp_dir + let TempDirGuard::File(dir) = &self._guard.storage_guard.0 else { unreachable!() }; + dir } } -pub(crate) async fn create_standalone_instance(test_name: &str) -> MockStandaloneInstance { - let (opts, guard) = create_tmp_dir_and_datanode_opts(test_name); +pub async fn create_standalone_instance(test_name: &str) -> MockStandaloneInstance { + let (opts, guard) = create_tmp_dir_and_datanode_opts(StorageType::File, test_name); let dn_instance = Arc::new(DatanodeInstance::new(&opts).await.unwrap()); let frontend_instance = Instance::try_new_standalone(dn_instance.clone()) .await @@ -110,221 +91,9 @@ pub(crate) async fn create_standalone_instance(test_name: &str) -> MockStandalon } } -fn create_tmp_dir_and_datanode_opts(name: &str) -> (DatanodeOptions, TestGuard) { - let wal_tmp_dir = create_temp_dir(&format!("gt_wal_{name}")); - let data_tmp_dir = create_temp_dir(&format!("gt_data_{name}")); - let opts = DatanodeOptions { - wal: WalConfig { - dir: wal_tmp_dir.path().to_str().unwrap().to_string(), - ..Default::default() - }, - storage: StorageConfig { - store: ObjectStoreConfig::File(FileConfig { - data_dir: data_tmp_dir.path().to_str().unwrap().to_string(), - }), - ..Default::default() - }, - mode: Mode::Standalone, - procedure: ProcedureConfig::default(), - ..Default::default() - }; - ( - opts, - TestGuard { - _wal_tmp_dir: wal_tmp_dir, - _data_tmp_dir: data_tmp_dir, - }, - ) -} - -pub(crate) async fn create_datanode_client( - datanode_instance: Arc, -) -> (String, Client) { - let (client, server) = tokio::io::duplex(1024); - - let runtime = Arc::new( - RuntimeBuilder::default() - .worker_threads(2) - .thread_name("grpc-handlers") - .build() - .unwrap(), - ); - - // create a mock datanode grpc service, see example here: - // https://github.com/hyperium/tonic/blob/master/examples/src/mock/mock.rs - let grpc_server = GrpcServer::new( - ServerGrpcQueryHandlerAdaptor::arc(datanode_instance), - None, - None, - runtime, - ); - tokio::spawn(async move { - Server::builder() - .add_service(grpc_server.create_flight_service()) - .add_service(grpc_server.create_database_service()) - .serve_with_incoming(futures::stream::iter(vec![Ok::<_, std::io::Error>(server)])) - .await - }); - - // Move client to an option so we can _move_ the inner value - // on the first attempt to connect. All other attempts will fail. 
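Note on `data_tmp_dirs` and `data_tmp_dir` above: both use `let ... else`, binding the single expected `TempDirGuard::File` variant and diverging otherwise, since these helpers are only meaningful for file-backed test storage. The same shape in isolation, with illustrative types rather than the real guards:

use std::path::{Path, PathBuf};

#[allow(dead_code)]
enum StorageLocation {
    File(PathBuf),
    Remote(String),
}

fn local_path(location: &StorageLocation) -> &Path {
    // let-else: if the pattern does not match, the else block must diverge.
    let StorageLocation::File(path) = location else {
        unreachable!("only file-backed storage is expected here");
    };
    path.as_path()
}

fn main() {
    let location = StorageLocation::File(PathBuf::from("/tmp/gt_data_test"));
    assert!(local_path(&location).starts_with("/tmp"));
}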
- let mut client = Some(client); - // "127.0.0.1:3001" is just a placeholder, does not actually connect to it. - let addr = "127.0.0.1:3001"; - let channel_manager = ChannelManager::new(); - channel_manager - .reset_with_connector( - addr, - service_fn(move |_| { - let client = client.take(); - - async move { - if let Some(client) = client { - Ok(client) - } else { - Err(std::io::Error::new( - std::io::ErrorKind::Other, - "Client already taken", - )) - } - } - }), - ) - .unwrap(); - ( - addr.to_string(), - Client::with_manager_and_urls(channel_manager, vec![addr]), - ) -} - -async fn create_distributed_datanode( - test_name: &str, - datanode_id: u64, - meta_srv: MockInfo, -) -> (Arc, TestGuard) { - let wal_tmp_dir = create_temp_dir(&format!("gt_wal_{test_name}_dist_dn_{datanode_id}")); - let data_tmp_dir = create_temp_dir(&format!("gt_data_{test_name}_dist_dn_{datanode_id}")); - let opts = DatanodeOptions { - node_id: Some(datanode_id), - wal: WalConfig { - dir: wal_tmp_dir.path().to_str().unwrap().to_string(), - ..Default::default() - }, - storage: StorageConfig { - store: ObjectStoreConfig::File(FileConfig { - data_dir: data_tmp_dir.path().to_str().unwrap().to_string(), - }), - ..Default::default() - }, - mode: Mode::Distributed, - procedure: ProcedureConfig::default(), - ..Default::default() - }; - - let instance = Arc::new( - DatanodeInstance::with_mock_meta_server(&opts, meta_srv) - .await - .unwrap(), - ); - instance.start().await.unwrap(); - - // create another catalog and schema for testing - let _ = instance - .catalog_manager() - .as_any() - .downcast_ref::() - .unwrap() - .create_catalog_and_schema("another_catalog", "another_schema") - .await - .unwrap(); - - ( - instance, - TestGuard { - _wal_tmp_dir: wal_tmp_dir, - _data_tmp_dir: data_tmp_dir, - }, - ) -} - -async fn wait_datanodes_alive(kv_store: KvStoreRef) { - let wait = 10; - for _ in 0..wait { - let datanodes = meta_srv::lease::alive_datanodes(1000, &kv_store, |_, _| true) - .await - .unwrap(); - if datanodes.len() >= 4 { - return; - } - tokio::time::sleep(Duration::from_secs(1)).await - } - panic!() -} - -pub(crate) async fn create_distributed_instance(test_name: &str) -> MockDistributedInstance { - let kv_store: KvStoreRef = Arc::new(MemStore::default()) as _; - let meta_srv = meta_srv::mocks::mock(MetaSrvOptions::default(), kv_store.clone(), None).await; - - let datanode_clients = Arc::new(DatanodeClients::default()); - - let mut test_guards = vec![]; - - let mut datanode_instances = HashMap::new(); - for datanode_id in 1..=4 { - let (dn_instance, guard) = - create_distributed_datanode(test_name, datanode_id, meta_srv.clone()).await; - datanode_instances.insert(datanode_id, dn_instance.clone()); - - test_guards.push(guard); - - let (addr, client) = create_datanode_client(dn_instance).await; - datanode_clients - .insert_client(Peer::new(datanode_id, addr), client) - .await; - } - - let MockInfo { - server_addr, - channel_manager, - } = meta_srv.clone(); - let mut meta_client = MetaClientBuilder::new(1000, 0, Role::Frontend) - .enable_router() - .enable_store() - .channel_manager(channel_manager) - .build(); - meta_client.start(&[&server_addr]).await.unwrap(); - let meta_client = Arc::new(meta_client); - - let meta_backend = Arc::new(MetaKvBackend { - client: meta_client.clone(), - }); - let partition_manager = Arc::new(PartitionRuleManager::new(Arc::new(TableRoutes::new( - meta_client.clone(), - )))); - let mut catalog_manager = - FrontendCatalogManager::new(meta_backend, partition_manager, datanode_clients.clone()); - 
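Note on the removed `create_datanode_client` above (its replacement presumably lives in the new `tests_integration::cluster` module): it wires the frontend to an in-process datanode over a `tokio::io::duplex` pair, and because that stream can be handed out only once, the connector closure stashes it in an `Option` and `take`s it on first use. That move-once idiom in isolation, with a plain `String` standing in for the duplex stream:

fn main() {
    // Stand-in for the duplex stream: an owned value the closure may yield only once.
    let mut slot = Some(String::from("in-memory channel"));

    let mut connect = move || -> std::io::Result<String> {
        match slot.take() {
            Some(channel) => Ok(channel), // first call moves the value out
            None => Err(std::io::Error::new(
                std::io::ErrorKind::Other,
                "Client already taken",
            )),
        }
    };

    assert!(connect().is_ok());
    assert!(connect().is_err()); // every later connection attempt fails
}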
- wait_datanodes_alive(kv_store).await; - - let dist_instance = DistInstance::new( - meta_client.clone(), - Arc::new(catalog_manager.clone()), - datanode_clients.clone(), - ); - let dist_instance = Arc::new(dist_instance); - - catalog_manager.set_dist_instance(dist_instance.clone()); - let catalog_manager = Arc::new(catalog_manager); - - let frontend = Instance::new_distributed(catalog_manager.clone(), dist_instance.clone()).await; - - MockDistributedInstance { - frontend: Arc::new(frontend), - dist_instance, - datanodes: datanode_instances, - catalog_manager, - _guards: test_guards, - } +pub async fn create_distributed_instance(test_name: &str) -> MockDistributedInstance { + let cluster = GreptimeDbClusterBuilder::new(test_name).build().await; + MockDistributedInstance(cluster) } pub fn test_region_dir( diff --git a/tests-integration/src/tests/test_util.rs b/tests-integration/src/tests/test_util.rs index d5a4d56aaa..041a9b150d 100644 --- a/tests-integration/src/tests/test_util.rs +++ b/tests-integration/src/tests/test_util.rs @@ -43,7 +43,7 @@ impl MockInstance for MockStandaloneInstance { impl MockInstance for MockDistributedInstance { fn frontend(&self) -> Arc { - self.frontend.clone() + self.frontend() } fn is_distributed_mode(&self) -> bool { diff --git a/tests-integration/tests/region_failover.rs b/tests-integration/tests/region_failover.rs new file mode 100644 index 0000000000..706fdfd1a6 --- /dev/null +++ b/tests-integration/tests/region_failover.rs @@ -0,0 +1,145 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashMap; + +use catalog::helper::TableGlobalKey; +use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, MITO_ENGINE}; +use common_meta::RegionIdent; +use common_procedure::{watcher, ProcedureWithId}; +use common_telemetry::info; +use meta_srv::metasrv::SelectorContext; +use meta_srv::procedure::region_failover::{RegionFailoverContext, RegionFailoverProcedure}; +use meta_srv::table_routes; +use servers::query_handler::sql::SqlQueryHandler; +use session::context::QueryContext; +use tests_integration::cluster::{GreptimeDbCluster, GreptimeDbClusterBuilder}; +use tests_integration::test_util::{get_test_store_config, StorageType}; + +// TODO(LFC): wait for close regions in datanode, and read/write route in frontend ready +#[tokio::test(flavor = "multi_thread")] +async fn test_region_failover() { + common_telemetry::init_default_ut_logging(); + + let cluster_name = "test_region_failover"; + + let (store_config, _guard) = get_test_store_config(&StorageType::File, cluster_name); + + let cluster = GreptimeDbClusterBuilder::new(cluster_name) + .with_datanodes(2) + .with_store_config(store_config) + .build() + .await; + + prepare_testing_table(&cluster).await; + + let distribution = find_region_distribution(&cluster).await; + info!("Find region distribution: {distribution:?}"); + + let failed_region = choose_failed_region(distribution); + info!("Simulating failed region: {failed_region:#?}"); + + run_region_failover_procedure(&cluster, failed_region.clone()).await; + + let mut distribution = find_region_distribution(&cluster).await; + info!("Find region distribution again: {distribution:?}"); + + assert!(!distribution + .remove(&failed_region.datanode_id) + .unwrap() + .contains(&failed_region.region_number)); + // Since there are only two datanodes, the other datanode is the candidate. 
+ assert!(distribution + .values() + .next() + .unwrap() + .contains(&failed_region.region_number)); +} + +async fn prepare_testing_table(cluster: &GreptimeDbCluster) { + let sql = r" +CREATE TABLE my_table ( + i INT PRIMARY KEY, + ts TIMESTAMP TIME INDEX, +) PARTITION BY RANGE COLUMNS (i) ( + PARTITION r0 VALUES LESS THAN (10), + PARTITION r1 VALUES LESS THAN (20), + PARTITION r2 VALUES LESS THAN (50), + PARTITION r3 VALUES LESS THAN (MAXVALUE), +)"; + let result = cluster.frontend.do_query(sql, QueryContext::arc()).await; + assert!(result[0].is_ok()); +} + +async fn find_region_distribution(cluster: &GreptimeDbCluster) -> HashMap> { + let key = TableGlobalKey { + catalog_name: DEFAULT_CATALOG_NAME.to_string(), + schema_name: DEFAULT_SCHEMA_NAME.to_string(), + table_name: "my_table".to_string(), + }; + let value = table_routes::get_table_global_value(&cluster.kv_store, &key) + .await + .unwrap() + .unwrap(); + value.regions_id_map +} + +fn choose_failed_region(distribution: HashMap>) -> RegionIdent { + let (failed_datanode, failed_region) = distribution + .iter() + .filter_map(|(datanode_id, regions)| { + if !regions.is_empty() { + Some((*datanode_id, regions[0])) + } else { + None + } + }) + .next() + .unwrap(); + RegionIdent { + cluster_id: 1000, + datanode_id: failed_datanode, + table_id: 1025, + engine: MITO_ENGINE.to_string(), + catalog: DEFAULT_CATALOG_NAME.to_string(), + schema: DEFAULT_SCHEMA_NAME.to_string(), + table: "my_table".to_string(), + region_number: failed_region, + } +} + +async fn run_region_failover_procedure(cluster: &GreptimeDbCluster, failed_region: RegionIdent) { + let meta_srv = &cluster.meta_srv; + let procedure_manager = meta_srv.procedure_manager(); + let procedure = RegionFailoverProcedure::new( + failed_region.clone(), + RegionFailoverContext { + mailbox: meta_srv.mailbox(), + selector: meta_srv.selector(), + selector_ctx: SelectorContext { + datanode_lease_secs: meta_srv.options().datanode_lease_secs, + server_addr: meta_srv.options().server_addr.clone(), + kv_store: meta_srv.kv_store(), + catalog: None, + schema: None, + }, + }, + ); + let procedure_with_id = ProcedureWithId::with_random_id(Box::new(procedure)); + let procedure_id = procedure_with_id.id; + info!("Starting region failover procedure {procedure_id} for region {failed_region:?}"); + + let watcher = &mut procedure_manager.submit(procedure_with_id).await.unwrap(); + watcher::wait(watcher).await.unwrap(); +}
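A closing note on the failover test above: `choose_failed_region` simply picks the first region owned by any datanode, and the assertions then check that this region left its original datanode and reappeared on the only other candidate. A self-contained sketch of that selection and assertion shape, using a plain `HashMap<u64, Vec<u32>>` as an assumed stand-in for the `regions_id_map` layout:

use std::collections::HashMap;

fn pick_failed(distribution: &HashMap<u64, Vec<u32>>) -> Option<(u64, u32)> {
    // First datanode that owns any region, and its first region.
    distribution
        .iter()
        .filter_map(|(datanode_id, regions)| regions.first().map(|r| (*datanode_id, *r)))
        .next()
}

fn main() {
    let mut distribution: HashMap<u64, Vec<u32>> = HashMap::new();
    distribution.insert(1, vec![0, 2]);
    distribution.insert(2, vec![1, 3]);

    let (failed_datanode, failed_region) = pick_failed(&distribution).unwrap();

    // Simulate the failover outcome: the failed region leaves its datanode and lands on
    // the remaining candidate, mirroring the assertions in test_region_failover.
    let mut left_behind = distribution.remove(&failed_datanode).unwrap();
    left_behind.retain(|r| *r != failed_region);
    distribution.values_mut().next().unwrap().push(failed_region);

    assert!(!left_behind.contains(&failed_region));
    assert!(distribution.values().next().unwrap().contains(&failed_region));
}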