feat: add InformationExtension.inspect_datanode for datanode inspection (#6921)

* feat: add InformationExtension.inspect_datanode for datanode inspection

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* aggregate results from all datanodes

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix fmt

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix unreleased mito engine

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
This commit is contained in:
Zhenchi
2025-09-09 11:29:04 +08:00
committed by GitHub
parent 9fe84f6fbd
commit 264d05d20e
10 changed files with 653 additions and 66 deletions

View File

@@ -341,8 +341,18 @@ impl StartCommand {
.build(),
);
let information_extension =
Arc::new(DistributedInformationExtension::new(meta_client.clone()));
// flownode's frontend to datanode need not timeout.
// Some queries are expected to take long time.
let channel_config = ChannelConfig {
timeout: None,
..Default::default()
};
let client = Arc::new(NodeClients::new(channel_config));
let information_extension = Arc::new(DistributedInformationExtension::new(
meta_client.clone(),
client.clone(),
));
let catalog_manager = KvBackendCatalogManagerBuilder::new(
information_extension,
cached_meta_backend.clone(),
@@ -398,14 +408,6 @@ impl StartCommand {
flownode.setup_services(services);
let flownode = flownode;
// flownode's frontend to datanode need not timeout.
// Some queries are expected to take long time.
let channel_config = ChannelConfig {
timeout: None,
..Default::default()
};
let client = Arc::new(NodeClients::new(channel_config));
let invoker = FrontendInvoker::build_from(
flownode.flow_engine().streaming_engine(),
catalog_manager.clone(),

View File

@@ -378,8 +378,24 @@ impl StartCommand {
.build(),
);
let information_extension =
Arc::new(DistributedInformationExtension::new(meta_client.clone()));
// frontend to datanode need not timeout.
// Some queries are expected to take long time.
let mut channel_config = ChannelConfig {
timeout: None,
tcp_nodelay: opts.datanode.client.tcp_nodelay,
connect_timeout: Some(opts.datanode.client.connect_timeout),
..Default::default()
};
if opts.grpc.flight_compression.transport_compression() {
channel_config.accept_compression = true;
channel_config.send_compression = true;
}
let client = Arc::new(NodeClients::new(channel_config));
let information_extension = Arc::new(DistributedInformationExtension::new(
meta_client.clone(),
client.clone(),
));
let process_manager = Arc::new(ProcessManager::new(
addrs::resolve_addr(&opts.grpc.bind_addr, Some(&opts.grpc.server_addr)),
@@ -413,26 +429,12 @@ impl StartCommand {
);
let heartbeat_task = Some(heartbeat_task);
// frontend to datanode need not timeout.
// Some queries are expected to take long time.
let mut channel_config = ChannelConfig {
timeout: None,
tcp_nodelay: opts.datanode.client.tcp_nodelay,
connect_timeout: Some(opts.datanode.client.connect_timeout),
..Default::default()
};
if opts.grpc.flight_compression.transport_compression() {
channel_config.accept_compression = true;
channel_config.send_compression = true;
}
let client = NodeClients::new(channel_config);
let instance = FrontendBuilder::new(
opts.clone(),
cached_meta_backend.clone(),
layered_cache_registry.clone(),
catalog_manager,
Arc::new(client),
client,
meta_client,
process_manager,
)

View File

@@ -19,10 +19,11 @@ use std::{fs, path};
use async_trait::async_trait;
use cache::{build_fundamental_cache_registry, with_default_composite_cache_registry};
use catalog::information_schema::InformationExtension;
use catalog::information_schema::{DatanodeInspectRequest, InformationExtension};
use catalog::kvbackend::KvBackendCatalogManagerBuilder;
use catalog::process_manager::ProcessManager;
use clap::Parser;
use client::SendableRecordBatchStream;
use client::api::v1::meta::RegionRole;
use common_base::Plugins;
use common_base::readable_size::ReadableSize;
@@ -48,6 +49,7 @@ use common_meta::sequence::SequenceBuilder;
use common_meta::wal_options_allocator::{WalOptionsAllocatorRef, build_wal_options_allocator};
use common_options::memory::MemoryOptions;
use common_procedure::{ProcedureInfo, ProcedureManagerRef};
use common_query::request::QueryRequest;
use common_telemetry::info;
use common_telemetry::logging::{
DEFAULT_LOGGING_DIR, LoggingOptions, SlowQueryOptions, TracingOptions,
@@ -80,6 +82,7 @@ use servers::grpc::GrpcOptions;
use servers::http::HttpOptions;
use servers::tls::{TlsMode, TlsOption};
use snafu::ResultExt;
use store_api::storage::RegionId;
use tokio::sync::RwLock;
use tracing_appender::non_blocking::WorkerGuard;
@@ -856,6 +859,25 @@ impl InformationExtension for StandaloneInformationExtension {
.await,
))
}
async fn inspect_datanode(
&self,
request: DatanodeInspectRequest,
) -> std::result::Result<SendableRecordBatchStream, Self::Error> {
let req = QueryRequest {
plan: request
.build_plan()
.context(catalog::error::DatafusionSnafu)?,
region_id: RegionId::default(),
header: None,
};
self.region_server
.handle_read(req)
.await
.map_err(BoxedError::new)
.context(catalog::error::InternalSnafu)
}
}
#[cfg(test)]