feat: Add HTTP API for cpu profiling (#1694)

* chore: print source error in mem-prof

* feat(common-pprof): add pprof crate

* feat(servers): Add pprof handler to router

refactor the mem_prof handler to avoid checking feature while
registering router

* feat(servers): pprof handler support different output type

* docs(common-pprof): Add readme

* feat(common-pprof): Build guard using code in pprof-rs's example

* feat(common-pprof): use prost

* feat: don't add timeout to perf api

* feat: add feature pprof

* feat: update readme

* test: fix tests

* feat: close region in TestBase

* feat(pprof): addres comments
This commit is contained in:
Yingwen
2023-06-07 15:25:16 +08:00
committed by GitHub
parent 8cda1635cc
commit 5b8e54e60e
12 changed files with 467 additions and 19 deletions

149
Cargo.lock generated
View File

@@ -1816,6 +1816,17 @@ dependencies = [
"tokio",
]
[[package]]
name = "common-pprof"
version = "0.4.0"
dependencies = [
"common-error",
"pprof",
"prost",
"snafu",
"tokio",
]
[[package]]
name = "common-procedure"
version = "0.4.0"
@@ -2078,6 +2089,15 @@ version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"
[[package]]
name = "cpp_demangle"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2c76f98bdfc7f66172e6c7065f981ebb576ffc903fe4c0561d9f0c2509226dc6"
dependencies = [
"cfg-if 1.0.0",
]
[[package]]
name = "cpufeatures"
version = "0.2.7"
@@ -2655,6 +2675,15 @@ dependencies = [
"snafu",
]
[[package]]
name = "debugid"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef552e6f588e446098f6ba40d89ac146c8c7b64aade83c051ee00bb5d2bc18d"
dependencies = [
"uuid",
]
[[package]]
name = "der"
version = "0.5.1"
@@ -3099,6 +3128,18 @@ dependencies = [
"windows-sys 0.48.0",
]
[[package]]
name = "findshlibs"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "40b9e59cd0f7e0806cca4be089683ecb6434e602038df21fe6bf6711b2f07f64"
dependencies = [
"cc",
"lazy_static",
"libc",
"winapi",
]
[[package]]
name = "fixedbitset"
version = "0.4.2"
@@ -4401,6 +4442,24 @@ version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bfa799dd5ed20a7e349f3b4639aa80d74549c81716d9ec4f994c9b5815598306"
[[package]]
name = "inferno"
version = "0.11.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2fb7c1b80a1dfa604bb4a649a5c5aeef3d913f7c520cb42b40e534e8a61bcdfc"
dependencies = [
"ahash 0.8.3",
"indexmap",
"is-terminal",
"itoa",
"log",
"num-format",
"once_cell",
"quick-xml 0.26.0",
"rgb",
"str_stack",
]
[[package]]
name = "influxdb_line_protocol"
version = "0.1.0"
@@ -5643,6 +5702,16 @@ dependencies = [
"syn 1.0.109",
]
[[package]]
name = "num-format"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a652d9771a63711fd3c3deb670acfbe5c30a4072e664d7a3bf5a9e1056ac72c3"
dependencies = [
"arrayvec",
"itoa",
]
[[package]]
name = "num-integer"
version = "0.1.45"
@@ -6519,6 +6588,32 @@ dependencies = [
"postgres-protocol",
]
[[package]]
name = "pprof"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "196ded5d4be535690899a4631cc9f18cdc41b7ebf24a79400f46f48e49a11059"
dependencies = [
"backtrace",
"cfg-if 1.0.0",
"findshlibs",
"inferno",
"libc",
"log",
"nix 0.26.2",
"once_cell",
"parking_lot 0.12.1",
"prost",
"prost-build",
"prost-derive",
"protobuf",
"sha2",
"smallvec",
"symbolic-demangle",
"tempfile",
"thiserror",
]
[[package]]
name = "ppv-lite86"
version = "0.2.17"
@@ -6987,6 +7082,15 @@ dependencies = [
"tokio-stream",
]
[[package]]
name = "quick-xml"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f50b1c63b38611e7d4d7f68b82d3ad0cc71a2ad2e7f61fc10f1328d917c93cd"
dependencies = [
"memchr",
]
[[package]]
name = "quick-xml"
version = "0.27.1"
@@ -7342,6 +7446,15 @@ dependencies = [
"thiserror",
]
[[package]]
name = "rgb"
version = "0.8.36"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "20ec2d3e3fc7a92ced357df9cebd5a10b6fb2aa1ee797bf7e9ce2f17dffc8f59"
dependencies = [
"bytemuck",
]
[[package]]
name = "ring"
version = "0.16.20"
@@ -8414,6 +8527,7 @@ dependencies = [
"common-grpc",
"common-grpc-expr",
"common-mem-prof",
"common-pprof",
"common-query",
"common-recordbatch",
"common-runtime",
@@ -8942,6 +9056,12 @@ dependencies = [
"optional",
]
[[package]]
name = "stable_deref_trait"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
[[package]]
name = "static_assertions"
version = "1.1.0"
@@ -9049,6 +9169,12 @@ version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e08d8363704e6c71fc928674353e6b7c23dcea9d82d7012c8faf2a3a025f8d0"
[[package]]
name = "str_stack"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9091b6114800a5f2141aee1d1b9d6ca3592ac062dc5decb3764ec5895a47b4eb"
[[package]]
name = "streaming-stats"
version = "0.2.3"
@@ -9205,6 +9331,29 @@ version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc"
[[package]]
name = "symbolic-common"
version = "10.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b55cdc318ede251d0957f07afe5fed912119b8c1bc5a7804151826db999e737"
dependencies = [
"debugid",
"memmap2",
"stable_deref_trait",
"uuid",
]
[[package]]
name = "symbolic-demangle"
version = "10.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79be897be8a483a81fff6a3a4e195b4ac838ef73ca42d348b3f722da9902e489"
dependencies = [
"cpp_demangle",
"rustc-demangle",
"symbolic-common",
]
[[package]]
name = "syn"
version = "1.0.109"

View File

@@ -17,6 +17,7 @@ members = [
"src/common/meta",
"src/common/procedure",
"src/common/procedure-test",
"src/common/pprof",
"src/common/query",
"src/common/recordbatch",
"src/common/runtime",

View File

@@ -23,7 +23,7 @@ pub type Result<T> = std::result::Result<T, Error>;
#[derive(Debug, Snafu)]
#[snafu(visibility(pub))]
pub enum Error {
#[snafu(display("Failed to read OPT_PROF"))]
#[snafu(display("Failed to read OPT_PROF, source: {}", source))]
ReadOptProf { source: tikv_jemalloc_ctl::Error },
#[snafu(display("Memory profiling is not enabled"))]
@@ -32,13 +32,17 @@ pub enum Error {
#[snafu(display("Failed to build temp file from given path: {:?}", path))]
BuildTempPath { path: PathBuf, location: Location },
#[snafu(display("Failed to open temp file: {}", path))]
#[snafu(display("Failed to open temp file: {}, source: {}", path, source))]
OpenTempFile {
path: String,
source: std::io::Error,
},
#[snafu(display("Failed to dump profiling data to temp file: {:?}", path))]
#[snafu(display(
"Failed to dump profiling data to temp file: {:?}, source: {}",
path,
source
))]
DumpProfileData {
path: PathBuf,
source: tikv_jemalloc_ctl::Error,

View File

@@ -0,0 +1,16 @@
[package]
name = "common-pprof"
version.workspace = true
edition.workspace = true
license.workspace = true
[dependencies]
common-error = { path = "../error" }
pprof = { version = "0.11", features = [
"flamegraph",
"prost-codec",
"protobuf",
] }
prost.workspace = true
snafu.workspace = true
tokio.workspace = true

View File

@@ -0,0 +1,28 @@
# Profiling CPU
## Build GreptimeDB with `pprof` feature
```bash
cargo build --features=pprof
```
## HTTP API
Sample at 99 Hertz, for 5 seconds, output report in [protobuf format](https://github.com/google/pprof/blob/master/proto/profile.proto).
```bash
curl -s '0:4000/v1/prof/cpu' > /tmp/pprof.out
```
Then you can use `pprof` command with the protobuf file.
```bash
go tool pprof -top /tmp/pprof.out
```
Sample at 99 Hertz, for 60 seconds, output report in flamegraph format.
```bash
curl -s '0:4000/v1/prof/cpu?seconds=60&output=flamegraph' > /tmp/pprof.svg
```
Sample at 49 Hertz, for 10 seconds, output report in text format.
```bash
curl -s '0:4000/v1/prof/cpu?seconds=10&frequency=49&output=text' > /tmp/pprof.txt
```

124
src/common/pprof/src/lib.rs Normal file
View File

@@ -0,0 +1,124 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use std::time::Duration;
use common_error::prelude::{ErrorExt, StatusCode};
use prost::Message;
use snafu::{Location, ResultExt, Snafu};
#[derive(Debug, Snafu)]
pub enum Error {
#[snafu(display(
"Failed to create profiler guard, source: {}, location: {}",
source,
location
))]
CreateGuard {
source: pprof::Error,
location: Location,
},
#[snafu(display("Failed to create report, source: {}, location: {}", source, location))]
CreateReport {
source: pprof::Error,
location: Location,
},
#[snafu(display(
"Failed to create flamegraph, source: {}, location: {}",
source,
location
))]
CreateFlamegraph {
source: pprof::Error,
location: Location,
},
#[snafu(display(
"Failed to create pprof report, source: {}, location: {}",
source,
location
))]
ReportPprof {
source: pprof::Error,
location: Location,
},
}
pub type Result<T> = std::result::Result<T, Error>;
impl ErrorExt for Error {
fn status_code(&self) -> StatusCode {
StatusCode::Unexpected
}
fn as_any(&self) -> &dyn Any {
self
}
}
/// CPU profiler utility.
// Inspired by https://github.com/datafuselabs/databend/blob/67f445e83cd4eceda98f6c1c114858929d564029/src/common/base/src/base/profiling.rs
#[derive(Debug)]
pub struct Profiling {
/// Sample duration.
duration: Duration,
/// Sample frequency.
frequency: i32,
}
impl Profiling {
/// Creates a new profiler.
pub fn new(duration: Duration, frequency: i32) -> Profiling {
Profiling {
duration,
frequency,
}
}
/// Profiles and returns a generated pprof report.
pub async fn report(&self) -> Result<pprof::Report> {
let guard = pprof::ProfilerGuardBuilder::default()
.frequency(self.frequency)
.blocklist(&["libc", "libgcc", "pthread", "vdso"])
.build()
.context(CreateGuardSnafu)?;
tokio::time::sleep(self.duration).await;
guard.report().build().context(CreateReportSnafu)
}
/// Profiles and returns a generated flamegraph.
pub async fn dump_flamegraph(&self) -> Result<Vec<u8>> {
let mut body: Vec<u8> = Vec::new();
let report = self.report().await?;
report
.flamegraph(&mut body)
.context(CreateFlamegraphSnafu)?;
Ok(body)
}
/// Profiles and returns a generated proto.
pub async fn dump_proto(&self) -> Result<Vec<u8>> {
let report = self.report().await?;
// Generate googles pprof format report.
let profile = report.pprof().context(ReportPprofSnafu)?;
let body = profile.encode_to_vec();
Ok(body)
}
}

View File

@@ -5,6 +5,7 @@ edition.workspace = true
license.workspace = true
[features]
pprof = ["dep:common-pprof"]
mem-prof = ["dep:common-mem-prof"]
dashboard = []
@@ -25,6 +26,7 @@ common-error = { path = "../common/error" }
common-grpc = { path = "../common/grpc" }
common-grpc-expr = { path = "../common/grpc-expr" }
common-mem-prof = { path = "../common/mem-prof", optional = true }
common-pprof = { path = "../common/pprof", optional = true }
common-query = { path = "../common/query" }
common-recordbatch = { path = "../common/recordbatch" }
common-runtime = { path = "../common/runtime" }

View File

@@ -267,6 +267,13 @@ pub enum Error {
location: Location,
},
#[cfg(feature = "pprof")]
#[snafu(display("Failed to dump pprof data, source: {}", source))]
DumpPprof {
#[snafu(backtrace)]
source: common_pprof::Error,
},
#[snafu(display("Failed to update jemalloc metrics, source: {source}, location: {location}"))]
UpdateJemallocMetrics {
source: tikv_jemalloc_ctl::Error,
@@ -347,6 +354,10 @@ impl ErrorExt for Error {
StatusCode::Unknown
}
}
#[cfg(feature = "pprof")]
DumpPprof { source, .. } => source.status_code(),
UpdateJemallocMetrics { .. } => StatusCode::Internal,
}
}

View File

@@ -12,18 +12,18 @@
// See the License for the specific language governing permissions and
// limitations under the License.
mod admin;
pub mod authorize;
pub mod handler;
pub mod influxdb;
pub mod mem_prof;
pub mod opentsdb;
mod pprof;
pub mod prometheus;
pub mod script;
mod admin;
#[cfg(feature = "dashboard")]
mod dashboard;
#[cfg(feature = "mem-prof")]
pub mod mem_prof;
use std::net::SocketAddr;
use std::sync::Arc;
@@ -503,15 +503,6 @@ impl HttpServer {
);
}
// mem profiler
#[cfg(feature = "mem-prof")]
{
router = router.nest(
&format!("/{HTTP_API_VERSION}/prof"),
Router::new().route("/mem", routing::get(crate::http::mem_prof::mem_prof)),
);
}
if let Some(metrics_handler) = self.metrics_handler {
router = router.nest("", self.route_metrics(metrics_handler));
}
@@ -556,6 +547,19 @@ impl HttpServer {
HttpAuth::<BoxBody>::new(self.user_provider.clone()),
)),
)
// Handlers for debug, we don't expect a timeout.
.nest(
&format!("/{HTTP_API_VERSION}/prof"),
Router::new()
.route(
"/cpu",
routing::get(pprof::pprof_handler).post(pprof::pprof_handler),
)
.route(
"/mem",
routing::get(mem_prof::mem_prof_handler).post(mem_prof::mem_prof_handler),
),
)
}
fn route_metrics<S>(&self, metrics_handler: MetricsHandler) -> Router<S> {

View File

@@ -14,13 +14,14 @@
use axum::http::StatusCode;
use axum::response::IntoResponse;
use snafu::ResultExt;
use crate::error::DumpProfileDataSnafu;
#[cfg(feature = "mem-prof")]
#[axum_macros::debug_handler]
pub async fn mem_prof() -> crate::error::Result<impl IntoResponse> {
pub async fn mem_prof_handler() -> crate::error::Result<impl IntoResponse> {
use snafu::ResultExt;
use crate::error::DumpProfileDataSnafu;
Ok((
StatusCode::OK,
common_mem_prof::dump_profile()
@@ -28,3 +29,12 @@ pub async fn mem_prof() -> crate::error::Result<impl IntoResponse> {
.context(DumpProfileDataSnafu)?,
))
}
#[cfg(not(feature = "mem-prof"))]
#[axum_macros::debug_handler]
pub async fn mem_prof_handler() -> crate::error::Result<impl IntoResponse> {
Ok((
StatusCode::NOT_IMPLEMENTED,
"The 'mem-prof' feature is disabled",
))
}

View File

@@ -0,0 +1,98 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#[cfg(feature = "pprof")]
pub mod handler {
use std::num::NonZeroI32;
use std::time::Duration;
use axum::extract::Query;
use axum::http::StatusCode;
use axum::response::IntoResponse;
use common_pprof::Profiling;
use common_telemetry::logging;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use snafu::ResultExt;
use crate::error::{DumpPprofSnafu, Result};
/// Output format.
#[derive(Debug, Serialize, Deserialize, JsonSchema)]
#[serde(rename_all = "snake_case")]
pub enum Output {
/// googles pprof format report in protobuf.
Proto,
/// Simple text format.
Text,
/// svg flamegraph.
Flamegraph,
}
#[derive(Serialize, Deserialize, Debug, JsonSchema)]
#[serde(default)]
pub struct PprofQuery {
seconds: u64,
frequency: NonZeroI32,
output: Output,
}
impl Default for PprofQuery {
fn default() -> PprofQuery {
PprofQuery {
seconds: 5,
// Safety: 99 is non zero.
frequency: NonZeroI32::new(99).unwrap(),
output: Output::Proto,
}
}
}
#[axum_macros::debug_handler]
pub async fn pprof_handler(Query(req): Query<PprofQuery>) -> Result<impl IntoResponse> {
logging::info!("start pprof, request: {:?}", req);
let profiling = Profiling::new(Duration::from_secs(req.seconds), req.frequency.into());
let body = match req.output {
Output::Proto => profiling.dump_proto().await.context(DumpPprofSnafu)?,
Output::Text => {
let report = profiling.report().await.context(DumpPprofSnafu)?;
format!("{:?}", report).into_bytes()
}
Output::Flamegraph => profiling.dump_flamegraph().await.context(DumpPprofSnafu)?,
};
logging::info!("finish pprof");
Ok((StatusCode::OK, body))
}
}
#[cfg(not(feature = "pprof"))]
pub mod handler {
use axum::http::StatusCode;
use axum::response::IntoResponse;
use crate::error::Result;
#[axum_macros::debug_handler]
pub async fn pprof_handler() -> Result<impl IntoResponse> {
Ok((
StatusCode::NOT_IMPLEMENTED,
"The 'pprof' feature is disabled",
))
}
}
pub use handler::pprof_handler;

View File

@@ -94,6 +94,7 @@ impl<S: LogStore> TesterBase<S> {
}
pub async fn close(&self) {
self.region.close(&CloseContext::default()).await.unwrap();
self.region.inner.wal.close().await.unwrap();
}