From dcb24ce170573a2ae6ed29467669d03c73b589e6 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 3 Dec 2024 12:35:59 +0100 Subject: [PATCH] safekeeper,pageserver: add heap profiling (#9778) ## Problem We don't have good observability for memory usage. This would be useful e.g. to debug OOM incidents or optimize performance or resource usage. We would also like to use continuous profiling with e.g. [Grafana Cloud Profiles](https://grafana.com/products/cloud/profiles-for-continuous-profiling/) (see https://github.com/neondatabase/cloud/issues/14888). This PR is intended as a proof of concept, to try it out in staging and drive further discussions about profiling more broadly. Touches https://github.com/neondatabase/neon/issues/9534. Touches https://github.com/neondatabase/cloud/issues/14888. Depends on #9779. Depends on #9780. ## Summary of changes Adds an HTTP route `/profile/heap` that takes a heap profile and returns it. Query parameters: * `format`: output format (`jemalloc` or `pprof`; default `pprof`). Unlike CPU profiles (see #9764), heap profiles are not symbolized and require the original binary to translate addresses to function names. To make this work with Grafana, we'll probably have to symbolize the process server-side -- this is left as future work, as are other output formats like SVG. Heap profiles don't work on macOS due to limitations in jemalloc. 
--- Cargo.lock | 89 ++++++++++++++++++++++++------- Cargo.toml | 3 +- libs/utils/Cargo.toml | 1 + libs/utils/src/http/endpoint.rs | 64 ++++++++++++++++++++++ pageserver/src/bin/pageserver.rs | 5 ++ pageserver/src/http/routes.rs | 8 +-- safekeeper/benches/receive_wal.rs | 6 +++ safekeeper/src/bin/safekeeper.rs | 5 ++ safekeeper/src/http/routes.rs | 7 ++- workspace_hack/Cargo.toml | 15 ++++-- 10 files changed, 175 insertions(+), 28 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ba02e3b11d..b2769e59f0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -301,7 +301,7 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "fastrand 2.0.0", + "fastrand 2.2.0", "hex", "http 0.2.9", "hyper 0.14.30", @@ -341,7 +341,7 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "fastrand 2.0.0", + "fastrand 2.2.0", "http 0.2.9", "http-body 0.4.5", "once_cell", @@ -417,7 +417,7 @@ dependencies = [ "aws-smithy-xml", "aws-types", "bytes", - "fastrand 2.0.0", + "fastrand 2.2.0", "hex", "hmac", "http 0.2.9", @@ -621,7 +621,7 @@ dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", "bytes", - "fastrand 2.0.0", + "fastrand 2.2.0", "h2 0.3.26", "http 0.2.9", "http-body 0.4.5", @@ -2054,9 +2054,9 @@ dependencies = [ [[package]] name = "fastrand" -version = "2.0.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6999dc1837253364c2ebb0704ba97994bd874e8f195d665c50b7548f6ea92764" +checksum = "486f806e73c5707928240ddc295403b1b93c96a02038563881c4a2fd84b81ac4" [[package]] name = "ff" @@ -2912,6 +2912,23 @@ version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +[[package]] +name = "jemalloc_pprof" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a883828bd6a4b957cd9f618886ff19e5f3ebd34e06ba0e855849e049fef32fb" +dependencies = [ + "anyhow", + "libc", + "mappings", + 
"once_cell", + "pprof_util", + "tempfile", + "tikv-jemalloc-ctl", + "tokio", + "tracing", +] + [[package]] name = "jobserver" version = "0.1.32" @@ -3022,9 +3039,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.150" +version = "0.2.167" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c" +checksum = "09d6582e104315a817dff97f75133544b2e094ee22447d2acf4a74e189ba06fc" [[package]] name = "libloading" @@ -3044,9 +3061,9 @@ checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" [[package]] name = "linux-raw-sys" -version = "0.4.13" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" [[package]] name = "linux-raw-sys" @@ -3079,6 +3096,19 @@ dependencies = [ "hashbrown 0.14.5", ] +[[package]] +name = "mappings" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce9229c438fbf1c333926e2053c4c091feabbd40a1b590ec62710fea2384af9e" +dependencies = [ + "anyhow", + "libc", + "once_cell", + "pprof_util", + "tracing", +] + [[package]] name = "matchers" version = "0.1.0" @@ -3346,6 +3376,7 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b05180d69e3da0e530ba2a1dae5110317e49e3b7f3d41be227dc5f92e49ee7af" dependencies = [ + "num-bigint", "num-complex", "num-integer", "num-iter", @@ -3434,6 +3465,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0" dependencies = [ "autocfg", + "num-bigint", "num-integer", "num-traits", ] @@ -3497,9 +3529,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.18.0" +version = "1.20.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" [[package]] name = "oorandom" @@ -4298,6 +4330,19 @@ dependencies = [ "thiserror", ] +[[package]] +name = "pprof_util" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65c568b3f8c1c37886ae07459b1946249e725c315306b03be5632f84c239f781" +dependencies = [ + "anyhow", + "flate2", + "num", + "paste", + "prost", +] + [[package]] name = "ppv-lite86" version = "0.2.17" @@ -5220,14 +5265,14 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.28" +version = "0.38.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316" +checksum = "d7f649912bc1495e167a6edee79151c84b1bad49748cb4f1f1167f459f6224f6" dependencies = [ "bitflags 2.4.1", "errno", "libc", - "linux-raw-sys 0.4.13", + "linux-raw-sys 0.4.14", "windows-sys 0.52.0", ] @@ -6251,13 +6296,13 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.9.0" +version = "3.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa" +checksum = "28cce251fcbc87fac86a866eeb0d6c2d536fc16d06f184bb61aeae11aa4cee0c" dependencies = [ "cfg-if", - "fastrand 2.0.0", - "redox_syscall 0.4.1", + "fastrand 2.2.0", + "once_cell", "rustix", "windows-sys 0.52.0", ] @@ -7058,6 +7103,7 @@ dependencies = [ "hex-literal", "humantime", "hyper 0.14.30", + "jemalloc_pprof", "jsonwebtoken", "metrics", "nix 0.27.1", @@ -7644,8 +7690,12 @@ dependencies = [ "memchr", "nix 0.26.4", "nom", + "num", "num-bigint", + "num-complex", "num-integer", + "num-iter", + "num-rational", "num-traits", "once_cell", "parquet", @@ -7669,6 +7719,7 @@ dependencies = [ "subtle", "syn 2.0.90", "sync_wrapper 0.1.2", + 
"tikv-jemalloc-ctl", "tikv-jemalloc-sys", "time", "time-macros", diff --git a/Cargo.toml b/Cargo.toml index 036dc01057..91fa6a2607 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -115,6 +115,7 @@ indoc = "2" ipnet = "2.10.0" itertools = "0.10" itoa = "1.0.11" +jemalloc_pprof = "0.6" jsonwebtoken = "9" lasso = "0.7" libc = "0.2" @@ -175,7 +176,7 @@ sync_wrapper = "0.1.2" tar = "0.4" test-context = "0.3" thiserror = "1.0" -tikv-jemallocator = { version = "0.6", features = ["stats"] } +tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] } tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] } tokio = { version = "1.17", features = ["macros"] } tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" } diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 5648072a83..66500fb141 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -26,6 +26,7 @@ humantime.workspace = true hyper0 = { workspace = true, features = ["full"] } fail.workspace = true futures = { workspace = true} +jemalloc_pprof.workspace = true jsonwebtoken.workspace = true nix.workspace = true once_cell.workspace = true diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 6a85f0ddeb..d975b63677 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -10,6 +10,7 @@ use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder}; use once_cell::sync::Lazy; use routerify::ext::RequestExt; use routerify::{Middleware, RequestInfo, Router, RouterBuilder}; +use tokio_util::io::ReaderStream; use tracing::{debug, info, info_span, warn, Instrument}; use std::future::Future; @@ -407,6 +408,69 @@ pub async fn profile_cpu_handler(req: Request) -> Result, A } } +/// Generates heap profiles. +/// +/// This only works with jemalloc on Linux. 
+pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>, ApiError> { + enum Format { + Jemalloc, + Pprof, + } + + // Parameters. + let format = match get_query_param(&req, "format")?.as_deref() { + None => Format::Pprof, + Some("jemalloc") => Format::Jemalloc, + Some("pprof") => Format::Pprof, + Some(format) => return Err(ApiError::BadRequest(anyhow!("invalid format {format}"))), + }; + + // Obtain profiler handle. + let mut prof_ctl = jemalloc_pprof::PROF_CTL + .as_ref() + .ok_or(ApiError::InternalServerError(anyhow!( + "heap profiling not enabled" + )))? + .lock() + .await; + if !prof_ctl.activated() { + return Err(ApiError::InternalServerError(anyhow!( + "heap profiling not enabled" + ))); + } + + // Take and return the profile. + match format { + Format::Jemalloc => { + // NB: file is an open handle to a tempfile that's already deleted. + let file = tokio::task::spawn_blocking(move || prof_ctl.dump()) + .await + .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? + .map_err(ApiError::InternalServerError)?; + let stream = ReaderStream::new(tokio::fs::File::from_std(file)); + Response::builder() + .status(200) + .header(CONTENT_TYPE, "application/octet-stream") + .header(CONTENT_DISPOSITION, "attachment; filename=\"heap.dump\"") + .body(Body::wrap_stream(stream)) + .map_err(|err| ApiError::InternalServerError(err.into())) + } + + Format::Pprof => { + let data = tokio::task::spawn_blocking(move || prof_ctl.dump_pprof()) + .await + .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? 
+ .map_err(ApiError::InternalServerError)?; + Response::builder() + .status(200) + .header(CONTENT_TYPE, "application/octet-stream") + .header(CONTENT_DISPOSITION, "attachment; filename=\"heap.pb\"") + .body(Body::from(data)) + .map_err(|err| ApiError::InternalServerError(err.into())) + } + } +} + pub fn add_request_id_middleware( ) -> Middleware { Middleware::pre(move |req| async move { diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 31f4370855..8fe225c6aa 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -53,6 +53,11 @@ project_build_tag!(BUILD_TAG); #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; +/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20). +#[allow(non_upper_case_globals)] +#[export_name = "malloc_conf"] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; + const PID_FILE_NAME: &str = "pageserver.pid"; const FEATURES: &[&str] = &[ diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index ceb1c3b012..e127871549 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -56,9 +56,9 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::auth::JwtAuth; use utils::failpoint_support::failpoints_handler; -use utils::http::endpoint::profile_cpu_handler; -use utils::http::endpoint::prometheus_metrics_handler; -use utils::http::endpoint::request_span; +use utils::http::endpoint::{ + profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span, +}; use utils::http::request::must_parse_query_param; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; @@ -155,6 +155,7 @@ impl State { "/swagger.yml", "/metrics", "/profile/cpu", + "/profile/heap", ]; Ok(Self { conf, @@ -3203,6 +3204,7 @@ pub fn make_router( .data(state) .get("/metrics", |r| request_span(r, 
prometheus_metrics_handler)) .get("/profile/cpu", |r| request_span(r, profile_cpu_handler)) + .get("/profile/heap", |r| request_span(r, profile_heap_handler)) .get("/v1/status", |r| api_handler(r, status_handler)) .put("/v1/failpoints", |r| { testing_api_handler("manage failpoints", r, failpoints_handler) diff --git a/safekeeper/benches/receive_wal.rs b/safekeeper/benches/receive_wal.rs index 8c4281cf52..313d945b94 100644 --- a/safekeeper/benches/receive_wal.rs +++ b/safekeeper/benches/receive_wal.rs @@ -24,9 +24,15 @@ const KB: usize = 1024; const MB: usize = 1024 * KB; const GB: usize = 1024 * MB; +/// Use jemalloc, and configure it to sample allocations for profiles every 1 MB. +/// This mirrors the configuration in bin/safekeeper.rs. #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; +#[allow(non_upper_case_globals)] +#[export_name = "malloc_conf"] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; + // Register benchmarks with Criterion. criterion_group!( name = benches; diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 3659bcd7e0..4dc7edef37 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -51,6 +51,11 @@ use utils::{ #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; +/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20). 
+#[allow(non_upper_case_globals)] +#[export_name = "malloc_conf"] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; + const PID_FILE_NAME: &str = "safekeeper.pid"; const ID_FILE_NAME: &str = "safekeeper.id"; diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 28294abdb9..69b775fd76 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -14,7 +14,8 @@ use tokio_util::sync::CancellationToken; use tracing::{info_span, Instrument}; use utils::failpoint_support::failpoints_handler; use utils::http::endpoint::{ - profile_cpu_handler, prometheus_metrics_handler, request_span, ChannelWriter, + profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span, + ChannelWriter, }; use utils::http::request::parse_query_param; @@ -573,7 +574,8 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder let mut router = endpoint::make_router(); if conf.http_auth.is_some() { router = router.middleware(auth_middleware(|request| { - const ALLOWLIST_ROUTES: &[&str] = &["/v1/status", "/metrics", "/profile/cpu"]; + const ALLOWLIST_ROUTES: &[&str] = + &["/v1/status", "/metrics", "/profile/cpu", "/profile/heap"]; if ALLOWLIST_ROUTES.contains(&request.uri().path()) { None } else { @@ -594,6 +596,7 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder .data(auth) .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) .get("/profile/cpu", |r| request_span(r, profile_cpu_handler)) + .get("/profile/heap", |r| request_span(r, profile_heap_handler)) .get("/v1/status", |r| request_span(r, status_handler)) .put("/v1/failpoints", |r| { request_span(r, move |r| async { diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index c0a3abc377..d19379aefd 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -55,12 +55,16 @@ log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } nix = { version = 
"0.26" } nom = { version = "7" } +num = { version = "0.4" } num-bigint = { version = "0.4" } +num-complex = { version = "0.4", default-features = false, features = ["std"] } num-integer = { version = "0.1", features = ["i128"] } +num-iter = { version = "0.1", default-features = false, features = ["i128", "std"] } +num-rational = { version = "0.4", default-features = false, features = ["num-bigint-std", "std"] } num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } parquet = { version = "53", default-features = false, features = ["zstd"] } -prost = { version = "0.13", features = ["prost-derive"] } +prost = { version = "0.13", features = ["no-recursion-limit", "prost-derive"] } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } @@ -76,7 +80,8 @@ smallvec = { version = "1", default-features = false, features = ["const_new", " spki = { version = "0.7", default-features = false, features = ["pem", "std"] } subtle = { version = "2" } sync_wrapper = { version = "0.1", default-features = false, features = ["futures"] } -tikv-jemalloc-sys = { version = "0.6", features = ["stats"] } +tikv-jemalloc-ctl = { version = "0.6", features = ["stats", "use_std"] } +tikv-jemalloc-sys = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] } time = { version = "0.3", features = ["macros", "serde-well-known"] } tokio = { version = "1", features = ["full", "test-util"] } tokio-rustls = { version = "0.26", default-features = false, features = ["logging", "ring", "tls12"] } @@ -111,14 +116,18 @@ libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } nom = { version = "7" } +num = { version = "0.4" } 
num-bigint = { version = "0.4" } +num-complex = { version = "0.4", default-features = false, features = ["std"] } num-integer = { version = "0.1", features = ["i128"] } +num-iter = { version = "0.1", default-features = false, features = ["i128", "std"] } +num-rational = { version = "0.4", default-features = false, features = ["num-bigint-std", "std"] } num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } parquet = { version = "53", default-features = false, features = ["zstd"] } prettyplease = { version = "0.2", default-features = false, features = ["verbatim"] } proc-macro2 = { version = "1" } -prost = { version = "0.13", features = ["prost-derive"] } +prost = { version = "0.13", features = ["no-recursion-limit", "prost-derive"] } quote = { version = "1" } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] }