From 1ef4258f290b0401113fcabb800a4ef729cb6649 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Fri, 4 Apr 2025 14:41:28 +0100
Subject: [PATCH] pageserver: add tenant level performance tracing sampling
 ratio (#11433)

## Problem

https://github.com/neondatabase/neon/pull/11140 introduces performance
tracing with OTEL
and a pageserver config which configures the sampling ratio of get page
requests.

Enabling a non-zero sampling ratio on a per region basis is too
aggressive and comes with perf
impact that isn't very well understood yet.

## Summary of changes

Add a `sampling_ratio` tenant level config which overrides the
pageserver level config.
Note that we do not cache the config and load it on every get page
request such that changes propagate
timely.

Note that I've had to remove the `SHARD_SELECTION` span to get this to
work. The tracing library doesn't
expose a neat way to drop a span if one realises it's not needed at
runtime.

Closes https://github.com/neondatabase/neon/issues/11392
---
 control_plane/src/pageserver.rs               |  5 ++
 libs/pageserver_api/src/config.rs             |  6 +-
 libs/pageserver_api/src/models.rs             | 10 +++
 libs/tracing-utils/src/perf_span.rs           | 11 +--
 pageserver/src/context.rs                     | 13 ----
 pageserver/src/page_service.rs                | 73 +++++--------------
 pageserver/src/tenant/timeline.rs             | 25 +++++++
 .../regress/test_attach_tenant_config.py      |  4 +
 8 files changed, 70 insertions(+), 77 deletions(-)
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index b39acbca4d..591eb3728b 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -545,6 +545,11 @@ impl PageServerNode {
                 .map(|x| x.parse::<u64>())
                 .transpose()
                 .context("Failed to parse 'gc_compaction_ratio_percent' as integer")?,
+            sampling_ratio: settings
+                .remove("sampling_ratio")
+                .map(serde_json::from_str)
+                .transpose()
+                .context("Falied to parse 'sampling_ratio'")?,
         };
         if !settings.is_empty() {
             bail!("Unrecognized tenant settings: {settings:?}")
diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index 66a02b87b0..d0225c8918 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -192,7 +192,7 @@ pub enum GetVectoredConcurrentIo {
     SidecarTask,
 }
 
-#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[derive(Debug, Copy, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub struct Ratio {
     pub numerator: usize,
     pub denominator: usize,
@@ -416,6 +416,9 @@ pub struct TenantConfigToml {
     /// The ratio that triggers the auto gc-compaction. If (the total size of layers between L2 LSN and gc-horizon) / (size below the L2 LSN)
     /// is above this ratio, gc-compaction will be triggered.
     pub gc_compaction_ratio_percent: u64,
+    /// Tenant level performance sampling ratio override. Controls the ratio of get page requests
+    /// that will get perf sampling for the tenant.
+    pub sampling_ratio: Option<Ratio>,
 }
 
 pub mod defaults {
@@ -702,6 +705,7 @@ impl Default for TenantConfigToml {
             gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED,
             gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB,
             gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT,
+            sampling_ratio: None,
         }
     }
 }
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index f2dd3a0ebf..16d9433973 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -23,6 +23,7 @@ use utils::lsn::Lsn;
 use utils::postgres_client::PostgresClientProtocol;
 use utils::{completion, serde_system_time};
 
+use crate::config::Ratio;
 use crate::key::{CompactKey, Key};
 use crate::reltag::RelTag;
 use crate::shard::{ShardCount, ShardStripeSize, TenantShardId};
@@ -568,6 +569,8 @@ pub struct TenantConfigPatch {
     pub gc_compaction_initial_threshold_kb: FieldPatch<u64>,
     #[serde(skip_serializing_if = "FieldPatch::is_noop")]
     pub gc_compaction_ratio_percent: FieldPatch<u64>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub sampling_ratio: FieldPatch<Option<Ratio>>,
 }
 
 /// Like [`crate::config::TenantConfigToml`], but preserves the information
@@ -688,6 +691,9 @@ pub struct TenantConfig {
 
     #[serde(skip_serializing_if = "Option::is_none")]
     pub gc_compaction_ratio_percent: Option<u64>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub sampling_ratio: Option<Option<Ratio>>,
 }
 
 impl TenantConfig {
@@ -730,6 +736,7 @@ impl TenantConfig {
             mut gc_compaction_enabled,
             mut gc_compaction_initial_threshold_kb,
             mut gc_compaction_ratio_percent,
+            mut sampling_ratio,
         } = self;
 
         patch.checkpoint_distance.apply(&mut checkpoint_distance);
@@ -824,6 +831,7 @@ impl TenantConfig {
         patch
             .gc_compaction_ratio_percent
             .apply(&mut gc_compaction_ratio_percent);
+        patch.sampling_ratio.apply(&mut sampling_ratio);
 
         Ok(Self {
             checkpoint_distance,
@@ -860,6 +868,7 @@ impl TenantConfig {
             gc_compaction_enabled,
             gc_compaction_initial_threshold_kb,
             gc_compaction_ratio_percent,
+            sampling_ratio,
         })
     }
 
@@ -961,6 +970,7 @@ impl TenantConfig {
             gc_compaction_ratio_percent: self
                 .gc_compaction_ratio_percent
                 .unwrap_or(global_conf.gc_compaction_ratio_percent),
+            sampling_ratio: self.sampling_ratio.unwrap_or(global_conf.sampling_ratio),
         }
     }
 }
diff --git a/libs/tracing-utils/src/perf_span.rs b/libs/tracing-utils/src/perf_span.rs
index f2ca76a816..16f713c67e 100644
--- a/libs/tracing-utils/src/perf_span.rs
+++ b/libs/tracing-utils/src/perf_span.rs
@@ -28,7 +28,7 @@ use core::{
     task::{Context, Poll},
 };
 use pin_project_lite::pin_project;
-use tracing::{Dispatch, field, span::Span};
+use tracing::{Dispatch, span::Span};
 
 #[derive(Debug, Clone)]
 pub struct PerfSpan {
@@ -49,15 +49,6 @@ impl PerfSpan {
         }
     }
 
-    pub fn record<Q: field::AsField + ?Sized, V: field::Value>(
-        &self,
-        field: &Q,
-        value: V,
-    ) -> &Self {
-        self.inner.record(field, value);
-        self
-    }
-
     pub fn enter(&self) -> PerfSpanEntered {
         if let Some(ref id) = self.inner.id() {
             self.dispatch.enter(id);
diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs
index 279d2daf75..481fdb4ea2 100644
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -572,19 +572,6 @@ impl RequestContext {
         }
     }
 
-    pub(crate) fn perf_span_record<
-        Q: tracing::field::AsField + ?Sized,
-        V: tracing::field::Value,
-    >(
-        &self,
-        field: &Q,
-        value: V,
-    ) {
-        if let Some(span) = &self.perf_span {
-            span.record(field, value);
-        }
-    }
-
     pub(crate) fn has_perf_span(&self) -> bool {
         self.perf_span.is_some()
     }
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 3ebd6d8506..f9bf45bb71 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -18,7 +18,7 @@ use itertools::Itertools;
 use once_cell::sync::OnceCell;
 use pageserver_api::config::{
     PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
-    PageServiceProtocolPipelinedExecutionStrategy, Tracing,
+    PageServiceProtocolPipelinedExecutionStrategy,
 };
 use pageserver_api::key::rel_block_to_key;
 use pageserver_api::models::{
@@ -37,7 +37,6 @@ use postgres_ffi::BLCKSZ;
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use pq_proto::framed::ConnectionError;
 use pq_proto::{BeMessage, FeMessage, FeStartupPacket, RowDescriptor};
-use rand::Rng;
 use strum_macros::IntoStaticStr;
 use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufWriter};
 use tokio::task::JoinHandle;
@@ -755,7 +754,6 @@ impl PageServerHandler {
         tenant_id: TenantId,
         timeline_id: TimelineId,
         timeline_handles: &mut TimelineHandles,
-        tracing_config: Option<&Tracing>,
         cancel: &CancellationToken,
         ctx: &RequestContext,
         protocol_version: PagestreamProtocolVersion,
@@ -916,47 +914,8 @@ impl PageServerHandler {
 
                 let key = rel_block_to_key(req.rel, req.blkno);
 
-                let sampled = match tracing_config {
-                    Some(conf) => {
-                        let ratio = &conf.sampling_ratio;
-
-                        if ratio.numerator == 0 {
-                            false
-                        } else {
-                            rand::thread_rng().gen_range(0..ratio.denominator) < ratio.numerator
-                        }
-                    }
-                    None => false,
-                };
-
-                let ctx = if sampled {
-                    RequestContextBuilder::from(ctx)
-                        .root_perf_span(|| {
-                            info_span!(
-                            target: PERF_TRACE_TARGET,
-                            "GET_PAGE",
-                            tenant_id = %tenant_id,
-                            shard_id = field::Empty,
-                            timeline_id = %timeline_id,
-                            lsn = %req.hdr.request_lsn,
-                            request_id = %req.hdr.reqid,
-                            key = %key,
-                            )
-                        })
-                        .attached_child()
-                } else {
-                    ctx.attached_child()
-                };
-
                 let res = timeline_handles
                     .get(tenant_id, timeline_id, ShardSelector::Page(key))
-                    .maybe_perf_instrument(&ctx, |current_perf_span| {
-                        info_span!(
-                            target: PERF_TRACE_TARGET,
-                            parent: current_perf_span,
-                            "SHARD_SELECTION",
-                        )
-                    })
                     .await;
 
                 let shard = match res {
@@ -987,6 +946,25 @@ impl PageServerHandler {
                     }
                 };
 
+                let ctx = if shard.is_get_page_request_sampled() {
+                    RequestContextBuilder::from(ctx)
+                        .root_perf_span(|| {
+                            info_span!(
+                            target: PERF_TRACE_TARGET,
+                            "GET_PAGE",
+                            tenant_id = %tenant_id,
+                            shard_id = %shard.get_shard_identity().shard_slug(),
+                            timeline_id = %timeline_id,
+                            lsn = %req.hdr.request_lsn,
+                            request_id = %req.hdr.reqid,
+                            key = %key,
+                            )
+                        })
+                        .attached_child()
+                } else {
+                    ctx.attached_child()
+                };
+
                 // This ctx travels as part of the BatchedFeMessage through
                 // batching into the request handler.
                 // The request handler needs to do some per-request work
@@ -1001,12 +979,6 @@ impl PageServerHandler {
                 // request handler log messages contain the request-specific fields.
                 let span = mkspan!(shard.tenant_shard_id.shard_slug());
 
-                // Enrich the perf span with shard_id now that shard routing is done.
-                ctx.perf_span_record(
-                    "shard_id",
-                    tracing::field::display(shard.get_shard_identity().shard_slug()),
-                );
-
                 let timer = record_op_start_and_throttle(
                     &shard,
                     metrics::SmgrQueryType::GetPageAtLsn,
@@ -1602,7 +1574,6 @@ impl PageServerHandler {
         IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static,
     {
         let cancel = self.cancel.clone();
-        let tracing_config = self.conf.tracing.clone();
 
         let err = loop {
             let msg = Self::pagestream_read_message(
@@ -1610,7 +1581,6 @@ impl PageServerHandler {
                 tenant_id,
                 timeline_id,
                 &mut timeline_handles,
-                tracing_config.as_ref(),
                 &cancel,
                 ctx,
                 protocol_version,
@@ -1744,8 +1714,6 @@ impl PageServerHandler {
         // Batcher
         //
 
-        let tracing_config = self.conf.tracing.clone();
-
         let cancel_batcher = self.cancel.child_token();
         let (mut batch_tx, mut batch_rx) = spsc_fold::channel();
         let batcher = pipeline_stage!("batcher", cancel_batcher.clone(), move |cancel_batcher| {
@@ -1759,7 +1727,6 @@ impl PageServerHandler {
                         tenant_id,
                         timeline_id,
                         &mut timeline_handles,
-                        tracing_config.as_ref(),
                         &cancel_batcher,
                         &ctx,
                         protocol_version,
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 74e97653d2..6ca3704bc1 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2476,6 +2476,31 @@ impl Timeline {
             .unwrap_or(self.conf.default_tenant_conf.lazy_slru_download)
     }
 
+    /// Checks if a get page request should get perf tracing
+    ///
+    /// The configuration priority is: tenant config override, default tenant config,
+    /// pageserver config.
+    pub(crate) fn is_get_page_request_sampled(&self) -> bool {
+        let tenant_conf = self.tenant_conf.load();
+        let ratio = tenant_conf
+            .tenant_conf
+            .sampling_ratio
+            .flatten()
+            .or(self.conf.default_tenant_conf.sampling_ratio)
+            .or(self.conf.tracing.as_ref().map(|t| t.sampling_ratio));
+
+        match ratio {
+            Some(r) => {
+                if r.numerator == 0 {
+                    false
+                } else {
+                    rand::thread_rng().gen_range(0..r.denominator) < r.numerator
+                }
+            }
+            None => false,
+        }
+    }
+
     fn get_checkpoint_distance(&self) -> u64 {
         let tenant_conf = self.tenant_conf.load();
         tenant_conf
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index 22dfcbda92..5021cc4b17 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -190,6 +190,10 @@ def test_fully_custom_config(positive_env: NeonEnv):
         "gc_compaction_initial_threshold_kb": 1024000,
         "gc_compaction_ratio_percent": 200,
         "image_creation_preempt_threshold": 5,
+        "sampling_ratio": {
+            "numerator": 0,
+            "denominator": 10,
+        },
     }
 
     vps_http = env.storage_controller.pageserver_api()