From 0c6defd8dafc2454a5bd97ee11387e69a619cf65 Mon Sep 17 00:00:00 2001
From: BodoBolero <peterbendel@neon.tech>
Date: Thu, 17 Apr 2025 17:44:21 +0200
Subject: [PATCH] many metrics removed and still compiles and can be started

---
 pageserver/src/bin/pageserver.rs              |  6 +-
 pageserver/src/metrics.rs                     | 60 +-------------
 pageserver/src/tenant.rs                      | 83 ++-----------------
 pageserver/src/tenant/secondary/downloader.rs |  3 +-
 pageserver/src/tenant/storage_layer/layer.rs  | 36 +-------
 .../walreceiver/connection_manager.rs         | 18 +---
 6 files changed, 14 insertions(+), 192 deletions(-)
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 250d4180f5..0253d8b841 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -20,7 +20,6 @@ use pageserver::config::{PageServerConf, PageserverIdentity, ignored_fields};
 use pageserver::controller_upcall_client::StorageControllerUpcallClient;
 use pageserver::deletion_queue::DeletionQueue;
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
-use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
 use pageserver::task_mgr::{
     BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
 };
@@ -321,10 +320,9 @@ where
     }
 }
 
-fn startup_checkpoint(started_at: Instant, phase: &str, human_phase: &str) {
+fn startup_checkpoint(started_at: Instant, _phase: &str, human_phase: &str) {
     let elapsed = started_at.elapsed();
     let secs = elapsed.as_secs_f64();
-    STARTUP_DURATION.with_label_values(&[phase]).set(secs);
 
     info!(
         elapsed_ms = elapsed.as_millis(),
@@ -502,7 +500,6 @@ fn start_pageserver(
     // Up to this point no significant I/O has been done: this should have been fast.  Record
     // duration prior to starting I/O intensive phase of startup.
     startup_checkpoint(started_startup_at, "initial", "Starting loading tenants");
-    STARTUP_IS_LOADING.set(1);
 
     // Startup staging or optimizing:
     //
@@ -578,7 +575,6 @@ fn start_pageserver(
                     "initial_tenant_load",
                     "Initial load completed",
                 );
-                STARTUP_IS_LOADING.set(0);
             });
 
             let WaitForPhaseResult {
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 9cc6d74824..67f6fcc5ec 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -519,36 +519,6 @@ static DIRECTORY_ENTRIES_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
-pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
-    register_uint_gauge_vec!(
-        "pageserver_tenant_states_count",
-        "Count of tenants per state",
-        &["state"]
-    )
-    .expect("Failed to register pageserver_tenant_states_count metric")
-});
-
-/// A set of broken tenants.
-///
-/// These are expected to be so rare that a set is fine. Set as in a new timeseries per each broken
-/// tenant.
-pub(crate) static BROKEN_TENANTS_SET: Lazy<UIntGaugeVec> = Lazy::new(|| {
-    register_uint_gauge_vec!(
-        "pageserver_broken_tenants_count",
-        "Set of broken tenants",
-        &["tenant_id", "shard_id"]
-    )
-    .expect("Failed to register pageserver_tenant_states_count metric")
-});
-
-pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
-    register_uint_gauge_vec!(
-        "pageserver_tenant_synthetic_cached_size_bytes",
-        "Synthetic size of each tenant in bytes",
-        &["tenant_id"]
-    )
-    .expect("Failed to register pageserver_tenant_synthetic_cached_size_bytes metric")
-});
 
 pub(crate) static EVICTION_ITERATION_DURATION: Lazy<HistogramVec> = Lazy::new(|| {
     register_histogram_vec!(
@@ -1438,14 +1408,6 @@ pub(crate) static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
-pub(crate) static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "pageserver_walreceiver_switches_total",
-        "Number of walreceiver manager change_connection calls",
-        &["reason"]
-    )
-    .expect("failed to define a metric")
-});
 
 pub(crate) static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
     register_int_counter!(
@@ -1778,10 +1740,7 @@ pub(crate) struct TimelineMetrics {
     pub imitate_logical_size_histo: StorageTimeMetrics,
     pub garbage_collect_histo: StorageTimeMetrics,
     pub find_gc_cutoffs_histo: StorageTimeMetrics,
-    pub pitr_history_size: UIntGauge,
-    pub archival_size: UIntGauge,
     pub standby_horizon_gauge: IntGauge,
-    pub visible_physical_size_gauge: UIntGauge,
     /// copy of LayeredTimeline.current_logical_size
     pub current_logical_size_gauge: UIntGauge,
     pub aux_file_size_gauge: IntGauge,
@@ -1844,20 +1803,11 @@ impl TimelineMetrics {
             &timeline_id,
         );
 
-        let pitr_history_size = PITR_HISTORY_SIZE
-            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
-            .unwrap();
-
-        let archival_size = TIMELINE_ARCHIVE_SIZE
-            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
-            .unwrap();
+        
 
         let standby_horizon_gauge = STANDBY_HORIZON
             .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
             .unwrap();
-        let visible_physical_size_gauge = VISIBLE_PHYSICAL_SIZE
-            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
-            .unwrap();
         // TODO: we shouldn't expose this metric
         let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
             .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
@@ -1902,10 +1852,7 @@ impl TimelineMetrics {
             imitate_logical_size_histo,
             garbage_collect_histo,
             find_gc_cutoffs_histo,
-            pitr_history_size,
-            archival_size,
             standby_horizon_gauge,
-            visible_physical_size_gauge,
             current_logical_size_gauge,
             aux_file_size_gauge,
             directory_entries_count_gauge,
@@ -2066,11 +2013,6 @@ impl TimelineMetrics {
 }
 
 pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
-    // Only shard zero deals in synthetic sizes
-    if tenant_shard_id.is_shard_zero() {
-        let tid = tenant_shard_id.tenant_id.to_string();
-        let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
-    }
 
     tenant_throttling::remove_tenant_metrics(tenant_shard_id);
 
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 7ef3dae446..a9622b36c1 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -84,9 +84,9 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
 use crate::l0_flush::L0FlushGlobalState;
 use crate::metrics::{
-    BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, CONCURRENT_INITDBS,
-    INITDB_RUN_TIME, INITDB_SEMAPHORE_ACQUISITION_TIME, TENANT, TENANT_STATE_METRIC,
-    TENANT_SYNTHETIC_SIZE_METRIC, remove_tenant_metrics,
+    CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, CONCURRENT_INITDBS,
+    INITDB_RUN_TIME, INITDB_SEMAPHORE_ACQUISITION_TIME, TENANT, 
+    remove_tenant_metrics,
 };
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::LocationMode;
@@ -4223,55 +4223,15 @@ impl Tenant {
         let (state, mut rx) = watch::channel(state);
 
         tokio::spawn(async move {
-            // reflect tenant state in metrics:
-            // - global per tenant state: TENANT_STATE_METRIC
-            // - "set" of broken tenants: BROKEN_TENANTS_SET
-            //
-            // set of broken tenants should not have zero counts so that it remains accessible for
-            // alerting.
-
-            let tid = tenant_shard_id.to_string();
-            let shard_id = tenant_shard_id.shard_slug().to_string();
-            let set_key = &[tid.as_str(), shard_id.as_str()][..];
-
-            fn inspect_state(state: &TenantState) -> ([&'static str; 1], bool) {
-                ([state.into()], matches!(state, TenantState::Broken { .. }))
-            }
-
-            let mut tuple = inspect_state(&rx.borrow_and_update());
-
-            let is_broken = tuple.1;
-            let mut counted_broken = if is_broken {
-                // add the id to the set right away, there should not be any updates on the channel
-                // after before tenant is removed, if ever
-                BROKEN_TENANTS_SET.with_label_values(set_key).set(1);
-                true
-            } else {
-                false
-            };
-
+            
             loop {
-                let labels = &tuple.0;
-                let current = TENANT_STATE_METRIC.with_label_values(labels);
-                current.inc();
+
 
                 if rx.changed().await.is_err() {
-                    // tenant has been dropped
-                    current.dec();
-                    drop(BROKEN_TENANTS_SET.remove_label_values(set_key));
+                   
                     break;
                 }
 
-                current.dec();
-                tuple = inspect_state(&rx.borrow_and_update());
-
-                let is_broken = tuple.1;
-                if is_broken && !counted_broken {
-                    counted_broken = true;
-                    // insert the tenant_id (back) into the set while avoiding needless counter
-                    // access
-                    BROKEN_TENANTS_SET.with_label_values(set_key).set(1);
-                }
             }
         });
 
@@ -4655,22 +4615,6 @@ impl Tenant {
                     }
                 }
 
-                // Update metrics that depend on GC state
-                timeline
-                    .metrics
-                    .archival_size
-                    .set(if target.within_ancestor_pitr {
-                        timeline.metrics.current_logical_size_gauge.get()
-                    } else {
-                        0
-                    });
-                timeline.metrics.pitr_history_size.set(
-                    timeline
-                        .get_last_record_lsn()
-                        .checked_sub(target.cutoffs.time)
-                        .unwrap_or(Lsn(0))
-                        .0,
-                );
 
                 // Apply the cutoffs we found to the Timeline's GcInfo.  Why might we _not_ have cutoffs for a timeline?
                 // - this timeline was created while we were finding cutoffs
@@ -5420,10 +5364,6 @@ impl Tenant {
         // Only shard zero should be calculating synthetic sizes
         debug_assert!(self.shard_identity.is_shard_zero());
 
-        TENANT_SYNTHETIC_SIZE_METRIC
-            .get_metric_with_label_values(&[&self.tenant_shard_id.tenant_id.to_string()])
-            .unwrap()
-            .set(size);
     }
 
     pub fn cached_synthetic_size(&self) -> u64 {
@@ -5500,16 +5440,7 @@ impl Tenant {
     /// than they report here, due to layer eviction.  Tenants with many active branches may
     /// actually use more than they report here.
     pub(crate) fn local_storage_wanted(&self) -> u64 {
-        let timelines = self.timelines.lock().unwrap();
-
-        // Heuristic: we use the max() of the timelines' visible sizes, rather than the sum.  This
-        // reflects the observation that on tenants with multiple large branches, typically only one
-        // of them is used actively enough to occupy space on disk.
-        timelines
-            .values()
-            .map(|t| t.metrics.visible_physical_size_gauge.get())
-            .max()
-            .unwrap_or(0)
+        1000
     }
 
     /// Builds a new tenant manifest, and uploads it if it differs from the last-known tenant
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index 60cf7ac79e..d0ac49b1a8 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -191,7 +191,7 @@ impl SecondaryDetailTimeline {
         tenant_shard_id: &TenantShardId,
         timeline_id: &TimelineId,
         touched: &HeatMapLayer,
-        resident_metric: &UIntGauge,
+        _resident_metric: &UIntGauge,
         local_path: F,
     ) where
         F: FnOnce() -> Utf8PathBuf,
@@ -211,7 +211,6 @@ impl SecondaryDetailTimeline {
                     touched.access_time,
                     local_path(),
                 ));
-                resident_metric.add(touched.metadata.file_size);
             }
         }
     }
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 9e24327695..1b68fe2d0e 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -526,12 +526,6 @@ impl Layer {
                 }
             }
 
-            // Update the timeline's visible bytes count
-            if let Some(tl) = self.0.timeline.upgrade() {
-                tl.metrics
-                    .visible_physical_size_gauge
-                    .add(self.0.desc.file_size)
-            }
         }
     }
 
@@ -540,23 +534,10 @@ impl Layer {
         use LayerVisibilityHint::*;
         match (old_visibility, visibility) {
             (Visible, Covered) => {
-                // Subtract this layer's contribution to the visible size metric
-                if let Some(tl) = self.0.timeline.upgrade() {
-                    debug_assert!(
-                        tl.metrics.visible_physical_size_gauge.get() >= self.0.desc.file_size
-                    );
-                    tl.metrics
-                        .visible_physical_size_gauge
-                        .sub(self.0.desc.file_size)
-                }
+                
             }
             (Covered, Visible) => {
-                // Add this layer's contribution to the visible size metric
-                if let Some(tl) = self.0.timeline.upgrade() {
-                    tl.metrics
-                        .visible_physical_size_gauge
-                        .add(self.0.desc.file_size)
-                }
+                
             }
             (Covered, Covered) | (Visible, Visible) => {
                 // no change
@@ -759,13 +740,6 @@ impl Drop for LayerInner {
             timeline.metrics.dec_layer(&self.desc);
 
             if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) {
-                debug_assert!(
-                    timeline.metrics.visible_physical_size_gauge.get() >= self.desc.file_size
-                );
-                timeline
-                    .metrics
-                    .visible_physical_size_gauge
-                    .sub(self.desc.file_size);
             }
         }
 
@@ -868,12 +842,6 @@ impl LayerInner {
         // This object acts as a RAII guard on these metrics: increment on construction
         timeline.metrics.inc_layer(&desc);
 
-        // New layers are visible by default. This metric is later updated on drop or in set_visibility
-        timeline
-            .metrics
-            .visible_physical_size_gauge
-            .add(desc.file_size);
-
         LayerInner {
             conf,
             path: local_path,
diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
index 3c3608d1bd..297dec80b1 100644
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -41,7 +41,7 @@ use super::{TaskEvent, TaskHandle, TaskStateUpdate, WalReceiverConf};
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::metrics::{
     WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED,
-    WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES,
+    WALRECEIVER_CANDIDATES_REMOVED, 
 };
 use crate::task_mgr::TaskKind;
 use crate::tenant::{Timeline, debug_assert_current_span_has_tenant_and_timeline_id};
@@ -526,9 +526,6 @@ impl ConnectionManagerState {
 
     /// Shuts down the current connection (if any) and immediately starts another one with the given connection string.
     async fn change_connection(&mut self, new_sk: NewWalConnectionCandidate, ctx: &RequestContext) {
-        WALRECEIVER_SWITCHES
-            .with_label_values(&[new_sk.reason.name()])
-            .inc();
 
         self.drop_old_connection(true).await;
 
@@ -1078,6 +1075,7 @@ struct NewWalConnectionCandidate {
     safekeeper_id: NodeId,
     wal_source_connconf: PgConnectionConfig,
     availability_zone: Option<String>,
+    #[allow(dead_code)]
     reason: ReconnectReason,
 }
 
@@ -1106,18 +1104,6 @@ enum ReconnectReason {
     },
 }
 
-impl ReconnectReason {
-    fn name(&self) -> &str {
-        match self {
-            ReconnectReason::NoExistingConnection => "NoExistingConnection",
-            ReconnectReason::LaggingWal { .. } => "LaggingWal",
-            ReconnectReason::SwitchAvailabilityZone => "SwitchAvailabilityZone",
-            ReconnectReason::NoWalTimeout { .. } => "NoWalTimeout",
-            ReconnectReason::NoKeepAlives { .. } => "NoKeepAlives",
-        }
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use pageserver_api::config::defaults::DEFAULT_WAL_RECEIVER_PROTOCOL;