From 294b8a8fde50418090c8fcb2b5c1c47e8cc11557 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 25 Jul 2023 00:43:27 +0300 Subject: [PATCH] Convert per timeline metrics to global (#4769) Cut down the per-(tenant, timeline) histograms by making them global: - `pageserver_getpage_get_reconstruct_data_seconds` - `pageserver_read_num_fs_layers` - `pageserver_remote_operation_seconds` - `pageserver_remote_timeline_client_calls_started` - `pageserver_wait_lsn_seconds` - `pageserver_io_operations_seconds` --------- Co-authored-by: Shany Pozin --- pageserver/src/metrics.rs | 118 +++++------------- pageserver/src/tenant/timeline.rs | 9 +- pageserver/src/virtual_file.rs | 26 ++-- test_runner/fixtures/metrics.py | 25 ++-- test_runner/regress/test_gc_aggressive.py | 2 - test_runner/regress/test_metric_collection.py | 2 - test_runner/regress/test_ondemand_download.py | 70 +++++------ test_runner/regress/test_remote_storage.py | 52 ++++---- test_runner/regress/test_tenants.py | 3 + 9 files changed, 126 insertions(+), 181 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index ee8dfba69a..bdd79711ec 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -84,11 +84,10 @@ pub static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -static READ_NUM_FS_LAYERS: Lazy = Lazy::new(|| { - register_histogram_vec!( +pub(crate) static READ_NUM_FS_LAYERS: Lazy = Lazy::new(|| { + register_histogram!( "pageserver_read_num_fs_layers", "Number of persistent layers accessed for processing a read request, including those in the cache", - &["tenant_id", "timeline_id"], vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0], ) .expect("failed to define a metric") @@ -112,11 +111,10 @@ pub static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -static GET_RECONSTRUCT_DATA_TIME: Lazy = Lazy::new(|| { - register_histogram_vec!( +pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy = Lazy::new(|| { + register_histogram!( "pageserver_getpage_get_reconstruct_data_seconds", "Time spent in get_reconstruct_value_data", - &["tenant_id", "timeline_id"], CRITICAL_OP_BUCKETS.into(), ) .expect("failed to define a metric") @@ -246,11 +244,10 @@ pub static PAGE_CACHE_SIZE: Lazy = Lazy::new(|| PageCacheS }, }); -static WAIT_LSN_TIME: Lazy = Lazy::new(|| { - register_histogram_vec!( +pub(crate) static WAIT_LSN_TIME: Lazy = Lazy::new(|| { + register_histogram!( "pageserver_wait_lsn_seconds", "Time spent waiting for WAL to arrive", - &["tenant_id", "timeline_id"], CRITICAL_OP_BUCKETS.into(), ) .expect("failed to define a metric") @@ -499,23 +496,31 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[ 30.000, // 30000 ms ]; -const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[ - "open", "close", "read", "write", "seek", "fsync", "gc", "metadata", -]; - -const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"]; - -pub static STORAGE_IO_TIME: Lazy = Lazy::new(|| { +/// Tracks time taken by fs operations near VirtualFile. 
+///
+/// Operations:
+/// - open ([`std::fs::OpenOptions::open`])
+/// - close (dropping [`std::fs::File`])
+/// - close-by-replace (close forced by the file descriptor cache's replacement algorithm)
+/// - read (`read_at`)
+/// - write (`write_at`)
+/// - seek (modify the internal position, or query the file length)
+/// - fsync ([`std::fs::File::sync_all`])
+/// - metadata ([`std::fs::File::metadata`])
+pub(crate) static STORAGE_IO_TIME: Lazy<HistogramVec> = Lazy::new(|| {
     register_histogram_vec!(
         "pageserver_io_operations_seconds",
         "Time spent in IO operations",
-        &["operation", "tenant_id", "timeline_id"],
+        &["operation"],
         STORAGE_IO_TIME_BUCKETS.into()
     )
     .expect("failed to define a metric")
 });
 
-pub static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
+const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
+
+// Needed for the https://neonprod.grafana.net/d/5uK9tHL4k/picking-tenant-for-relocation?orgId=1 dashboard.
+pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
     register_int_gauge_vec!(
         "pageserver_io_operations_bytes_total",
         "Total amount of bytes read/written in IO operations",
@@ -605,7 +610,7 @@ static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new
            at a given instant. It gives you a better idea of the queue depth \
            than plotting the gauge directly, since operations may complete faster \
            than the sampling interval.",
-        &["tenant_id", "timeline_id", "file_kind", "op_kind"],
+        &["file_kind", "op_kind"],
         // The calls_unfinished gauge is an integer gauge, hence we have integer buckets.
         vec![0.0, 1.0, 2.0, 4.0, 6.0, 8.0, 10.0, 15.0, 20.0, 40.0, 60.0, 80.0, 100.0, 500.0],
     )
@@ -662,13 +667,13 @@ impl RemoteOpFileKind {
     }
 }
 
-pub static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+pub(crate) static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
     register_histogram_vec!(
         "pageserver_remote_operation_seconds",
         "Time spent on remote storage operations. \
         Grouped by tenant, timeline, operation_kind and status. 
\ Does not account for time spent waiting in remote timeline client's queues.", - &["tenant_id", "timeline_id", "file_kind", "op_kind", "status"] + &["file_kind", "op_kind", "status"] ) .expect("failed to define a metric") }); @@ -897,7 +902,6 @@ impl StorageTimeMetrics { pub struct TimelineMetrics { tenant_id: String, timeline_id: String, - pub get_reconstruct_data_time_histo: Histogram, pub flush_time_histo: StorageTimeMetrics, pub compact_time_histo: StorageTimeMetrics, pub create_images_time_histo: StorageTimeMetrics, @@ -906,9 +910,7 @@ pub struct TimelineMetrics { pub load_layer_map_histo: StorageTimeMetrics, pub garbage_collect_histo: StorageTimeMetrics, pub last_record_gauge: IntGauge, - pub wait_lsn_time_histo: Histogram, pub resident_physical_size_gauge: UIntGauge, - pub read_num_fs_layers: Histogram, /// copy of LayeredTimeline.current_logical_size pub current_logical_size_gauge: UIntGauge, pub num_persistent_files_created: IntCounter, @@ -925,9 +927,6 @@ impl TimelineMetrics { ) -> Self { let tenant_id = tenant_id.to_string(); let timeline_id = timeline_id.to_string(); - let get_reconstruct_data_time_histo = GET_RECONSTRUCT_DATA_TIME - .get_metric_with_label_values(&[&tenant_id, &timeline_id]) - .unwrap(); let flush_time_histo = StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id); let compact_time_histo = @@ -948,9 +947,6 @@ impl TimelineMetrics { let last_record_gauge = LAST_RECORD_LSN .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); - let wait_lsn_time_histo = WAIT_LSN_TIME - .get_metric_with_label_values(&[&tenant_id, &timeline_id]) - .unwrap(); let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); @@ -966,16 +962,12 @@ impl TimelineMetrics { let evictions = EVICTIONS .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); - let read_num_fs_layers = READ_NUM_FS_LAYERS - .get_metric_with_label_values(&[&tenant_id, &timeline_id]) - .unwrap(); let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id); TimelineMetrics { tenant_id, timeline_id, - get_reconstruct_data_time_histo, flush_time_histo, compact_time_histo, create_images_time_histo, @@ -984,7 +976,6 @@ impl TimelineMetrics { garbage_collect_histo, load_layer_map_histo, last_record_gauge, - wait_lsn_time_histo, resident_physical_size_gauge, current_logical_size_gauge, num_persistent_files_created, @@ -993,7 +984,6 @@ impl TimelineMetrics { evictions_with_low_residence_duration: std::sync::RwLock::new( evictions_with_low_residence_duration, ), - read_num_fs_layers, } } } @@ -1002,15 +992,12 @@ impl Drop for TimelineMetrics { fn drop(&mut self) { let tenant_id = &self.tenant_id; let timeline_id = &self.timeline_id; - let _ = GET_RECONSTRUCT_DATA_TIME.remove_label_values(&[tenant_id, timeline_id]); let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]); - let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]); let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]); let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]); let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]); - let _ = READ_NUM_FS_LAYERS.remove_label_values(&[tenant_id, timeline_id]); 
self.evictions_with_low_residence_duration .write() @@ -1022,9 +1009,6 @@ impl Drop for TimelineMetrics { let _ = STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]); } - for op in STORAGE_IO_TIME_OPERATIONS { - let _ = STORAGE_IO_TIME.remove_label_values(&[op, tenant_id, timeline_id]); - } for op in STORAGE_IO_SIZE_OPERATIONS { let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, timeline_id]); @@ -1056,9 +1040,7 @@ pub struct RemoteTimelineClientMetrics { tenant_id: String, timeline_id: String, remote_physical_size_gauge: Mutex>, - remote_operation_time: Mutex>, calls_unfinished_gauge: Mutex>, - calls_started_hist: Mutex>, bytes_started_counter: Mutex>, bytes_finished_counter: Mutex>, } @@ -1068,14 +1050,13 @@ impl RemoteTimelineClientMetrics { RemoteTimelineClientMetrics { tenant_id: tenant_id.to_string(), timeline_id: timeline_id.to_string(), - remote_operation_time: Mutex::new(HashMap::default()), calls_unfinished_gauge: Mutex::new(HashMap::default()), - calls_started_hist: Mutex::new(HashMap::default()), bytes_started_counter: Mutex::new(HashMap::default()), bytes_finished_counter: Mutex::new(HashMap::default()), remote_physical_size_gauge: Mutex::new(None), } } + pub fn remote_physical_size_gauge(&self) -> UIntGauge { let mut guard = self.remote_physical_size_gauge.lock().unwrap(); guard @@ -1089,26 +1070,17 @@ impl RemoteTimelineClientMetrics { }) .clone() } + pub fn remote_operation_time( &self, file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind, status: &'static str, ) -> Histogram { - let mut guard = self.remote_operation_time.lock().unwrap(); let key = (file_kind.as_str(), op_kind.as_str(), status); - let metric = guard.entry(key).or_insert_with(move || { - REMOTE_OPERATION_TIME - .get_metric_with_label_values(&[ - &self.tenant_id.to_string(), - &self.timeline_id.to_string(), - key.0, - key.1, - key.2, - ]) - .unwrap() - }); - metric.clone() + REMOTE_OPERATION_TIME + .get_metric_with_label_values(&[key.0, key.1, key.2]) + .unwrap() } fn calls_unfinished_gauge( @@ -1136,19 +1108,10 @@ impl RemoteTimelineClientMetrics { file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind, ) -> Histogram { - let mut guard = self.calls_started_hist.lock().unwrap(); let key = (file_kind.as_str(), op_kind.as_str()); - let metric = guard.entry(key).or_insert_with(move || { - REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST - .get_metric_with_label_values(&[ - &self.tenant_id.to_string(), - &self.timeline_id.to_string(), - key.0, - key.1, - ]) - .unwrap() - }); - metric.clone() + REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST + .get_metric_with_label_values(&[key.0, key.1]) + .unwrap() } fn bytes_started_counter( @@ -1328,15 +1291,10 @@ impl Drop for RemoteTimelineClientMetrics { tenant_id, timeline_id, remote_physical_size_gauge, - remote_operation_time, calls_unfinished_gauge, - calls_started_hist, bytes_started_counter, bytes_finished_counter, } = self; - for ((a, b, c), _) in remote_operation_time.get_mut().unwrap().drain() { - let _ = REMOTE_OPERATION_TIME.remove_label_values(&[tenant_id, timeline_id, a, b, c]); - } for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() { let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[ tenant_id, @@ -1345,14 +1303,6 @@ impl Drop for RemoteTimelineClientMetrics { b, ]); } - for ((a, b), _) in calls_started_hist.get_mut().unwrap().drain() { - let _ = REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST.remove_label_values(&[ - tenant_id, - timeline_id, - a, - b, - ]); - } for ((a, b), _) in 
bytes_started_counter.get_mut().unwrap().drain() { let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[ tenant_id, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index eceb54efc5..3413282c5d 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -475,7 +475,7 @@ impl Timeline { img: cached_page_img, }; - let timer = self.metrics.get_reconstruct_data_time_histo.start_timer(); + let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME.start_timer(); self.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx) .await?; timer.stop_and_record(); @@ -555,7 +555,7 @@ impl Timeline { "wait_lsn cannot be called in WAL receiver" ); - let _timer = self.metrics.wait_lsn_time_histo.start_timer(); + let _timer = crate::metrics::WAIT_LSN_TIME.start_timer(); match self .last_record_lsn @@ -2252,8 +2252,9 @@ impl Timeline { let mut timeline_owned; let mut timeline = self; - let mut read_count = - scopeguard::guard(0, |cnt| self.metrics.read_num_fs_layers.observe(cnt as f64)); + let mut read_count = scopeguard::guard(0, |cnt| { + crate::metrics::READ_NUM_FS_LAYERS.observe(cnt as f64) + }); // For debugging purposes, collect the path of layers that we traversed // through. It's included in the error message if we fail to find the key. diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index b86d14f158..65b4363f60 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -149,12 +149,10 @@ impl OpenFiles { // old file. // if let Some(old_file) = slot_guard.file.take() { - // We do not have information about tenant_id/timeline_id of evicted file. - // It is possible to store path together with file or use filepath crate, - // but as far as close() is not expected to be fast, it is not so critical to gather - // precise per-tenant statistic here. + // the normal path of dropping VirtualFile uses "close", use "close-by-replace" here to + // distinguish the two. STORAGE_IO_TIME - .with_label_values(&["close", "-", "-"]) + .with_label_values(&["close-by-replace"]) .observe_closure_duration(|| drop(old_file)); } @@ -208,7 +206,7 @@ impl VirtualFile { } let (handle, mut slot_guard) = get_open_files().find_victim_slot(); let file = STORAGE_IO_TIME - .with_label_values(&["open", &tenant_id, &timeline_id]) + .with_label_values(&["open"]) .observe_closure_duration(|| open_options.open(path))?; // Strip all options other than read and write. @@ -271,7 +269,7 @@ impl VirtualFile { // Found a cached file descriptor. 
slot.recently_used.store(true, Ordering::Relaxed);
                 return Ok(STORAGE_IO_TIME
-                    .with_label_values(&[op, &self.tenant_id, &self.timeline_id])
+                    .with_label_values(&[op])
                     .observe_closure_duration(|| func(file)));
             }
         }
@@ -298,12 +296,12 @@ impl VirtualFile {
 
         // Open the physical file
         let file = STORAGE_IO_TIME
-            .with_label_values(&["open", &self.tenant_id, &self.timeline_id])
+            .with_label_values(&["open"])
             .observe_closure_duration(|| self.open_options.open(&self.path))?;
 
         // Perform the requested operation on it
         let result = STORAGE_IO_TIME
-            .with_label_values(&[op, &self.tenant_id, &self.timeline_id])
+            .with_label_values(&[op])
             .observe_closure_duration(|| func(&file));
 
         // Store the File in the slot and update the handle in the VirtualFile
@@ -333,13 +331,11 @@ impl Drop for VirtualFile {
         let mut slot_guard = slot.inner.write().unwrap();
         if slot_guard.tag == handle.tag {
             slot.recently_used.store(false, Ordering::Relaxed);
-            // Unlike files evicted by replacement algorithm, here
-            // we group close time by tenant_id/timeline_id.
-            // At allows to compare number/time of "normal" file closes
-            // with file eviction.
+            // There is also a "close-by-replace" operation, recorded when a file
+            // is closed on eviction, so the two close paths can be compared.
             STORAGE_IO_TIME
-                .with_label_values(&["close", &self.tenant_id, &self.timeline_id])
-                .observe_closure_duration(|| slot_guard.file.take());
+                .with_label_values(&["close"])
+                .observe_closure_duration(|| drop(slot_guard.file.take()));
         }
     }
 }
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index 3f87aa10a3..ca468cc02c 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -40,10 +40,13 @@ def parse_metrics(text: str, name: str = "") -> Metrics:
     return metrics
 
 
+def histogram(prefix_without_trailing_underscore: str) -> List[str]:
+    assert not prefix_without_trailing_underscore.endswith("_")
+    return [f"{prefix_without_trailing_underscore}_{x}" for x in ["bucket", "count", "sum"]]
+
+
 PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = (
     "pageserver_remote_timeline_client_calls_unfinished",
-    *[f"pageserver_remote_timeline_client_calls_started_{x}" for x in ["bucket", "count", "sum"]],
-    *[f"pageserver_remote_operation_seconds_{x}" for x in ["bucket", "count", "sum"]],
     "pageserver_remote_physical_size",
     "pageserver_remote_timeline_client_bytes_started_total",
     "pageserver_remote_timeline_client_bytes_finished_total",
@@ -67,30 +70,24 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
     "pageserver_getpage_reconstruct_seconds_count",
     "pageserver_getpage_reconstruct_seconds_sum",
     *[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]],
+    *histogram("pageserver_read_num_fs_layers"),
+    *histogram("pageserver_getpage_get_reconstruct_data_seconds"),
+    *histogram("pageserver_wait_lsn_seconds"),
+    *histogram("pageserver_remote_operation_seconds"),
+    *histogram("pageserver_remote_timeline_client_calls_started"),
+    *histogram("pageserver_io_operations_seconds"),
 )
 
 PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] 
= ( "pageserver_current_logical_size", "pageserver_resident_physical_size", - "pageserver_getpage_get_reconstruct_data_seconds_bucket", - "pageserver_getpage_get_reconstruct_data_seconds_count", - "pageserver_getpage_get_reconstruct_data_seconds_sum", "pageserver_io_operations_bytes_total", - "pageserver_io_operations_seconds_bucket", - "pageserver_io_operations_seconds_count", - "pageserver_io_operations_seconds_sum", "pageserver_last_record_lsn", - "pageserver_read_num_fs_layers_bucket", - "pageserver_read_num_fs_layers_count", - "pageserver_read_num_fs_layers_sum", "pageserver_smgr_query_seconds_bucket", "pageserver_smgr_query_seconds_count", "pageserver_smgr_query_seconds_sum", "pageserver_storage_operations_seconds_count_total", "pageserver_storage_operations_seconds_sum_total", - "pageserver_wait_lsn_seconds_bucket", - "pageserver_wait_lsn_seconds_count", - "pageserver_wait_lsn_seconds_sum", "pageserver_created_persistent_files_total", "pageserver_written_persistent_bytes_total", "pageserver_tenant_states_count", diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py index 18f506cfce..e26b1a980e 100644 --- a/test_runner/regress/test_gc_aggressive.py +++ b/test_runner/regress/test_gc_aggressive.py @@ -136,8 +136,6 @@ def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind: for sample in ps_metrics.query_all( name="pageserver_remote_operation_seconds_count", filter={ - "tenant_id": str(tenant_id), - "timeline_id": str(timeline_id), "file_kind": str(file_kind), "op_kind": str(op_kind), }, diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py index 12e695bcbd..61c8f800f8 100644 --- a/test_runner/regress/test_metric_collection.py +++ b/test_runner/regress/test_metric_collection.py @@ -140,8 +140,6 @@ def test_metric_collection( for sample in ps_metrics.query_all( name="pageserver_remote_operation_seconds_count", filter={ - "tenant_id": str(tenant_id), - "timeline_id": str(timeline_id), "file_kind": str(file_kind), "op_kind": str(op_kind), }, diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index a30f0f02e1..b92981a371 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -27,15 +27,16 @@ from fixtures.types import Lsn from fixtures.utils import query_scalar, wait_until -def get_num_downloaded_layers(client: PageserverHttpClient, tenant_id, timeline_id): +def get_num_downloaded_layers(client: PageserverHttpClient): + """ + This assumes that the pageserver only has a single tenant. + """ value = client.get_metric_value( "pageserver_remote_operation_seconds_count", { "file_kind": "layer", "op_kind": "download", "status": "success", - "tenant_id": tenant_id, - "timeline_id": timeline_id, }, ) if value is None: @@ -57,7 +58,8 @@ def test_ondemand_download_large_rel( test_name="test_ondemand_download_large_rel", ) - ##### First start, insert secret data and upload it to the remote storage + # thinking about using a shared environment? the test assumes that global + # metrics are for single tenant. env = neon_env_builder.init_start( initial_tenant_conf={ # disable background GC @@ -129,7 +131,7 @@ def test_ondemand_download_large_rel( # safekeepers, that have now been shut down. 
endpoint = env.endpoints.create_start("main", lsn=current_lsn) - before_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id) + before_downloads = get_num_downloaded_layers(client) assert before_downloads != 0, "basebackup should on-demand non-zero layers" # Probe in the middle of the table. There's a high chance that the beginning @@ -140,7 +142,7 @@ def test_ondemand_download_large_rel( with endpoint.cursor() as cur: assert query_scalar(cur, "select count(*) from tbl where id = 500000") == 1 - after_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id) + after_downloads = get_num_downloaded_layers(client) log.info(f"layers downloaded before {before_downloads} and after {after_downloads}") assert after_downloads > before_downloads @@ -159,13 +161,11 @@ def test_ondemand_download_timetravel( test_name="test_ondemand_download_timetravel", ) - ##### First start, insert data and upload it to the remote storage - env = neon_env_builder.init_start() - pageserver_http = env.pageserver.http_client() + # thinking about using a shared environment? the test assumes that global + # metrics are for single tenant. - # Override defaults, to create more layers - tenant, _ = env.neon_cli.create_tenant( - conf={ + env = neon_env_builder.init_start( + initial_tenant_conf={ # Disable background GC & compaction # We don't want GC, that would break the assertion about num downloads. # We don't want background compaction, we force a compaction every time we do explicit checkpoint. @@ -178,7 +178,7 @@ def test_ondemand_download_timetravel( "compaction_target_size": f"{1 * 1024 ** 2}", # 1 MB } ) - env.initial_tenant = tenant + pageserver_http = env.pageserver.http_client() endpoint = env.endpoints.create_start("main") @@ -283,7 +283,7 @@ def test_ondemand_download_timetravel( == table_len ) - after_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id) + after_downloads = get_num_downloaded_layers(client) num_layers_downloaded.append(after_downloads) log.info(f"num_layers_downloaded[-1]={num_layers_downloaded[-1]}") @@ -324,11 +324,8 @@ def test_download_remote_layers_api( ) ##### First start, insert data and upload it to the remote storage - env = neon_env_builder.init_start() - - # Override defaults, to create more layers - tenant, _ = env.neon_cli.create_tenant( - conf={ + env = neon_env_builder.init_start( + initial_tenant_conf={ # Disable background GC & compaction # We don't want GC, that would break the assertion about num downloads. # We don't want background compaction, we force a compaction every time we do explicit checkpoint. 
@@ -341,7 +338,6 @@ def test_download_remote_layers_api( "compaction_target_size": f"{1 * 1024 ** 2}", # 1 MB } ) - env.initial_tenant = tenant endpoint = env.endpoints.create_start("main") @@ -489,8 +485,6 @@ def test_compaction_downloads_on_demand_without_image_creation( test_name="test_compaction_downloads_on_demand_without_image_creation", ) - env = neon_env_builder.init_start() - conf = { # Disable background GC & compaction "gc_period": "0s", @@ -506,6 +500,8 @@ def test_compaction_downloads_on_demand_without_image_creation( # pitr_interval and gc_horizon are not interesting because we dont run gc } + env = neon_env_builder.init_start(initial_tenant_conf=stringify(conf)) + def downloaded_bytes_and_count(pageserver_http: PageserverHttpClient) -> Tuple[int, int]: m = pageserver_http.get_metrics() # these are global counters @@ -517,11 +513,12 @@ def test_compaction_downloads_on_demand_without_image_creation( assert count < 2**53 and count.is_integer(), "count should still be safe integer-in-f64" return (int(total_bytes), int(count)) - # Override defaults, to create more layers - tenant_id, timeline_id = env.neon_cli.create_tenant(conf=stringify(conf)) - env.initial_tenant = tenant_id pageserver_http = env.pageserver.http_client() + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + assert timeline_id is not None + with env.endpoints.create_start("main") as endpoint: # no particular reason to create the layers like this, but we are sure # not to hit the image_creation_threshold here. @@ -577,8 +574,6 @@ def test_compaction_downloads_on_demand_with_image_creation( test_name="test_compaction_downloads_on_demand", ) - env = neon_env_builder.init_start() - conf = { # Disable background GC & compaction "gc_period": "0s", @@ -593,9 +588,11 @@ def test_compaction_downloads_on_demand_with_image_creation( # pitr_interval and gc_horizon are not interesting because we dont run gc } - # Override defaults, to create more layers - tenant_id, timeline_id = env.neon_cli.create_tenant(conf=stringify(conf)) - env.initial_tenant = tenant_id + env = neon_env_builder.init_start(initial_tenant_conf=stringify(conf)) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + assert timeline_id is not None + pageserver_http = env.pageserver.http_client() endpoint = env.endpoints.create_start("main") @@ -664,10 +661,6 @@ def test_compaction_downloads_on_demand_with_image_creation( assert dict(kinds_after) == {"Delta": 4, "Image": 1} -def stringify(conf: Dict[str, Any]) -> Dict[str, str]: - return dict(map(lambda x: (x[0], str(x[1])), conf.items())) - - @pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) def test_ondemand_download_failure_to_replace( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind @@ -691,9 +684,10 @@ def test_ondemand_download_failure_to_replace( env = neon_env_builder.init_start() - tenant_id, timeline_id = env.neon_cli.create_tenant() + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + assert timeline_id is not None - env.initial_tenant = tenant_id pageserver_http = env.pageserver.http_client() lsn = Lsn(pageserver_http.timeline_detail(tenant_id, timeline_id)["last_record_lsn"]) @@ -724,3 +718,7 @@ def test_ondemand_download_failure_to_replace( env.pageserver.allowed_errors.append(".* ERROR .*Task 'initial size calculation'") # if the above returned, then we didn't have a livelock, and all is well + + +def stringify(conf: Dict[str, Any]) -> Dict[str, str]: + return dict(map(lambda x: (x[0], 
str(x[1])), conf.items())) diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index f1575ae4d3..7c04ed9017 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -378,12 +378,10 @@ def test_remote_timeline_client_calls_started_metric( test_name="test_remote_timeline_client_metrics", ) - env = neon_env_builder.init_start() - - # create tenant with config that will determinstically allow - # compaction and gc - tenant_id, timeline_id = env.neon_cli.create_tenant( - conf={ + # thinking about using a shared environment? the test assumes that global + # metrics are for single tenant. + env = neon_env_builder.init_start( + initial_tenant_conf={ # small checkpointing and compaction targets to ensure we generate many upload operations "checkpoint_distance": f"{128 * 1024}", "compaction_threshold": "1", @@ -398,6 +396,10 @@ def test_remote_timeline_client_calls_started_metric( } ) + tenant_id = env.initial_tenant + assert env.initial_timeline is not None + timeline_id: TimelineId = env.initial_timeline + client = env.pageserver.http_client() endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) @@ -419,6 +421,7 @@ def test_remote_timeline_client_calls_started_metric( "VACUUM foo", ] ) + assert timeline_id is not None wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) calls_started: Dict[Tuple[str, str], List[int]] = { @@ -428,13 +431,14 @@ def test_remote_timeline_client_calls_started_metric( } def fetch_calls_started(): + assert timeline_id is not None for (file_kind, op_kind), observations in calls_started.items(): - val = client.get_remote_timeline_client_metric( - "pageserver_remote_timeline_client_calls_started_count", - tenant_id, - timeline_id, - file_kind, - op_kind, + val = client.get_metric_value( + name="pageserver_remote_timeline_client_calls_started_count", + filter={ + "file_kind": str(file_kind), + "op_kind": str(op_kind), + }, ) assert val is not None, f"expecting metric to be present: {file_kind} {op_kind}" val = int(val) @@ -518,12 +522,8 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( test_name="test_timeline_deletion_with_files_stuck_in_upload_queue", ) - env = neon_env_builder.init_start() - - # create tenant with config that will determinstically allow - # compaction and gc - tenant_id, timeline_id = env.neon_cli.create_tenant( - conf={ + env = neon_env_builder.init_start( + initial_tenant_conf={ # small checkpointing and compaction targets to ensure we generate many operations "checkpoint_distance": f"{64 * 1024}", "compaction_threshold": "1", @@ -535,6 +535,10 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( "pitr_interval": "0s", } ) + tenant_id = env.initial_tenant + assert env.initial_timeline is not None + timeline_id: TimelineId = env.initial_timeline + timeline_path = env.timeline_dir(tenant_id, timeline_id) client = env.pageserver.http_client() @@ -787,12 +791,8 @@ def test_compaction_delete_before_upload( test_name="test_compaction_delete_before_upload", ) - env = neon_env_builder.init_start() - - # create tenant with config that will determinstically allow - # compaction and disables gc - tenant_id, timeline_id = env.neon_cli.create_tenant( - conf={ + env = neon_env_builder.init_start( + initial_tenant_conf={ # Set a small compaction threshold "compaction_threshold": "3", # Disable GC @@ -802,6 +802,10 @@ def test_compaction_delete_before_upload( } ) + tenant_id = env.initial_tenant + assert 
env.initial_timeline is not None
+    timeline_id: TimelineId = env.initial_timeline
+
     client = env.pageserver.http_client()
 
     with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py
index 4dbfa8bc1f..35f7941bda 100644
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -213,6 +213,11 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
 
     # Test (a subset of) pageserver global metrics
     for metric in PAGESERVER_GLOBAL_METRICS:
+        # the pageserver_remote_* metrics only get samples once remote storage
+        # is used, and this test does not configure remote storage
+        if metric.startswith("pageserver_remote"):
+            continue
+
         ps_samples = ps_metrics.query_all(metric, {})
         assert len(ps_samples) > 0, f"expected at least one sample for {metric}"
         for sample in ps_samples:
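
To make the before/after pattern of this patch concrete: a per-timeline metric is a `HistogramVec` keyed by `tenant_id`/`timeline_id`, whose label pair must be created when a timeline loads and removed in `Drop`, while a global metric is a single `Histogram` with no such bookkeeping. Below is a minimal sketch, not part of the patch, using the `prometheus` and `once_cell` crates the pageserver already depends on; the `example_*` metric names and the `main` driver are illustrative only:

use once_cell::sync::Lazy;
use prometheus::{register_histogram, register_histogram_vec, Histogram, HistogramVec};

// Before: one set of bucket series per (tenant_id, timeline_id). Each timeline
// must create its label pair on load and remove it again on drop, or the
// series outlive the timeline.
static PER_TIMELINE_WAIT_LSN: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "example_wait_lsn_seconds",
        "Time spent waiting for WAL to arrive, per timeline",
        &["tenant_id", "timeline_id"]
    )
    .expect("failed to define a metric")
});

// After: a single global histogram. Nothing to clean up on timeline shutdown,
// and the scrape payload no longer grows with the number of timelines.
static GLOBAL_WAIT_LSN: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "example_wait_lsn_seconds_global",
        "Time spent waiting for WAL to arrive, all timelines"
    )
    .expect("failed to define a metric")
});

fn main() {
    // Per-timeline style: look up (or create) the child for this label pair.
    PER_TIMELINE_WAIT_LSN
        .with_label_values(&["tenant-a", "timeline-1"])
        .observe(0.015);
    // Later, when the timeline is dropped, the pair must be removed explicitly:
    let _ = PER_TIMELINE_WAIT_LSN.remove_label_values(&["tenant-a", "timeline-1"]);

    // Global style: just observe.
    GLOBAL_WAIT_LSN.observe(0.015);
}

The trade-off the patch makes is exactly what this sketch shows: per-timeline resolution for these histograms is given up in exchange for a number of time series, and a Drop-time cleanup burden, that no longer scale with the tenant and timeline count.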