From 294b8a8fde50418090c8fcb2b5c1c47e8cc11557 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 25 Jul 2023 00:43:27 +0300 Subject: [PATCH] Convert per timeline metrics to global (#4769) Cut down the per-(tenant, timeline) histograms by making them global: - `pageserver_getpage_get_reconstruct_data_seconds` - `pageserver_read_num_fs_layers` - `pageserver_remote_operation_seconds` - `pageserver_remote_timeline_client_calls_started` - `pageserver_wait_lsn_seconds` - `pageserver_io_operations_seconds` --------- Co-authored-by: Shany Pozin --- pageserver/src/metrics.rs | 118 +++++------------- pageserver/src/tenant/timeline.rs | 9 +- pageserver/src/virtual_file.rs | 26 ++-- test_runner/fixtures/metrics.py | 25 ++-- test_runner/regress/test_gc_aggressive.py | 2 - test_runner/regress/test_metric_collection.py | 2 - test_runner/regress/test_ondemand_download.py | 70 +++++------ test_runner/regress/test_remote_storage.py | 52 ++++---- test_runner/regress/test_tenants.py | 3 + 9 files changed, 126 insertions(+), 181 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index ee8dfba69a..bdd79711ec 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -84,11 +84,10 @@ pub static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -static READ_NUM_FS_LAYERS: Lazy = Lazy::new(|| { - register_histogram_vec!( +pub(crate) static READ_NUM_FS_LAYERS: Lazy = Lazy::new(|| { + register_histogram!( "pageserver_read_num_fs_layers", "Number of persistent layers accessed for processing a read request, including those in the cache", - &["tenant_id", "timeline_id"], vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0], ) .expect("failed to define a metric") @@ -112,11 +111,10 @@ pub static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -static GET_RECONSTRUCT_DATA_TIME: Lazy = Lazy::new(|| { - register_histogram_vec!( +pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy = Lazy::new(|| { + register_histogram!( "pageserver_getpage_get_reconstruct_data_seconds", "Time spent in get_reconstruct_value_data", - &["tenant_id", "timeline_id"], CRITICAL_OP_BUCKETS.into(), ) .expect("failed to define a metric") @@ -246,11 +244,10 @@ pub static PAGE_CACHE_SIZE: Lazy = Lazy::new(|| PageCacheS }, }); -static WAIT_LSN_TIME: Lazy = Lazy::new(|| { - register_histogram_vec!( +pub(crate) static WAIT_LSN_TIME: Lazy = Lazy::new(|| { + register_histogram!( "pageserver_wait_lsn_seconds", "Time spent waiting for WAL to arrive", - &["tenant_id", "timeline_id"], CRITICAL_OP_BUCKETS.into(), ) .expect("failed to define a metric") @@ -499,23 +496,31 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[ 30.000, // 30000 ms ]; -const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[ - "open", "close", "read", "write", "seek", "fsync", "gc", "metadata", -]; - -const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"]; - -pub static STORAGE_IO_TIME: Lazy = Lazy::new(|| { +/// Tracks time taken by fs operations near VirtualFile. 
+///
+/// Operations:
+/// - open ([`std::fs::OpenOptions::open`])
+/// - close (dropping [`std::fs::File`])
+/// - close-by-replace (close forced by the file descriptor cache's replacement algorithm)
+/// - read (`read_at`)
+/// - write (`write_at`)
+/// - seek (modify the internal position, or query the file length)
+/// - fsync ([`std::fs::File::sync_all`])
+/// - metadata ([`std::fs::File::metadata`])
+pub(crate) static STORAGE_IO_TIME: Lazy<HistogramVec> = Lazy::new(|| {
     register_histogram_vec!(
         "pageserver_io_operations_seconds",
         "Time spent in IO operations",
-        &["operation", "tenant_id", "timeline_id"],
+        &["operation"],
         STORAGE_IO_TIME_BUCKETS.into()
     )
     .expect("failed to define a metric")
 });
 
-pub static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
+const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
+
+// Needed for the https://neonprod.grafana.net/d/5uK9tHL4k/picking-tenant-for-relocation?orgId=1 dashboard.
+pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
     register_int_gauge_vec!(
         "pageserver_io_operations_bytes_total",
         "Total amount of bytes read/written in IO operations",
@@ -605,7 +610,7 @@ static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new
            at a given instant. It gives you a better idea of the queue depth \
            than plotting the gauge directly, since operations may complete faster \
            than the sampling interval.",
-        &["tenant_id", "timeline_id", "file_kind", "op_kind"],
+        &["file_kind", "op_kind"],
         // The calls_unfinished gauge is an integer gauge, hence we have integer buckets.
         vec![0.0, 1.0, 2.0, 4.0, 6.0, 8.0, 10.0, 15.0, 20.0, 40.0, 60.0, 80.0, 100.0, 500.0],
     )
@@ -662,13 +667,13 @@ impl RemoteOpFileKind {
     }
 }
 
-pub static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+pub(crate) static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
     register_histogram_vec!(
         "pageserver_remote_operation_seconds",
         "Time spent on remote storage operations. \
         Grouped by tenant, timeline, operation_kind and status. 
\ Does not account for time spent waiting in remote timeline client's queues.", - &["tenant_id", "timeline_id", "file_kind", "op_kind", "status"] + &["file_kind", "op_kind", "status"] ) .expect("failed to define a metric") }); @@ -897,7 +902,6 @@ impl StorageTimeMetrics { pub struct TimelineMetrics { tenant_id: String, timeline_id: String, - pub get_reconstruct_data_time_histo: Histogram, pub flush_time_histo: StorageTimeMetrics, pub compact_time_histo: StorageTimeMetrics, pub create_images_time_histo: StorageTimeMetrics, @@ -906,9 +910,7 @@ pub struct TimelineMetrics { pub load_layer_map_histo: StorageTimeMetrics, pub garbage_collect_histo: StorageTimeMetrics, pub last_record_gauge: IntGauge, - pub wait_lsn_time_histo: Histogram, pub resident_physical_size_gauge: UIntGauge, - pub read_num_fs_layers: Histogram, /// copy of LayeredTimeline.current_logical_size pub current_logical_size_gauge: UIntGauge, pub num_persistent_files_created: IntCounter, @@ -925,9 +927,6 @@ impl TimelineMetrics { ) -> Self { let tenant_id = tenant_id.to_string(); let timeline_id = timeline_id.to_string(); - let get_reconstruct_data_time_histo = GET_RECONSTRUCT_DATA_TIME - .get_metric_with_label_values(&[&tenant_id, &timeline_id]) - .unwrap(); let flush_time_histo = StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id); let compact_time_histo = @@ -948,9 +947,6 @@ impl TimelineMetrics { let last_record_gauge = LAST_RECORD_LSN .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); - let wait_lsn_time_histo = WAIT_LSN_TIME - .get_metric_with_label_values(&[&tenant_id, &timeline_id]) - .unwrap(); let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); @@ -966,16 +962,12 @@ impl TimelineMetrics { let evictions = EVICTIONS .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); - let read_num_fs_layers = READ_NUM_FS_LAYERS - .get_metric_with_label_values(&[&tenant_id, &timeline_id]) - .unwrap(); let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id); TimelineMetrics { tenant_id, timeline_id, - get_reconstruct_data_time_histo, flush_time_histo, compact_time_histo, create_images_time_histo, @@ -984,7 +976,6 @@ impl TimelineMetrics { garbage_collect_histo, load_layer_map_histo, last_record_gauge, - wait_lsn_time_histo, resident_physical_size_gauge, current_logical_size_gauge, num_persistent_files_created, @@ -993,7 +984,6 @@ impl TimelineMetrics { evictions_with_low_residence_duration: std::sync::RwLock::new( evictions_with_low_residence_duration, ), - read_num_fs_layers, } } } @@ -1002,15 +992,12 @@ impl Drop for TimelineMetrics { fn drop(&mut self) { let tenant_id = &self.tenant_id; let timeline_id = &self.timeline_id; - let _ = GET_RECONSTRUCT_DATA_TIME.remove_label_values(&[tenant_id, timeline_id]); let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]); - let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]); let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]); let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]); let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]); - let _ = READ_NUM_FS_LAYERS.remove_label_values(&[tenant_id, timeline_id]); 
self.evictions_with_low_residence_duration .write() @@ -1022,9 +1009,6 @@ impl Drop for TimelineMetrics { let _ = STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]); } - for op in STORAGE_IO_TIME_OPERATIONS { - let _ = STORAGE_IO_TIME.remove_label_values(&[op, tenant_id, timeline_id]); - } for op in STORAGE_IO_SIZE_OPERATIONS { let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, timeline_id]); @@ -1056,9 +1040,7 @@ pub struct RemoteTimelineClientMetrics { tenant_id: String, timeline_id: String, remote_physical_size_gauge: Mutex>, - remote_operation_time: Mutex>, calls_unfinished_gauge: Mutex>, - calls_started_hist: Mutex>, bytes_started_counter: Mutex>, bytes_finished_counter: Mutex>, } @@ -1068,14 +1050,13 @@ impl RemoteTimelineClientMetrics { RemoteTimelineClientMetrics { tenant_id: tenant_id.to_string(), timeline_id: timeline_id.to_string(), - remote_operation_time: Mutex::new(HashMap::default()), calls_unfinished_gauge: Mutex::new(HashMap::default()), - calls_started_hist: Mutex::new(HashMap::default()), bytes_started_counter: Mutex::new(HashMap::default()), bytes_finished_counter: Mutex::new(HashMap::default()), remote_physical_size_gauge: Mutex::new(None), } } + pub fn remote_physical_size_gauge(&self) -> UIntGauge { let mut guard = self.remote_physical_size_gauge.lock().unwrap(); guard @@ -1089,26 +1070,17 @@ impl RemoteTimelineClientMetrics { }) .clone() } + pub fn remote_operation_time( &self, file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind, status: &'static str, ) -> Histogram { - let mut guard = self.remote_operation_time.lock().unwrap(); let key = (file_kind.as_str(), op_kind.as_str(), status); - let metric = guard.entry(key).or_insert_with(move || { - REMOTE_OPERATION_TIME - .get_metric_with_label_values(&[ - &self.tenant_id.to_string(), - &self.timeline_id.to_string(), - key.0, - key.1, - key.2, - ]) - .unwrap() - }); - metric.clone() + REMOTE_OPERATION_TIME + .get_metric_with_label_values(&[key.0, key.1, key.2]) + .unwrap() } fn calls_unfinished_gauge( @@ -1136,19 +1108,10 @@ impl RemoteTimelineClientMetrics { file_kind: &RemoteOpFileKind, op_kind: &RemoteOpKind, ) -> Histogram { - let mut guard = self.calls_started_hist.lock().unwrap(); let key = (file_kind.as_str(), op_kind.as_str()); - let metric = guard.entry(key).or_insert_with(move || { - REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST - .get_metric_with_label_values(&[ - &self.tenant_id.to_string(), - &self.timeline_id.to_string(), - key.0, - key.1, - ]) - .unwrap() - }); - metric.clone() + REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST + .get_metric_with_label_values(&[key.0, key.1]) + .unwrap() } fn bytes_started_counter( @@ -1328,15 +1291,10 @@ impl Drop for RemoteTimelineClientMetrics { tenant_id, timeline_id, remote_physical_size_gauge, - remote_operation_time, calls_unfinished_gauge, - calls_started_hist, bytes_started_counter, bytes_finished_counter, } = self; - for ((a, b, c), _) in remote_operation_time.get_mut().unwrap().drain() { - let _ = REMOTE_OPERATION_TIME.remove_label_values(&[tenant_id, timeline_id, a, b, c]); - } for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() { let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[ tenant_id, @@ -1345,14 +1303,6 @@ impl Drop for RemoteTimelineClientMetrics { b, ]); } - for ((a, b), _) in calls_started_hist.get_mut().unwrap().drain() { - let _ = REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST.remove_label_values(&[ - tenant_id, - timeline_id, - a, - b, - ]); - } for ((a, b), _) in 
bytes_started_counter.get_mut().unwrap().drain() { let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[ tenant_id, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index eceb54efc5..3413282c5d 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -475,7 +475,7 @@ impl Timeline { img: cached_page_img, }; - let timer = self.metrics.get_reconstruct_data_time_histo.start_timer(); + let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME.start_timer(); self.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx) .await?; timer.stop_and_record(); @@ -555,7 +555,7 @@ impl Timeline { "wait_lsn cannot be called in WAL receiver" ); - let _timer = self.metrics.wait_lsn_time_histo.start_timer(); + let _timer = crate::metrics::WAIT_LSN_TIME.start_timer(); match self .last_record_lsn @@ -2252,8 +2252,9 @@ impl Timeline { let mut timeline_owned; let mut timeline = self; - let mut read_count = - scopeguard::guard(0, |cnt| self.metrics.read_num_fs_layers.observe(cnt as f64)); + let mut read_count = scopeguard::guard(0, |cnt| { + crate::metrics::READ_NUM_FS_LAYERS.observe(cnt as f64) + }); // For debugging purposes, collect the path of layers that we traversed // through. It's included in the error message if we fail to find the key. diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index b86d14f158..65b4363f60 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -149,12 +149,10 @@ impl OpenFiles { // old file. // if let Some(old_file) = slot_guard.file.take() { - // We do not have information about tenant_id/timeline_id of evicted file. - // It is possible to store path together with file or use filepath crate, - // but as far as close() is not expected to be fast, it is not so critical to gather - // precise per-tenant statistic here. + // the normal path of dropping VirtualFile uses "close", use "close-by-replace" here to + // distinguish the two. STORAGE_IO_TIME - .with_label_values(&["close", "-", "-"]) + .with_label_values(&["close-by-replace"]) .observe_closure_duration(|| drop(old_file)); } @@ -208,7 +206,7 @@ impl VirtualFile { } let (handle, mut slot_guard) = get_open_files().find_victim_slot(); let file = STORAGE_IO_TIME - .with_label_values(&["open", &tenant_id, &timeline_id]) + .with_label_values(&["open"]) .observe_closure_duration(|| open_options.open(path))?; // Strip all options other than read and write. @@ -271,7 +269,7 @@ impl VirtualFile { // Found a cached file descriptor. 
slot.recently_used.store(true, Ordering::Relaxed);
                 return Ok(STORAGE_IO_TIME
-                    .with_label_values(&[op, &self.tenant_id, &self.timeline_id])
+                    .with_label_values(&[op])
                     .observe_closure_duration(|| func(file)));
             }
         }
@@ -298,12 +296,12 @@ impl VirtualFile {
 
         // Open the physical file
         let file = STORAGE_IO_TIME
-            .with_label_values(&["open", &self.tenant_id, &self.timeline_id])
+            .with_label_values(&["open"])
             .observe_closure_duration(|| self.open_options.open(&self.path))?;
 
         // Perform the requested operation on it
         let result = STORAGE_IO_TIME
-            .with_label_values(&[op, &self.tenant_id, &self.timeline_id])
+            .with_label_values(&[op])
             .observe_closure_duration(|| func(&file));
 
         // Store the File in the slot and update the handle in the VirtualFile
@@ -333,13 +331,11 @@ impl Drop for VirtualFile {
         let mut slot_guard = slot.inner.write().unwrap();
         if slot_guard.tag == handle.tag {
             slot.recently_used.store(false, Ordering::Relaxed);
-            // Unlike files evicted by replacement algorithm, here
-            // we group close time by tenant_id/timeline_id.
-            // At allows to compare number/time of "normal" file closes
-            // with file eviction.
+            // There is also a "close-by-replace" operation, recorded when a file
+            // is closed on eviction, so the two close paths can be compared.
             STORAGE_IO_TIME
-                .with_label_values(&["close", &self.tenant_id, &self.timeline_id])
-                .observe_closure_duration(|| slot_guard.file.take());
+                .with_label_values(&["close"])
+                .observe_closure_duration(|| drop(slot_guard.file.take()));
         }
     }
 }
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index 3f87aa10a3..ca468cc02c 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -40,10 +40,13 @@ def parse_metrics(text: str, name: str = "") -> Metrics:
     return metrics
 
 
+def histogram(prefix_without_trailing_underscore: str) -> List[str]:
+    assert not prefix_without_trailing_underscore.endswith("_")
+    return [f"{prefix_without_trailing_underscore}_{x}" for x in ["bucket", "count", "sum"]]
+
+
 PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = (
     "pageserver_remote_timeline_client_calls_unfinished",
-    *[f"pageserver_remote_timeline_client_calls_started_{x}" for x in ["bucket", "count", "sum"]],
-    *[f"pageserver_remote_operation_seconds_{x}" for x in ["bucket", "count", "sum"]],
     "pageserver_remote_physical_size",
     "pageserver_remote_timeline_client_bytes_started_total",
     "pageserver_remote_timeline_client_bytes_finished_total",
@@ -67,30 +70,24 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
     "pageserver_getpage_reconstruct_seconds_count",
     "pageserver_getpage_reconstruct_seconds_sum",
     *[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]],
+    *histogram("pageserver_read_num_fs_layers"),
+    *histogram("pageserver_getpage_get_reconstruct_data_seconds"),
+    *histogram("pageserver_wait_lsn_seconds"),
+    *histogram("pageserver_remote_operation_seconds"),
+    *histogram("pageserver_remote_timeline_client_calls_started"),
+    *histogram("pageserver_io_operations_seconds"),
 )
 
 PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] 
= ( "pageserver_current_logical_size", "pageserver_resident_physical_size", - "pageserver_getpage_get_reconstruct_data_seconds_bucket", - "pageserver_getpage_get_reconstruct_data_seconds_count", - "pageserver_getpage_get_reconstruct_data_seconds_sum", "pageserver_io_operations_bytes_total", - "pageserver_io_operations_seconds_bucket", - "pageserver_io_operations_seconds_count", - "pageserver_io_operations_seconds_sum", "pageserver_last_record_lsn", - "pageserver_read_num_fs_layers_bucket", - "pageserver_read_num_fs_layers_count", - "pageserver_read_num_fs_layers_sum", "pageserver_smgr_query_seconds_bucket", "pageserver_smgr_query_seconds_count", "pageserver_smgr_query_seconds_sum", "pageserver_storage_operations_seconds_count_total", "pageserver_storage_operations_seconds_sum_total", - "pageserver_wait_lsn_seconds_bucket", - "pageserver_wait_lsn_seconds_count", - "pageserver_wait_lsn_seconds_sum", "pageserver_created_persistent_files_total", "pageserver_written_persistent_bytes_total", "pageserver_tenant_states_count", diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py index 18f506cfce..e26b1a980e 100644 --- a/test_runner/regress/test_gc_aggressive.py +++ b/test_runner/regress/test_gc_aggressive.py @@ -136,8 +136,6 @@ def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind: for sample in ps_metrics.query_all( name="pageserver_remote_operation_seconds_count", filter={ - "tenant_id": str(tenant_id), - "timeline_id": str(timeline_id), "file_kind": str(file_kind), "op_kind": str(op_kind), }, diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py index 12e695bcbd..61c8f800f8 100644 --- a/test_runner/regress/test_metric_collection.py +++ b/test_runner/regress/test_metric_collection.py @@ -140,8 +140,6 @@ def test_metric_collection( for sample in ps_metrics.query_all( name="pageserver_remote_operation_seconds_count", filter={ - "tenant_id": str(tenant_id), - "timeline_id": str(timeline_id), "file_kind": str(file_kind), "op_kind": str(op_kind), }, diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index a30f0f02e1..b92981a371 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -27,15 +27,16 @@ from fixtures.types import Lsn from fixtures.utils import query_scalar, wait_until -def get_num_downloaded_layers(client: PageserverHttpClient, tenant_id, timeline_id): +def get_num_downloaded_layers(client: PageserverHttpClient): + """ + This assumes that the pageserver only has a single tenant. + """ value = client.get_metric_value( "pageserver_remote_operation_seconds_count", { "file_kind": "layer", "op_kind": "download", "status": "success", - "tenant_id": tenant_id, - "timeline_id": timeline_id, }, ) if value is None: @@ -57,7 +58,8 @@ def test_ondemand_download_large_rel( test_name="test_ondemand_download_large_rel", ) - ##### First start, insert secret data and upload it to the remote storage + # thinking about using a shared environment? the test assumes that global + # metrics are for single tenant. env = neon_env_builder.init_start( initial_tenant_conf={ # disable background GC @@ -129,7 +131,7 @@ def test_ondemand_download_large_rel( # safekeepers, that have now been shut down. 
endpoint = env.endpoints.create_start("main", lsn=current_lsn) - before_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id) + before_downloads = get_num_downloaded_layers(client) assert before_downloads != 0, "basebackup should on-demand non-zero layers" # Probe in the middle of the table. There's a high chance that the beginning @@ -140,7 +142,7 @@ def test_ondemand_download_large_rel( with endpoint.cursor() as cur: assert query_scalar(cur, "select count(*) from tbl where id = 500000") == 1 - after_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id) + after_downloads = get_num_downloaded_layers(client) log.info(f"layers downloaded before {before_downloads} and after {after_downloads}") assert after_downloads > before_downloads @@ -159,13 +161,11 @@ def test_ondemand_download_timetravel( test_name="test_ondemand_download_timetravel", ) - ##### First start, insert data and upload it to the remote storage - env = neon_env_builder.init_start() - pageserver_http = env.pageserver.http_client() + # thinking about using a shared environment? the test assumes that global + # metrics are for single tenant. - # Override defaults, to create more layers - tenant, _ = env.neon_cli.create_tenant( - conf={ + env = neon_env_builder.init_start( + initial_tenant_conf={ # Disable background GC & compaction # We don't want GC, that would break the assertion about num downloads. # We don't want background compaction, we force a compaction every time we do explicit checkpoint. @@ -178,7 +178,7 @@ def test_ondemand_download_timetravel( "compaction_target_size": f"{1 * 1024 ** 2}", # 1 MB } ) - env.initial_tenant = tenant + pageserver_http = env.pageserver.http_client() endpoint = env.endpoints.create_start("main") @@ -283,7 +283,7 @@ def test_ondemand_download_timetravel( == table_len ) - after_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id) + after_downloads = get_num_downloaded_layers(client) num_layers_downloaded.append(after_downloads) log.info(f"num_layers_downloaded[-1]={num_layers_downloaded[-1]}") @@ -324,11 +324,8 @@ def test_download_remote_layers_api( ) ##### First start, insert data and upload it to the remote storage - env = neon_env_builder.init_start() - - # Override defaults, to create more layers - tenant, _ = env.neon_cli.create_tenant( - conf={ + env = neon_env_builder.init_start( + initial_tenant_conf={ # Disable background GC & compaction # We don't want GC, that would break the assertion about num downloads. # We don't want background compaction, we force a compaction every time we do explicit checkpoint. 
@@ -341,7 +338,6 @@ def test_download_remote_layers_api( "compaction_target_size": f"{1 * 1024 ** 2}", # 1 MB } ) - env.initial_tenant = tenant endpoint = env.endpoints.create_start("main") @@ -489,8 +485,6 @@ def test_compaction_downloads_on_demand_without_image_creation( test_name="test_compaction_downloads_on_demand_without_image_creation", ) - env = neon_env_builder.init_start() - conf = { # Disable background GC & compaction "gc_period": "0s", @@ -506,6 +500,8 @@ def test_compaction_downloads_on_demand_without_image_creation( # pitr_interval and gc_horizon are not interesting because we dont run gc } + env = neon_env_builder.init_start(initial_tenant_conf=stringify(conf)) + def downloaded_bytes_and_count(pageserver_http: PageserverHttpClient) -> Tuple[int, int]: m = pageserver_http.get_metrics() # these are global counters @@ -517,11 +513,12 @@ def test_compaction_downloads_on_demand_without_image_creation( assert count < 2**53 and count.is_integer(), "count should still be safe integer-in-f64" return (int(total_bytes), int(count)) - # Override defaults, to create more layers - tenant_id, timeline_id = env.neon_cli.create_tenant(conf=stringify(conf)) - env.initial_tenant = tenant_id pageserver_http = env.pageserver.http_client() + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + assert timeline_id is not None + with env.endpoints.create_start("main") as endpoint: # no particular reason to create the layers like this, but we are sure # not to hit the image_creation_threshold here. @@ -577,8 +574,6 @@ def test_compaction_downloads_on_demand_with_image_creation( test_name="test_compaction_downloads_on_demand", ) - env = neon_env_builder.init_start() - conf = { # Disable background GC & compaction "gc_period": "0s", @@ -593,9 +588,11 @@ def test_compaction_downloads_on_demand_with_image_creation( # pitr_interval and gc_horizon are not interesting because we dont run gc } - # Override defaults, to create more layers - tenant_id, timeline_id = env.neon_cli.create_tenant(conf=stringify(conf)) - env.initial_tenant = tenant_id + env = neon_env_builder.init_start(initial_tenant_conf=stringify(conf)) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + assert timeline_id is not None + pageserver_http = env.pageserver.http_client() endpoint = env.endpoints.create_start("main") @@ -664,10 +661,6 @@ def test_compaction_downloads_on_demand_with_image_creation( assert dict(kinds_after) == {"Delta": 4, "Image": 1} -def stringify(conf: Dict[str, Any]) -> Dict[str, str]: - return dict(map(lambda x: (x[0], str(x[1])), conf.items())) - - @pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) def test_ondemand_download_failure_to_replace( neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind @@ -691,9 +684,10 @@ def test_ondemand_download_failure_to_replace( env = neon_env_builder.init_start() - tenant_id, timeline_id = env.neon_cli.create_tenant() + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + assert timeline_id is not None - env.initial_tenant = tenant_id pageserver_http = env.pageserver.http_client() lsn = Lsn(pageserver_http.timeline_detail(tenant_id, timeline_id)["last_record_lsn"]) @@ -724,3 +718,7 @@ def test_ondemand_download_failure_to_replace( env.pageserver.allowed_errors.append(".* ERROR .*Task 'initial size calculation'") # if the above returned, then we didn't have a livelock, and all is well + + +def stringify(conf: Dict[str, Any]) -> Dict[str, str]: + return dict(map(lambda x: (x[0], 
str(x[1])), conf.items())) diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index f1575ae4d3..7c04ed9017 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -378,12 +378,10 @@ def test_remote_timeline_client_calls_started_metric( test_name="test_remote_timeline_client_metrics", ) - env = neon_env_builder.init_start() - - # create tenant with config that will determinstically allow - # compaction and gc - tenant_id, timeline_id = env.neon_cli.create_tenant( - conf={ + # thinking about using a shared environment? the test assumes that global + # metrics are for single tenant. + env = neon_env_builder.init_start( + initial_tenant_conf={ # small checkpointing and compaction targets to ensure we generate many upload operations "checkpoint_distance": f"{128 * 1024}", "compaction_threshold": "1", @@ -398,6 +396,10 @@ def test_remote_timeline_client_calls_started_metric( } ) + tenant_id = env.initial_tenant + assert env.initial_timeline is not None + timeline_id: TimelineId = env.initial_timeline + client = env.pageserver.http_client() endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) @@ -419,6 +421,7 @@ def test_remote_timeline_client_calls_started_metric( "VACUUM foo", ] ) + assert timeline_id is not None wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) calls_started: Dict[Tuple[str, str], List[int]] = { @@ -428,13 +431,14 @@ def test_remote_timeline_client_calls_started_metric( } def fetch_calls_started(): + assert timeline_id is not None for (file_kind, op_kind), observations in calls_started.items(): - val = client.get_remote_timeline_client_metric( - "pageserver_remote_timeline_client_calls_started_count", - tenant_id, - timeline_id, - file_kind, - op_kind, + val = client.get_metric_value( + name="pageserver_remote_timeline_client_calls_started_count", + filter={ + "file_kind": str(file_kind), + "op_kind": str(op_kind), + }, ) assert val is not None, f"expecting metric to be present: {file_kind} {op_kind}" val = int(val) @@ -518,12 +522,8 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( test_name="test_timeline_deletion_with_files_stuck_in_upload_queue", ) - env = neon_env_builder.init_start() - - # create tenant with config that will determinstically allow - # compaction and gc - tenant_id, timeline_id = env.neon_cli.create_tenant( - conf={ + env = neon_env_builder.init_start( + initial_tenant_conf={ # small checkpointing and compaction targets to ensure we generate many operations "checkpoint_distance": f"{64 * 1024}", "compaction_threshold": "1", @@ -535,6 +535,10 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( "pitr_interval": "0s", } ) + tenant_id = env.initial_tenant + assert env.initial_timeline is not None + timeline_id: TimelineId = env.initial_timeline + timeline_path = env.timeline_dir(tenant_id, timeline_id) client = env.pageserver.http_client() @@ -787,12 +791,8 @@ def test_compaction_delete_before_upload( test_name="test_compaction_delete_before_upload", ) - env = neon_env_builder.init_start() - - # create tenant with config that will determinstically allow - # compaction and disables gc - tenant_id, timeline_id = env.neon_cli.create_tenant( - conf={ + env = neon_env_builder.init_start( + initial_tenant_conf={ # Set a small compaction threshold "compaction_threshold": "3", # Disable GC @@ -802,6 +802,10 @@ def test_compaction_delete_before_upload( } ) + tenant_id = env.initial_tenant + assert 
env.initial_timeline is not None
+    timeline_id: TimelineId = env.initial_timeline
+
     client = env.pageserver.http_client()
 
     with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py
index 4dbfa8bc1f..35f7941bda 100644
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -213,6 +213,11 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
 
     # Test (a subset of) pageserver global metrics
     for metric in PAGESERVER_GLOBAL_METRICS:
+        # the pageserver_remote_* metrics only get samples once remote storage
+        # is used, and this test does not configure remote storage
+        if metric.startswith("pageserver_remote"):
+            continue
+
         ps_samples = ps_metrics.query_all(metric, {})
         assert len(ps_samples) > 0, f"expected at least one sample for {metric}"
         for sample in ps_samples:
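
To make the before/after pattern of this patch concrete: a per-timeline metric is a `HistogramVec` keyed by `tenant_id`/`timeline_id`, whose label pair must be created when a timeline loads and removed in `Drop`, while a global metric is a single `Histogram` with no such bookkeeping. Below is a minimal sketch, not part of the patch, using the `prometheus` and `once_cell` crates the pageserver already depends on; the `example_*` metric names and the `main` driver are illustrative only:

use once_cell::sync::Lazy;
use prometheus::{register_histogram, register_histogram_vec, Histogram, HistogramVec};

// Before: one set of bucket series per (tenant_id, timeline_id). Each timeline
// must create its label pair on load and remove it again on drop, or the
// series outlive the timeline.
static PER_TIMELINE_WAIT_LSN: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "example_wait_lsn_seconds",
        "Time spent waiting for WAL to arrive, per timeline",
        &["tenant_id", "timeline_id"]
    )
    .expect("failed to define a metric")
});

// After: a single global histogram. Nothing to clean up on timeline shutdown,
// and the scrape payload no longer grows with the number of timelines.
static GLOBAL_WAIT_LSN: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "example_wait_lsn_seconds_global",
        "Time spent waiting for WAL to arrive, all timelines"
    )
    .expect("failed to define a metric")
});

fn main() {
    // Per-timeline style: look up (or create) the child for this label pair.
    PER_TIMELINE_WAIT_LSN
        .with_label_values(&["tenant-a", "timeline-1"])
        .observe(0.015);
    // Later, when the timeline is dropped, the pair must be removed explicitly:
    let _ = PER_TIMELINE_WAIT_LSN.remove_label_values(&["tenant-a", "timeline-1"]);

    // Global style: just observe.
    GLOBAL_WAIT_LSN.observe(0.015);
}

The trade-off the patch makes is exactly what this sketch shows: per-timeline resolution for these histograms is given up in exchange for a number of time series, and a Drop-time cleanup burden, that no longer scale with the tenant and timeline count.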