diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 75bea9dbab..cc444c479a 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -84,6 +84,16 @@ pub static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static READ_NUM_FS_LAYERS: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_read_num_fs_layers", + "Number of persistent layers accessed for processing a read request, including those in the cache", + &["tenant_id", "timeline_id"], + vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0], + ) + .expect("failed to define a metric") +}); + // Metrics collected on operations on the storage repository. static RECONSTRUCT_TIME: Lazy = Lazy::new(|| { register_histogram_vec!( @@ -95,6 +105,25 @@ static RECONSTRUCT_TIME: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_materialized_cache_hits_direct_total", + "Number of cache hits from materialized page cache without redo", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +static GET_RECONSTRUCT_DATA_TIME: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_getpage_get_reconstruct_data_seconds", + "Time spent in get_reconstruct_value_data", + &["tenant_id", "timeline_id"], + CRITICAL_OP_BUCKETS.into(), + ) + .expect("failed to define a metric") +}); + static MATERIALIZED_PAGE_CACHE_HIT: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_materialized_cache_hits_total", @@ -354,6 +383,7 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[ 0.001000, // 1000 usec 0.030, // 30 ms 1.000, // 1000 ms + 30.000, // 30000 ms ]; const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[ @@ -622,7 +652,7 @@ pub static WAL_REDO_TIME: Lazy = Lazy::new(|| { pub static WAL_REDO_WAIT_TIME: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_wait_seconds", - "Time spent waiting for access to the WAL redo process", + "Time spent waiting for access to the Postgres WAL redo process", redo_histogram_time_buckets!(), ) .expect("failed to define a metric") @@ -631,7 +661,7 @@ pub static WAL_REDO_WAIT_TIME: Lazy = Lazy::new(|| { pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_records_histogram", - "Histogram of number of records replayed per redo", + "Histogram of number of records replayed per redo in the Postgres WAL redo process", redo_histogram_count_buckets!(), ) .expect("failed to define a metric") @@ -640,7 +670,7 @@ pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy = Lazy::new(|| { pub static WAL_REDO_BYTES_HISTOGRAM: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_bytes_histogram", - "Histogram of number of records replayed per redo", + "Histogram of number of records replayed per redo sent to Postgres", redo_bytes_histogram_count_buckets!(), ) .expect("failed to define a metric") @@ -723,7 +753,9 @@ pub struct TimelineMetrics { tenant_id: String, timeline_id: String, pub reconstruct_time_histo: Histogram, + pub get_reconstruct_data_time_histo: Histogram, pub materialized_page_cache_hit_counter: GenericCounter, + pub materialized_page_cache_hit_upon_request_counter: GenericCounter, pub flush_time_histo: StorageTimeMetrics, pub compact_time_histo: StorageTimeMetrics, pub create_images_time_histo: StorageTimeMetrics, @@ -734,6 +766,7 @@ pub struct TimelineMetrics { pub last_record_gauge: IntGauge, pub wait_lsn_time_histo: Histogram, pub resident_physical_size_gauge: UIntGauge, + pub read_num_fs_layers: Histogram, /// copy of LayeredTimeline.current_logical_size pub current_logical_size_gauge: UIntGauge, pub num_persistent_files_created: IntCounter, @@ -753,6 +786,9 @@ impl TimelineMetrics { let reconstruct_time_histo = RECONSTRUCT_TIME .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); + let get_reconstruct_data_time_histo = GET_RECONSTRUCT_DATA_TIME + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); @@ -794,6 +830,12 @@ impl TimelineMetrics { let evictions = EVICTIONS .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); + let read_num_fs_layers = READ_NUM_FS_LAYERS + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let materialized_page_cache_hit_upon_request_counter = MATERIALIZED_PAGE_CACHE_HIT_DIRECT + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id); @@ -801,7 +843,9 @@ impl TimelineMetrics { tenant_id, timeline_id, reconstruct_time_histo, + get_reconstruct_data_time_histo, materialized_page_cache_hit_counter, + materialized_page_cache_hit_upon_request_counter, flush_time_histo, compact_time_histo, create_images_time_histo, @@ -819,6 +863,7 @@ impl TimelineMetrics { evictions_with_low_residence_duration: std::sync::RwLock::new( evictions_with_low_residence_duration, ), + read_num_fs_layers, } } } @@ -828,7 +873,9 @@ impl Drop for TimelineMetrics { let tenant_id = &self.tenant_id; let timeline_id = &self.timeline_id; let _ = RECONSTRUCT_TIME.remove_label_values(&[tenant_id, timeline_id]); + let _ = GET_RECONSTRUCT_DATA_TIME.remove_label_values(&[tenant_id, timeline_id]); let _ = MATERIALIZED_PAGE_CACHE_HIT.remove_label_values(&[tenant_id, timeline_id]); + let _ = MATERIALIZED_PAGE_CACHE_HIT_DIRECT.remove_label_values(&[tenant_id, timeline_id]); let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]); let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]); let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); @@ -836,6 +883,8 @@ impl Drop for TimelineMetrics { let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]); let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]); let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]); + let _ = READ_NUM_FS_LAYERS.remove_label_values(&[tenant_id, timeline_id]); + self.evictions_with_low_residence_duration .write() .unwrap() diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 36c4b0bcd4..8885e761a2 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -525,7 +525,12 @@ impl Timeline { Some((cached_lsn, cached_img)) => { match cached_lsn.cmp(&lsn) { Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check - Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image + Ordering::Equal => { + self.metrics + .materialized_page_cache_hit_upon_request_counter + .inc(); + return Ok(cached_img); // exact LSN match, return the image + } Ordering::Greater => { unreachable!("the returned lsn should never be after the requested lsn") } @@ -540,8 +545,10 @@ impl Timeline { img: cached_page_img, }; + let timer = self.metrics.get_reconstruct_data_time_histo.start_timer(); self.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx) .await?; + timer.stop_and_record(); self.metrics .reconstruct_time_histo @@ -2261,6 +2268,9 @@ impl Timeline { let mut timeline_owned; let mut timeline = self; + let mut read_count = + scopeguard::guard(0, |cnt| self.metrics.read_num_fs_layers.observe(cnt as f64)); + // For debugging purposes, collect the path of layers that we traversed // through. It's included in the error message if we fail to find the key. let mut traversal_path = Vec::::new(); @@ -2395,6 +2405,7 @@ impl Timeline { Err(e) => return Err(PageReconstructError::from(e)), }; cont_lsn = lsn_floor; + // metrics: open_layer does not count as fs access, so we are not updating `read_count` traversal_path.push(( result, cont_lsn, @@ -2421,6 +2432,7 @@ impl Timeline { Err(e) => return Err(PageReconstructError::from(e)), }; cont_lsn = lsn_floor; + // metrics: open_layer does not count as fs access, so we are not updating `read_count` traversal_path.push(( result, cont_lsn, @@ -2455,6 +2467,7 @@ impl Timeline { Err(e) => return Err(PageReconstructError::from(e)), }; cont_lsn = lsn_floor; + *read_count += 1; traversal_path.push(( result, cont_lsn, diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 0e958ddd06..b4c237cfa6 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -65,12 +65,19 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_getpage_reconstruct_seconds_bucket", "pageserver_getpage_reconstruct_seconds_count", "pageserver_getpage_reconstruct_seconds_sum", + "pageserver_getpage_get_reconstruct_data_seconds_bucket", + "pageserver_getpage_get_reconstruct_data_seconds_count", + "pageserver_getpage_get_reconstruct_data_seconds_sum", "pageserver_io_operations_bytes_total", "pageserver_io_operations_seconds_bucket", "pageserver_io_operations_seconds_count", "pageserver_io_operations_seconds_sum", "pageserver_last_record_lsn", "pageserver_materialized_cache_hits_total", + "pageserver_materialized_cache_hits_direct_total", + "pageserver_read_num_fs_layers_bucket", + "pageserver_read_num_fs_layers_count", + "pageserver_read_num_fs_layers_sum", "pageserver_smgr_query_seconds_bucket", "pageserver_smgr_query_seconds_count", "pageserver_smgr_query_seconds_sum",