From db5384e1b0dd7a2b6d73f169a76317976a8a8959 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Sun, 30 Mar 2025 15:14:04 +0200 Subject: [PATCH] pageserver: remove L0 flush upload wait (#11196) ## Problem Previously, L0 flushes would wait for uploads, as a simple form of backpressure. However, this prevented flush pipelining and upload parallelism. It has since been disabled by default and replaced by L0 compaction backpressure. Touches https://github.com/neondatabase/cloud/issues/24664. ## Summary of changes This patch removes L0 flush upload waits, along with the `l0_flush_wait_upload`. This can't be merged until the setting has been removed across the fleet. --- control_plane/src/pageserver.rs | 5 --- libs/pageserver_api/src/config.rs | 9 ------ libs/pageserver_api/src/models.rs | 11 ------- pageserver/src/metrics.rs | 25 +-------------- pageserver/src/tenant/timeline.rs | 31 +------------------ test_runner/fixtures/metrics.py | 1 - .../regress/test_attach_tenant_config.py | 1 - 7 files changed, 2 insertions(+), 81 deletions(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 172e00e8bd..b39acbca4d 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -428,11 +428,6 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'l0_flush_delay_threshold' as an integer")?, - l0_flush_wait_upload: settings - .remove("l0_flush_wait_upload") - .map(|x| x.parse::()) - .transpose() - .context("Failed to parse 'l0_flush_wait_upload' as a boolean")?, l0_flush_stall_threshold: settings .remove("l0_flush_stall_threshold") .map(|x| x.parse::()) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 0d39a287c9..47c3136113 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -285,12 +285,6 @@ pub struct TenantConfigToml { /// Level0 delta layer threshold at which to stall layer flushes. Must be >compaction_threshold /// to avoid deadlock. 0 to disable. Disabled by default. pub l0_flush_stall_threshold: Option, - /// If true, Level0 delta layer flushes will wait for S3 upload before flushing the next - /// layer. This is a temporary backpressure mechanism which should be removed once - /// l0_flush_{delay,stall}_threshold is fully enabled. - /// - /// TODO: this is no longer enabled, remove it when the config option is no longer set. - pub l0_flush_wait_upload: bool, // Determines how much history is retained, to allow // branching and read replicas at an older point in time. // The unit is #of bytes of WAL. @@ -579,8 +573,6 @@ pub mod tenant_conf_defaults { pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm = crate::models::CompactionAlgorithm::Legacy; - pub const DEFAULT_L0_FLUSH_WAIT_UPLOAD: bool = false; - pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; // Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger. @@ -627,7 +619,6 @@ impl Default for TenantConfigToml { compaction_l0_semaphore: DEFAULT_COMPACTION_L0_SEMAPHORE, l0_flush_delay_threshold: None, l0_flush_stall_threshold: None, - l0_flush_wait_upload: DEFAULT_L0_FLUSH_WAIT_UPLOAD, gc_horizon: DEFAULT_GC_HORIZON, gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD) .expect("cannot parse default gc period"), diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index ca9faad6f5..0a7c9717ca 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -523,8 +523,6 @@ pub struct TenantConfigPatch { #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub l0_flush_stall_threshold: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] - pub l0_flush_wait_upload: FieldPatch, - #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub gc_horizon: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub gc_period: FieldPatch, @@ -614,9 +612,6 @@ pub struct TenantConfig { #[serde(skip_serializing_if = "Option::is_none")] pub l0_flush_stall_threshold: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub l0_flush_wait_upload: Option, - #[serde(skip_serializing_if = "Option::is_none")] pub gc_horizon: Option, @@ -712,7 +707,6 @@ impl TenantConfig { mut compaction_l0_semaphore, mut l0_flush_delay_threshold, mut l0_flush_stall_threshold, - mut l0_flush_wait_upload, mut gc_horizon, mut gc_period, mut image_creation_threshold, @@ -765,7 +759,6 @@ impl TenantConfig { patch .l0_flush_stall_threshold .apply(&mut l0_flush_stall_threshold); - patch.l0_flush_wait_upload.apply(&mut l0_flush_wait_upload); patch.gc_horizon.apply(&mut gc_horizon); patch .gc_period @@ -844,7 +837,6 @@ impl TenantConfig { compaction_l0_semaphore, l0_flush_delay_threshold, l0_flush_stall_threshold, - l0_flush_wait_upload, gc_horizon, gc_period, image_creation_threshold, @@ -911,9 +903,6 @@ impl TenantConfig { l0_flush_stall_threshold: self .l0_flush_stall_threshold .or(global_conf.l0_flush_stall_threshold), - l0_flush_wait_upload: self - .l0_flush_wait_upload - .unwrap_or(global_conf.l0_flush_wait_upload), gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon), gc_period: self.gc_period.unwrap_or(global_conf.gc_period), image_creation_threshold: self diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index f7afaae068..9820d50e7b 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -10,7 +10,7 @@ use std::time::{Duration, Instant}; use enum_map::{Enum as _, EnumMap}; use futures::Future; use metrics::{ - Counter, CounterVec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, + Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, @@ -499,15 +499,6 @@ pub(crate) static WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS: Lazy = Lazy::n .expect("failed to define a metric") }); -static FLUSH_WAIT_UPLOAD_TIME: Lazy = Lazy::new(|| { - register_gauge_vec!( - "pageserver_flush_wait_upload_seconds", - "Time spent waiting for preceding uploads during layer flush", - &["tenant_id", "shard_id", "timeline_id"] - ) - .expect("failed to define a metric") -}); - static LAST_RECORD_LSN: Lazy = Lazy::new(|| { register_int_gauge_vec!( "pageserver_last_record_lsn", @@ -2864,7 +2855,6 @@ pub(crate) struct TimelineMetrics { timeline_id: String, pub flush_time_histo: StorageTimeMetrics, pub flush_delay_histo: StorageTimeMetrics, - pub flush_wait_upload_time_gauge: Gauge, pub compact_time_histo: StorageTimeMetrics, pub create_images_time_histo: StorageTimeMetrics, pub logical_size_histo: StorageTimeMetrics, @@ -2916,9 +2906,6 @@ impl TimelineMetrics { &shard_id, &timeline_id, ); - let flush_wait_upload_time_gauge = FLUSH_WAIT_UPLOAD_TIME - .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) - .unwrap(); let compact_time_histo = StorageTimeMetrics::new( StorageTimeOperation::Compact, &tenant_id, @@ -3046,7 +3033,6 @@ impl TimelineMetrics { timeline_id, flush_time_histo, flush_delay_histo, - flush_wait_upload_time_gauge, compact_time_histo, create_images_time_histo, logical_size_histo, @@ -3096,14 +3082,6 @@ impl TimelineMetrics { self.resident_physical_size_gauge.get() } - pub(crate) fn flush_wait_upload_time_gauge_add(&self, duration: f64) { - self.flush_wait_upload_time_gauge.add(duration); - crate::metrics::FLUSH_WAIT_UPLOAD_TIME - .get_metric_with_label_values(&[&self.tenant_id, &self.shard_id, &self.timeline_id]) - .unwrap() - .add(duration); - } - /// Generates TIMELINE_LAYER labels for a persistent layer. fn make_layer_labels(&self, layer_desc: &PersistentLayerDesc) -> [&str; 5] { let level = match LayerMap::is_l0(&layer_desc.key_range, layer_desc.is_delta()) { @@ -3207,7 +3185,6 @@ impl TimelineMetrics { let shard_id = &self.shard_id; let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = DISK_CONSISTENT_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]); - let _ = FLUSH_WAIT_UPLOAD_TIME.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]); { RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get()); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a4b59603d1..75f9225302 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -84,8 +84,8 @@ use self::eviction_task::EvictionTaskTimelineState; use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; +use super::remote_timeline_client::RemoteTimelineClient; use super::remote_timeline_client::index::{GcCompactionState, IndexPart}; -use super::remote_timeline_client::{RemoteTimelineClient, WaitCompletionError}; use super::secondary::heatmap::HeatMapLayer; use super::storage_layer::{LayerFringe, LayerVisibilityHint, ReadableLayer}; use super::tasks::log_compaction_error; @@ -2562,14 +2562,6 @@ impl Timeline { Some(max(l0_flush_stall_threshold, compaction_threshold)) } - fn get_l0_flush_wait_upload(&self) -> bool { - let tenant_conf = self.tenant_conf.load(); - tenant_conf - .tenant_conf - .l0_flush_wait_upload - .unwrap_or(self.conf.default_tenant_conf.l0_flush_wait_upload) - } - fn get_image_creation_threshold(&self) -> usize { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -4591,27 +4583,6 @@ impl Timeline { // release lock on 'layers' }; - // Backpressure mechanism: wait with continuation of the flush loop until we have uploaded all layer files. - // This makes us refuse ingest until the new layers have been persisted to the remote - // TODO: remove this, and rely on l0_flush_{delay,stall}_threshold instead. - if self.get_l0_flush_wait_upload() { - let start = Instant::now(); - self.remote_client - .wait_completion() - .await - .map_err(|e| match e { - WaitCompletionError::UploadQueueShutDownOrStopped - | WaitCompletionError::NotInitialized( - NotInitialized::ShuttingDown | NotInitialized::Stopped, - ) => FlushLayerError::Cancelled, - WaitCompletionError::NotInitialized(NotInitialized::Uninitialized) => { - FlushLayerError::Other(anyhow!(e).into()) - } - })?; - let duration = start.elapsed().as_secs_f64(); - self.metrics.flush_wait_upload_time_gauge_add(duration); - } - // FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`, // a compaction can delete the file and then it won't be available for uploads any more. // We still schedule the upload, resulting in an error, but ideally we'd somehow avoid this diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 54e6458ac6..7cf60f2ab2 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -168,7 +168,6 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = ( "pageserver_evictions_with_low_residence_duration_total", "pageserver_aux_file_estimated_size", "pageserver_valid_lsn_lease_count", - "pageserver_flush_wait_upload_seconds", counter("pageserver_tenant_throttling_count_accounted_start"), counter("pageserver_tenant_throttling_count_accounted_finish"), counter("pageserver_tenant_throttling_wait_usecs_sum"), diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 8360072a2d..8568bec8b2 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -145,7 +145,6 @@ def test_fully_custom_config(positive_env: NeonEnv): "compaction_l0_semaphore": False, "l0_flush_delay_threshold": 25, "l0_flush_stall_threshold": 42, - "l0_flush_wait_upload": True, "compaction_target_size": 1048576, "checkpoint_distance": 10000, "checkpoint_timeout": "13m",