diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 383c174684..dd37bfc407 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -388,6 +388,11 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'image_creation_check_threshold' as integer")?, + image_creation_preempt_threshold: settings + .remove("image_creation_preempt_threshold") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'image_creation_preempt_threshold' as integer")?, pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()), walreceiver_connect_timeout: settings .remove("walreceiver_connect_timeout") diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 422da0dc95..a0b5feea94 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -323,6 +323,10 @@ pub struct TenantConfigToml { // Expresed in multiples of checkpoint distance. pub image_layer_creation_check_threshold: u8, + // How many multiples of L0 `compaction_threshold` will preempt image layer creation and do L0 compaction. + // Set to 0 to disable preemption. + pub image_creation_preempt_threshold: usize, + /// The length for an explicit LSN lease request. /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval. #[serde(with = "humantime_serde")] @@ -547,6 +551,10 @@ pub mod tenant_conf_defaults { // Relevant: https://github.com/neondatabase/neon/issues/3394 pub const DEFAULT_GC_PERIOD: &str = "1 hr"; pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3; + // If there are more than threshold * compaction_threshold (that is 3 * 10 in the default config) L0 layers, image + // layer creation will end immediately. Set to 0 to disable. The target default will be 3 once we + // want to enable this feature. + pub const DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD: usize = 0; pub const DEFAULT_PITR_INTERVAL: &str = "7 days"; pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds"; pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds"; @@ -605,6 +613,7 @@ impl Default for TenantConfigToml { lazy_slru_download: false, timeline_get_throttle: crate::models::ThrottleConfig::disabled(), image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD, + image_creation_preempt_threshold: DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD, lsn_lease_length: LsnLease::DEFAULT_LENGTH, lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS, timeline_offloading: false, diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 43447c67bd..19beb37ab3 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -498,6 +498,8 @@ pub struct TenantConfigPatch { #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub image_layer_creation_check_threshold: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub image_creation_preempt_threshold: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub lsn_lease_length: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub lsn_lease_length_for_ts: FieldPatch, @@ -544,6 +546,7 @@ pub struct TenantConfig { pub lazy_slru_download: Option, pub timeline_get_throttle: Option, pub image_layer_creation_check_threshold: Option, + pub image_creation_preempt_threshold: Option, pub lsn_lease_length: Option, pub lsn_lease_length_for_ts: Option, pub timeline_offloading: Option, @@ -581,6 +584,7 @@ impl TenantConfig { mut lazy_slru_download, mut timeline_get_throttle, mut image_layer_creation_check_threshold, + mut image_creation_preempt_threshold, mut lsn_lease_length, mut lsn_lease_length_for_ts, mut timeline_offloading, @@ -635,6 +639,9 @@ impl TenantConfig { patch .image_layer_creation_check_threshold .apply(&mut image_layer_creation_check_threshold); + patch + .image_creation_preempt_threshold + .apply(&mut image_creation_preempt_threshold); patch.lsn_lease_length.apply(&mut lsn_lease_length); patch .lsn_lease_length_for_ts @@ -679,6 +686,7 @@ impl TenantConfig { lazy_slru_download, timeline_get_throttle, image_layer_creation_check_threshold, + image_creation_preempt_threshold, lsn_lease_length, lsn_lease_length_for_ts, timeline_offloading, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 1914a95562..80a61eba92 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -5486,6 +5486,9 @@ pub(crate) mod harness { image_layer_creation_check_threshold: Some( tenant_conf.image_layer_creation_check_threshold, ), + image_creation_preempt_threshold: Some( + tenant_conf.image_creation_preempt_threshold, + ), lsn_lease_length: Some(tenant_conf.lsn_lease_length), lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts), timeline_offloading: Some(tenant_conf.timeline_offloading), diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 139ed27bd2..972837dc44 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -357,6 +357,9 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] pub image_layer_creation_check_threshold: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub image_creation_preempt_threshold: Option, + #[serde(skip_serializing_if = "Option::is_none")] #[serde(with = "humantime_serde")] #[serde(default)] @@ -453,6 +456,9 @@ impl TenantConfOpt { image_layer_creation_check_threshold: self .image_layer_creation_check_threshold .unwrap_or(global_conf.image_layer_creation_check_threshold), + image_creation_preempt_threshold: self + .image_creation_preempt_threshold + .unwrap_or(global_conf.image_creation_preempt_threshold), lsn_lease_length: self .lsn_lease_length .unwrap_or(global_conf.lsn_lease_length), @@ -504,6 +510,7 @@ impl TenantConfOpt { mut lazy_slru_download, mut timeline_get_throttle, mut image_layer_creation_check_threshold, + mut image_creation_preempt_threshold, mut lsn_lease_length, mut lsn_lease_length_for_ts, mut timeline_offloading, @@ -578,6 +585,9 @@ impl TenantConfOpt { patch .image_layer_creation_check_threshold .apply(&mut image_layer_creation_check_threshold); + patch + .image_creation_preempt_threshold + .apply(&mut image_creation_preempt_threshold); patch .lsn_lease_length .map(|v| humantime::parse_duration(&v))? @@ -626,6 +636,7 @@ impl TenantConfOpt { lazy_slru_download, timeline_get_throttle, image_layer_creation_check_threshold, + image_creation_preempt_threshold, lsn_lease_length, lsn_lease_length_for_ts, timeline_offloading, @@ -689,6 +700,7 @@ impl From for models::TenantConfig { lazy_slru_download: value.lazy_slru_download, timeline_get_throttle: value.timeline_get_throttle, image_layer_creation_check_threshold: value.image_layer_creation_check_threshold, + image_creation_preempt_threshold: value.image_creation_preempt_threshold, lsn_lease_length: value.lsn_lease_length.map(humantime), lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime), timeline_offloading: value.timeline_offloading, diff --git a/pageserver/src/tenant/storage_layer/batch_split_writer.rs b/pageserver/src/tenant/storage_layer/batch_split_writer.rs index 22d8b81bcc..7da51c27df 100644 --- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs +++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs @@ -166,6 +166,10 @@ impl BatchLayerWriter { // END: catch every error and do the recovery in the above section Ok(generated_layers) } + + pub fn pending_layer_num(&self) -> usize { + self.generated_layer_writers.len() + } } /// An image writer that takes images and produces multiple image layers. diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d6a8eaa4d9..d65b382e50 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -189,6 +189,14 @@ pub enum ImageLayerCreationMode { Initial, } +#[derive(Clone, Debug, Default)] +pub enum LastImageLayerCreationStatus { + Incomplete, // TODO: record the last key being processed + Complete, + #[default] + Initial, +} + impl std::fmt::Display for ImageLayerCreationMode { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{:?}", self) @@ -347,6 +355,8 @@ pub struct Timeline { // garbage collecting data that is still needed by the child timelines. pub(crate) gc_info: std::sync::RwLock, + pub(crate) last_image_layer_creation_status: ArcSwap, + // It may change across major versions so for simplicity // keep it after running initdb for a timeline. // It is needed in checks when we want to error on some operations @@ -936,9 +946,16 @@ pub(crate) enum ShutdownMode { Hard, } -struct ImageLayerCreationOutcome { - unfinished_image_layer: Option, - next_start_key: Key, +enum ImageLayerCreationOutcome { + /// We generated an image layer + Generated { + unfinished_image_layer: ImageLayerWriter, + }, + /// The key range is empty + Empty, + /// (Only used in metadata image layer creation), after reading the metadata keys, we decide to skip + /// the image layer creation. + Skip, } /// Public interface functions @@ -2349,6 +2366,18 @@ impl Timeline { ) } + fn get_image_creation_preempt_threshold(&self) -> usize { + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .image_creation_preempt_threshold + .unwrap_or( + self.conf + .default_tenant_conf + .image_creation_preempt_threshold, + ) + } + /// Resolve the effective WAL receiver protocol to use for this tenant. /// /// Priority order is: @@ -2499,6 +2528,10 @@ impl Timeline { gc_info: std::sync::RwLock::new(GcInfo::default()), + last_image_layer_creation_status: ArcSwap::new(Arc::new( + LastImageLayerCreationStatus::default(), + )), + latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()), initdb_lsn: metadata.initdb_lsn(), @@ -4042,15 +4075,20 @@ impl Timeline { } let mut layers_to_upload = Vec::new(); - layers_to_upload.extend( - self.create_image_layers( + let (generated_image_layers, is_complete) = self + .create_image_layers( &partitions, self.initdb_lsn, ImageLayerCreationMode::Initial, ctx, + LastImageLayerCreationStatus::Initial, ) - .await?, + .await?; + debug_assert!( + matches!(is_complete, LastImageLayerCreationStatus::Complete), + "init image generation mode must fully cover the keyspace" ); + layers_to_upload.extend(generated_image_layers); (layers_to_upload, None) } else { @@ -4370,7 +4408,6 @@ impl Timeline { lsn: Lsn, ctx: &RequestContext, img_range: Range, - start: Key, io_concurrency: IoConcurrency, ) -> Result { let mut wrote_keys = false; @@ -4458,26 +4495,23 @@ impl Timeline { lsn }, ); - Ok(ImageLayerCreationOutcome { - unfinished_image_layer: Some(image_layer_writer), - next_start_key: img_range.end, + Ok(ImageLayerCreationOutcome::Generated { + unfinished_image_layer: image_layer_writer, }) } else { - // Special case: the image layer may be empty if this is a sharded tenant and the - // partition does not cover any keys owned by this shard. In this case, to ensure - // we don't leave gaps between image layers, leave `start` where it is, so that the next - // layer we write will cover the key range that we just scanned. tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); - Ok(ImageLayerCreationOutcome { - unfinished_image_layer: None, - next_start_key: start, - }) + Ok(ImageLayerCreationOutcome::Empty) } } /// Create an image layer for metadata keys. This function produces one image layer for all metadata /// keys for now. Because metadata keys cannot exceed basebackup size limit, the image layer for it /// would not be too large to fit in a single image layer. + /// + /// Creating image layers for metadata keys are different from relational keys. Firstly, instead of + /// iterating each key and get an image for each of them, we do a `vectored_get` scan over the sparse + /// keyspace to get all images in one run. Secondly, we use a different image layer generation metrics + /// for metadata keys than relational keys, which is the number of delta files visited during the scan. #[allow(clippy::too_many_arguments)] async fn create_image_layer_for_metadata_keys( self: &Arc, @@ -4487,12 +4521,13 @@ impl Timeline { ctx: &RequestContext, img_range: Range, mode: ImageLayerCreationMode, - start: Key, io_concurrency: IoConcurrency, ) -> Result { // Metadata keys image layer creation. let mut reconstruct_state = ValuesReconstructState::new(io_concurrency); let begin = Instant::now(); + // Directly use `get_vectored_impl` to skip the max_vectored_read_key limit check. Note that the keyspace should + // not contain too many keys, otherwise this takes a lot of memory. let data = self .get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx) .await?; @@ -4517,10 +4552,7 @@ impl Timeline { ); if !trigger_generation && mode == ImageLayerCreationMode::Try { - return Ok(ImageLayerCreationOutcome { - unfinished_image_layer: None, - next_start_key: img_range.end, - }); + return Ok(ImageLayerCreationOutcome::Skip); } if self.cancel.is_cancelled() { return Err(CreateImageLayersError::Cancelled); @@ -4551,20 +4583,12 @@ impl Timeline { lsn } ); - Ok(ImageLayerCreationOutcome { - unfinished_image_layer: Some(image_layer_writer), - next_start_key: img_range.end, + Ok(ImageLayerCreationOutcome::Generated { + unfinished_image_layer: image_layer_writer, }) } else { - // Special case: the image layer may be empty if this is a sharded tenant and the - // partition does not cover any keys owned by this shard. In this case, to ensure - // we don't leave gaps between image layers, leave `start` where it is, so that the next - // layer we write will cover the key range that we just scanned. tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); - Ok(ImageLayerCreationOutcome { - unfinished_image_layer: None, - next_start_key: start, - }) + Ok(ImageLayerCreationOutcome::Empty) } } @@ -4620,6 +4644,8 @@ impl Timeline { decision } + /// Returns the image layers generated and an enum indicating whether the process is fully completed. + /// true = we have generate all image layers, false = we preempt the process for L0 compaction. #[tracing::instrument(skip_all, fields(%lsn, %mode))] async fn create_image_layers( self: &Arc, @@ -4627,7 +4653,8 @@ impl Timeline { lsn: Lsn, mode: ImageLayerCreationMode, ctx: &RequestContext, - ) -> Result, CreateImageLayersError> { + last_status: LastImageLayerCreationStatus, + ) -> Result<(Vec, LastImageLayerCreationStatus), CreateImageLayersError> { let timer = self.metrics.create_images_time_histo.start_timer(); // We need to avoid holes between generated image layers. @@ -4641,10 +4668,23 @@ impl Timeline { // image layers <100000000..100000099> and <200000000..200000199> are not completely covering it. let mut start = Key::MIN; - let check_for_image_layers = self.should_check_if_image_layers_required(lsn); + let check_for_image_layers = if let LastImageLayerCreationStatus::Incomplete = last_status { + info!( + "resuming image layer creation: last_status={:?}", + last_status + ); + true + } else { + self.should_check_if_image_layers_required(lsn) + }; let mut batch_image_writer = BatchLayerWriter::new(self.conf).await?; + let mut all_generated = true; + + let mut partition_processed = 0; + let total_partitions = partitioning.parts.len(); + for partition in partitioning.parts.iter() { if self.cancel.is_cancelled() { return Err(CreateImageLayersError::Cancelled); @@ -4717,17 +4757,13 @@ impl Timeline { .map_err(|_| CreateImageLayersError::Cancelled)?, ); - let ImageLayerCreationOutcome { - unfinished_image_layer, - next_start_key, - } = if !compact_metadata { + let outcome = if !compact_metadata { self.create_image_layer_for_rel_blocks( partition, image_layer_writer, lsn, ctx, img_range.clone(), - start, io_concurrency, ) .await? @@ -4739,18 +4775,58 @@ impl Timeline { ctx, img_range.clone(), mode, - start, io_concurrency, ) .await? }; - start = next_start_key; - if let Some(unfinished_image_layer) = unfinished_image_layer { - batch_image_writer.add_unfinished_image_writer( + match outcome { + ImageLayerCreationOutcome::Empty => { + // No data in this partition, so we don't need to create an image layer (for now). + // The next image layer should cover this key range, so we don't advance the `start` + // key. + } + ImageLayerCreationOutcome::Generated { unfinished_image_layer, - img_range, - lsn, - ); + } => { + batch_image_writer.add_unfinished_image_writer( + unfinished_image_layer, + img_range.clone(), + lsn, + ); + // The next image layer should be generated right after this one. + start = img_range.end; + } + ImageLayerCreationOutcome::Skip => { + // We don't need to create an image layer for this partition. + // The next image layer should NOT cover this range, otherwise + // the keyspace becomes empty (reads don't go past image layers). + start = img_range.end; + } + } + + partition_processed += 1; + + if let ImageLayerCreationMode::Try = mode { + // We have at least made some progress + if batch_image_writer.pending_layer_num() >= 1 { + // The `Try` mode is currently only used on the compaction path. We want to avoid + // image layer generation taking too long time and blocking L0 compaction. So in this + // mode, we also inspect the current number of L0 layers and skip image layer generation + // if there are too many of them. + let num_of_l0_layers = { + let layers = self.layers.read().await; + layers.layer_map()?.level0_deltas().len() + }; + let image_preempt_threshold = self.get_image_creation_preempt_threshold() + * self.get_compaction_threshold(); + if image_preempt_threshold != 0 && num_of_l0_layers >= image_preempt_threshold { + tracing::info!( + "preempt image layer generation at {start} at {lsn}: too many L0 layers {num_of_l0_layers}", + ); + all_generated = false; + break; + } + } } } @@ -4765,14 +4841,35 @@ impl Timeline { .open_mut()? .track_new_image_layers(&image_layers, &self.metrics); drop_wlock(guard); - timer.stop_and_record(); + let duration = timer.stop_and_record(); // Creating image layers may have caused some previously visible layers to be covered if !image_layers.is_empty() { self.update_layer_visibility().await?; } - Ok(image_layers) + let total_layer_size = image_layers + .iter() + .map(|l| l.metadata().file_size) + .sum::(); + + info!( + "created {} image layers ({} bytes) in {}s, processed {} out of {} partitions", + image_layers.len(), + total_layer_size, + duration.as_secs_f64(), + partition_processed, + total_partitions + ); + + Ok(( + image_layers, + if all_generated { + LastImageLayerCreationStatus::Complete + } else { + LastImageLayerCreationStatus::Incomplete + }, + )) } /// Wait until the background initial logical size calculation is complete, or diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 7244e946cb..9bd61bbac5 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -11,7 +11,7 @@ use std::sync::Arc; use super::layer_manager::LayerManager; use super::{ CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode, - RecordedDuration, Timeline, + LastImageLayerCreationStatus, RecordedDuration, Timeline, }; use anyhow::{anyhow, bail, Context}; @@ -709,7 +709,7 @@ impl Timeline { .extend(sparse_partitioning.into_dense().parts); // 3. Create new image layers for partitions that have been modified "enough". - let image_layers = self + let (image_layers, outcome) = self .create_image_layers( &partitioning, lsn, @@ -722,10 +722,22 @@ impl Timeline { ImageLayerCreationMode::Try }, &image_ctx, + self.last_image_layer_creation_status + .load() + .as_ref() + .clone(), ) .await?; + self.last_image_layer_creation_status + .store(Arc::new(outcome.clone())); + self.upload_new_image_layers(image_layers)?; + if let LastImageLayerCreationStatus::Incomplete = outcome { + // Yield and do not do any other kind of compaction. + info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction)."); + return Ok(true); + } partitioning.parts.len() } Err(err) => { @@ -3232,11 +3244,7 @@ impl TimelineAdaptor { ranges: self.get_keyspace(key_range, lsn, ctx).await?, }; // TODO set proper (stateful) start. The create_image_layer_for_rel_blocks function mostly - let start = Key::MIN; - let ImageLayerCreationOutcome { - unfinished_image_layer, - next_start_key: _, - } = self + let outcome = self .timeline .create_image_layer_for_rel_blocks( &keyspace, @@ -3244,13 +3252,15 @@ impl TimelineAdaptor { lsn, ctx, key_range.clone(), - start, IoConcurrency::sequential(), ) .await?; - if let Some(image_layer_writer) = unfinished_image_layer { - let (desc, path) = image_layer_writer.finish(ctx).await?; + if let ImageLayerCreationOutcome::Generated { + unfinished_image_layer, + } = outcome + { + let (desc, path) = unfinished_image_layer.finish(ctx).await?; let image_layer = Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?; self.new_images.push(image_layer); diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index e88d245c8f..a4b9eabf8e 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -184,6 +184,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "gc_compaction_enabled": True, "gc_compaction_initial_threshold_kb": 1024000, "gc_compaction_ratio_percent": 200, + "image_creation_preempt_threshold": 5, } vps_http = env.storage_controller.pageserver_api()