pageserver: check for new image layers based on ingested WAL (#7230)

## Problem
Part of the legacy (but current) compaction algorithm is to find a stack
of overlapping delta layers which will be turned
into an image layer. This operation is exponential in terms of the
number of matching layers and we do it roughly every 20 seconds.

## Summary of changes
Only check if a new image layer is required if we've ingested a certain
amount of WAL since the last check.
The amount of wal is expressed in terms of multiples of checkpoint
distance, with the intuition being that
that there's little point doing the check if we only have two new L1
layers (not enough to create a new image).
This commit is contained in:
Vlad Lazar
2024-03-28 17:44:55 +00:00
committed by GitHub
parent 39d1818ae9
commit 090123a429
11 changed files with 69 additions and 1 deletions

View File

@@ -389,6 +389,10 @@ impl PageServerNode {
.remove("image_creation_threshold")
.map(|x| x.parse::<usize>())
.transpose()?,
image_layer_creation_check_threshold: settings
.remove("image_layer_creation_check_threshold")
.map(|x| x.parse::<u8>())
.transpose()?,
pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
walreceiver_connect_timeout: settings
.remove("walreceiver_connect_timeout")
@@ -501,6 +505,12 @@ impl PageServerNode {
.map(|x| x.parse::<usize>())
.transpose()
.context("Failed to parse 'image_creation_threshold' as non zero integer")?,
image_layer_creation_check_threshold: settings
.remove("image_layer_creation_check_threshold")
.map(|x| x.parse::<u8>())
.transpose()
.context("Failed to parse 'image_creation_check_threshold' as integer")?,
pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
walreceiver_connect_timeout: settings
.remove("walreceiver_connect_timeout")

View File

@@ -301,6 +301,7 @@ pub struct TenantConfig {
pub heatmap_period: Option<String>,
pub lazy_slru_download: Option<bool>,
pub timeline_get_throttle: Option<ThrottleConfig>,
pub image_layer_creation_check_threshold: Option<u8>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]

View File

@@ -3653,6 +3653,9 @@ pub(crate) mod harness {
heatmap_period: Some(tenant_conf.heatmap_period),
lazy_slru_download: Some(tenant_conf.lazy_slru_download),
timeline_get_throttle: Some(tenant_conf.timeline_get_throttle),
image_layer_creation_check_threshold: Some(
tenant_conf.image_layer_creation_check_threshold,
),
}
}
}

View File

@@ -57,6 +57,9 @@ pub mod defaults {
// throughputs up to 1GiB/s per timeline.
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
// By default ingest enough WAL for two new L0 layers before checking if new image
// image layers should be created.
pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
}
@@ -362,6 +365,10 @@ pub struct TenantConf {
pub lazy_slru_download: bool,
pub timeline_get_throttle: pageserver_api::models::ThrottleConfig,
// How much WAL must be ingested before checking again whether a new image layer is required.
// Expresed in multiples of checkpoint distance.
pub image_layer_creation_check_threshold: u8,
}
/// Same as TenantConf, but this struct preserves the information about
@@ -454,6 +461,9 @@ pub struct TenantConfOpt {
#[serde(skip_serializing_if = "Option::is_none")]
pub timeline_get_throttle: Option<pageserver_api::models::ThrottleConfig>,
#[serde(skip_serializing_if = "Option::is_none")]
pub image_layer_creation_check_threshold: Option<u8>,
}
impl TenantConfOpt {
@@ -508,6 +518,9 @@ impl TenantConfOpt {
.timeline_get_throttle
.clone()
.unwrap_or(global_conf.timeline_get_throttle),
image_layer_creation_check_threshold: self
.image_layer_creation_check_threshold
.unwrap_or(global_conf.image_layer_creation_check_threshold),
}
}
}
@@ -548,6 +561,7 @@ impl Default for TenantConf {
heatmap_period: Duration::ZERO,
lazy_slru_download: false,
timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
}
}
}
@@ -621,6 +635,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
heatmap_period: value.heatmap_period.map(humantime),
lazy_slru_download: value.lazy_slru_download,
timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
}
}
}

View File

@@ -309,6 +309,8 @@ pub struct Timeline {
/// Configuration: how often should the partitioning be recalculated.
repartition_threshold: u64,
last_image_layer_creation_check_at: AtomicLsn,
/// Current logical size of the "datadir", at the last LSN.
current_logical_size: LogicalSize,
@@ -1632,6 +1634,15 @@ impl Timeline {
.unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold)
}
fn get_image_layer_creation_check_threshold(&self) -> u8 {
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
tenant_conf.image_layer_creation_check_threshold.unwrap_or(
self.conf
.default_tenant_conf
.image_layer_creation_check_threshold,
)
}
pub(super) fn tenant_conf_updated(&self) {
// NB: Most tenant conf options are read by background loops, so,
// changes will automatically be picked up.
@@ -1769,6 +1780,7 @@ impl Timeline {
},
partitioning: tokio::sync::Mutex::new((KeyPartitioning::new(), Lsn(0))),
repartition_threshold: 0,
last_image_layer_creation_check_at: AtomicLsn::new(0),
last_received_wal: Mutex::new(None),
rel_size_cache: RwLock::new(HashMap::new()),
@@ -1797,6 +1809,7 @@ impl Timeline {
};
result.repartition_threshold =
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
result
.metrics
.last_record_gauge
@@ -3501,6 +3514,24 @@ impl Timeline {
// Is it time to create a new image layer for the given partition?
async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool {
let last = self.last_image_layer_creation_check_at.load();
if lsn != Lsn(0) {
let distance = lsn
.checked_sub(last)
.expect("Attempt to compact with LSN going backwards");
let min_distance = self.get_image_layer_creation_check_threshold() as u64
* self.get_checkpoint_distance();
// Skip the expensive delta layer counting below if we've not ingested
// sufficient WAL since the last check.
if distance.0 < min_distance {
return false;
}
}
self.last_image_layer_creation_check_at.store(lsn);
let threshold = self.get_image_creation_threshold();
let guard = self.layers.read().await;

View File

@@ -189,6 +189,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
},
"trace_read_requests": True,
"walreceiver_connect_timeout": "13m",
"image_layer_creation_check_threshold": 1,
}
ps_http = env.pageserver.http_client()

View File

@@ -165,6 +165,7 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder):
"compaction_threshold": "3",
# "image_creation_threshold": set at runtime
"compaction_target_size": f"{128 * (1024**2)}", # make it so that we only have 1 partition => image coverage for delta layers => enables gc of delta layers
"image_layer_creation_check_threshold": "0", # always check if a new image layer can be created
}
def tenant_update_config(changes):

View File

@@ -53,6 +53,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
"checkpoint_timeout": "24h", # something we won't reach
"checkpoint_distance": f"{50 * (1024**2)}", # something we won't reach, we checkpoint manually
"image_creation_threshold": "100", # we want to control when image is created
"image_layer_creation_check_threshold": "0",
"compaction_threshold": f"{l0_l1_threshold}",
"compaction_target_size": f"{128 * (1024**3)}", # make it so that we only have 1 partition => image coverage for delta layers => enables gc of delta layers
}

View File

@@ -568,6 +568,8 @@ def test_compaction_downloads_on_demand_with_image_creation(neon_env_builder: Ne
"image_creation_threshold": 100,
# repartitioning parameter, unused
"compaction_target_size": 128 * 1024**2,
# Always check if a new image layer can be created
"image_layer_creation_check_threshold": 0,
# pitr_interval and gc_horizon are not interesting because we dont run gc
}
@@ -632,7 +634,8 @@ def test_compaction_downloads_on_demand_with_image_creation(neon_env_builder: Ne
# threshold to expose image creation to downloading all of the needed
# layers -- threshold of 2 would sound more reasonable, but keeping it as 1
# to be less flaky
env.neon_cli.config_tenant(tenant_id, {"image_creation_threshold": "1"})
conf["image_creation_threshold"] = "1"
env.neon_cli.config_tenant(tenant_id, {k: str(v) for k, v in conf.items()})
pageserver_http.timeline_compact(tenant_id, timeline_id)
layers = pageserver_http.layer_map_info(tenant_id, timeline_id)

View File

@@ -53,6 +53,7 @@ TENANT_CONF = {
"compaction_period": "0s",
# create image layers eagerly, so that GC can remove some layers
"image_creation_threshold": "1",
"image_layer_creation_check_threshold": "0",
}

View File

@@ -245,6 +245,7 @@ def test_remote_storage_upload_queue_retries(
"compaction_period": "0s",
# create image layers eagerly, so that GC can remove some layers
"image_creation_threshold": "1",
"image_layer_creation_check_threshold": "0",
}
)