From 62d77e263f2b3f4b6847b6a9a14c319da6cfbfa4 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 27 Feb 2024 10:55:10 +0100 Subject: [PATCH] test_remote_timeline_client_calls_started_metric: fix flakiness (#6911) fixes https://github.com/neondatabase/neon/issues/6889 # Problem The failure in the last 3 flaky runs on `main` is ``` test_runner/regress/test_remote_storage.py:460: in test_remote_timeline_client_calls_started_metric churn("a", "b") test_runner/regress/test_remote_storage.py:457: in churn assert gc_result["layers_removed"] > 0 E assert 0 > 0 ``` That's this code https://github.com/neondatabase/neon/blob/cd449d66ea29ad2d7269458e90623c3ae40e1816/test_runner/regress/test_remote_storage.py#L448-L460 So, the test expects GC to remove some layers but the GC doesn't. # Fix My impression is that the VACUUM isn't re-using pages aggressively enough, but I can't really prove that. Tried to analyze the layer map dump but it's too complex. So, this PR: - Creates more churn by doing the overwrite twice. - Forces image layer creation. It also drive-by removes the redundant call to timeline_compact, because, timeline_checkpoint already does that internally. --- pageserver/src/http/routes.rs | 8 ++++++++ pageserver/src/tenant/timeline.rs | 8 +++++++- test_runner/fixtures/pageserver/http.py | 6 ++++++ test_runner/regress/test_remote_storage.py | 16 ++++++++++------ 4 files changed, 31 insertions(+), 7 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 1339229a70..04211fbb7f 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1616,6 +1616,10 @@ async fn timeline_compact_handler( if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? { flags |= CompactFlags::ForceRepartition; } + if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? 
{ + flags |= CompactFlags::ForceImageLayerCreation; + } + async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; @@ -1642,6 +1646,10 @@ async fn timeline_checkpoint_handler( if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? { flags |= CompactFlags::ForceRepartition; } + if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? { + flags |= CompactFlags::ForceImageLayerCreation; + } + async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f09617849c..b14eafa194 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -503,6 +503,7 @@ pub enum GetLogicalSizePriority { #[derive(enumset::EnumSetType)] pub(crate) enum CompactFlags { ForceRepartition, + ForceImageLayerCreation, } impl std::fmt::Debug for Timeline { @@ -1157,7 +1158,12 @@ impl Timeline { // 3. Create new image layers for partitions that have been modified // "enough". 
let layers = self - .create_image_layers(&partitioning, lsn, false, &image_ctx) + .create_image_layers( + &partitioning, + lsn, + flags.contains(CompactFlags::ForceImageLayerCreation), + &image_ctx, + ) .await .map_err(anyhow::Error::from)?; if let Some(remote_client) = &self.remote_client { diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 427ef00c78..ad3efb5837 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -549,11 +549,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter): tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, force_repartition=False, + force_image_layer_creation=False, ): self.is_testing_enabled_or_skip() query = {} if force_repartition: query["force_repartition"] = "true" + if force_image_layer_creation: + query["force_image_layer_creation"] = "true" log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}") res = self.put( @@ -608,11 +611,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter): tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, force_repartition=False, + force_image_layer_creation=False, ): self.is_testing_enabled_or_skip() query = {} if force_repartition: query["force_repartition"] = "true" + if force_image_layer_creation: + query["force_image_layer_creation"] = "true" log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}") res = self.put( diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 176a5e57dc..73ebe0a76f 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -382,6 +382,7 @@ def test_remote_timeline_client_calls_started_metric( initial_tenant_conf={ # small checkpointing and compaction targets to ensure we generate many upload operations "checkpoint_distance": f"{128 * 1024}", + # ensure each timeline_checkpoint() 
call creates L1s "compaction_threshold": "1", "compaction_target_size": f"{128 * 1024}", # no PITR horizon, we specify the horizon when we request on-demand GC @@ -389,8 +390,6 @@ def test_remote_timeline_client_calls_started_metric( # disable background compaction and GC. We invoke it manually when we want it to happen. "gc_period": "0s", "compaction_period": "0s", - # create image layers eagerly, so that GC can remove some layers - "image_creation_threshold": "1", } ) @@ -449,12 +448,17 @@ def test_remote_timeline_client_calls_started_metric( ), f"observations for {file_kind} {op_kind} did not grow monotonically: {observations}" def churn(data_pass1, data_pass2): + # overwrite the same data in place, vacuum in between, + # and create image layers; then run a gc(). + # this should + # - create new layers + # - delete some layers overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass1) - client.timeline_checkpoint(tenant_id, timeline_id) - client.timeline_compact(tenant_id, timeline_id) overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass2) - client.timeline_checkpoint(tenant_id, timeline_id) - client.timeline_compact(tenant_id, timeline_id) + client.timeline_checkpoint(tenant_id, timeline_id, force_image_layer_creation=True) + overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass1) + overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass2) + client.timeline_checkpoint(tenant_id, timeline_id, force_image_layer_creation=True) gc_result = client.timeline_gc(tenant_id, timeline_id, 0) print_gc_result(gc_result) assert gc_result["layers_removed"] > 0