test_remote_timeline_client_calls_started_metric: fix flakiness (#6911)

fixes https://github.com/neondatabase/neon/issues/6889

# Problem

The failure in the last 3 flaky runs on `main` is:

```
test_runner/regress/test_remote_storage.py:460: in test_remote_timeline_client_calls_started_metric
    churn("a", "b")
test_runner/regress/test_remote_storage.py:457: in churn
    assert gc_result["layers_removed"] > 0
E   assert 0 > 0
```

That's this code: `cd449d66ea/test_runner/regress/test_remote_storage.py` (L448-L460)

So, the test expects GC to remove some layers, but GC doesn't remove any.
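
For reference, the `churn` helper that drives this assertion looked roughly like this before this PR (a sketch reconstructed from the diff below; `client`, `tenant_id`, `timeline_id`, and `overwrite_data_and_wait_for_it_to_arrive_at_pageserver` are set up earlier in the test):

```python
def churn(data_pass1, data_pass2):
    # one overwrite per pass, checkpoint + compact after each pass,
    # then GC with a zero horizon and expect layers to be removed
    overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass1)
    client.timeline_checkpoint(tenant_id, timeline_id)
    client.timeline_compact(tenant_id, timeline_id)
    overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass2)
    client.timeline_checkpoint(tenant_id, timeline_id)
    client.timeline_compact(tenant_id, timeline_id)
    gc_result = client.timeline_gc(tenant_id, timeline_id, 0)
    print_gc_result(gc_result)
    assert gc_result["layers_removed"] > 0
```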

# Fix

My impression is that VACUUM isn't re-using pages aggressively enough, but I can't really prove that. I tried to analyze the layer map dump, but it's too complex.

So, this PR:

- Creates more churn by doing the overwrite twice.
- Forces image layer creation.

As a drive-by, it also removes the redundant call to `timeline_compact`, because `timeline_checkpoint` already compacts internally.
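
In the test, the churn sequence therefore becomes roughly the following (a sketch of the new flow; the exact diff is at the bottom of this page):

```python
def churn(data_pass1, data_pass2):
    overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass1)
    client.timeline_checkpoint(tenant_id, timeline_id)
    overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass2)
    client.timeline_checkpoint(tenant_id, timeline_id, force_image_layer_creation=True)
    # second round of churn so GC has superseded layers to collect
    overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass1)
    overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass2)
    client.timeline_checkpoint(tenant_id, timeline_id, force_image_layer_creation=True)
    gc_result = client.timeline_gc(tenant_id, timeline_id, 0)
    print_gc_result(gc_result)
    assert gc_result["layers_removed"] > 0
```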
Commit 62d77e263f (parent b2bbc20311), authored by Christian Schwarz on 2024-02-27 10:55:10 +01:00, committed via GitHub.

4 changed files with 31 additions and 7 deletions

`pageserver/src/http/routes.rs`

```diff
@@ -1616,6 +1616,10 @@ async fn timeline_compact_handler(
     if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
         flags |= CompactFlags::ForceRepartition;
     }
+    if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
+        flags |= CompactFlags::ForceImageLayerCreation;
+    }
     async {
         let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
         let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
@@ -1642,6 +1646,10 @@ async fn timeline_checkpoint_handler(
     if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
         flags |= CompactFlags::ForceRepartition;
     }
+    if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
+        flags |= CompactFlags::ForceImageLayerCreation;
+    }
     async {
         let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
         let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
```
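
Both handlers now accept an optional `force_image_layer_creation` query parameter alongside `force_repartition`. A minimal sketch of driving it over the management API (the base URL and port are assumptions; the path follows the `/v1/tenant/.../timeline/.../checkpoint` layout used by the Python client below):

```python
import requests

PAGESERVER_HTTP = "http://localhost:9898"  # assumed pageserver management API address
tenant_id = "<tenant id>"      # hypothetical placeholder
timeline_id = "<timeline id>"  # hypothetical placeholder

# Trigger a checkpoint and force image layer creation even if the
# image creation threshold would not normally be met.
res = requests.put(
    f"{PAGESERVER_HTTP}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint",
    params={"force_image_layer_creation": "true"},
)
res.raise_for_status()
```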

`pageserver/src/tenant/timeline.rs`

```diff
@@ -503,6 +503,7 @@ pub enum GetLogicalSizePriority {
 #[derive(enumset::EnumSetType)]
 pub(crate) enum CompactFlags {
     ForceRepartition,
+    ForceImageLayerCreation,
 }

 impl std::fmt::Debug for Timeline {
@@ -1157,7 +1158,12 @@ impl Timeline {
         // 3. Create new image layers for partitions that have been modified
         // "enough".
         let layers = self
-            .create_image_layers(&partitioning, lsn, false, &image_ctx)
+            .create_image_layers(
+                &partitioning,
+                lsn,
+                flags.contains(CompactFlags::ForceImageLayerCreation),
+                &image_ctx,
+            )
             .await
             .map_err(anyhow::Error::from)?;

         if let Some(remote_client) = &self.remote_client {
```
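
`CompactFlags` is an `enumset` bitset, so forcing image layer creation threads through compaction as just another flag that is checked at the image-layer-creation step. A rough Python analogue of that pattern, purely for illustration (not Neon code), using `enum.Flag`:

```python
from enum import Flag, auto

class CompactFlags(Flag):
    NONE = 0
    FORCE_REPARTITION = auto()
    FORCE_IMAGE_LAYER_CREATION = auto()

def create_image_layers(force: bool) -> None:
    # with force=True, image layers are created even for partitions that
    # would not normally meet the creation threshold
    print(f"creating image layers, force={force}")

def compact(flags: CompactFlags) -> None:
    # ... repartition, compact L0 layers, etc. ...
    force_images = bool(flags & CompactFlags.FORCE_IMAGE_LAYER_CREATION)
    create_image_layers(force=force_images)

compact(CompactFlags.FORCE_IMAGE_LAYER_CREATION)
```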

`test_runner/fixtures/pageserver/http.py`

```diff
@@ -549,11 +549,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         tenant_id: Union[TenantId, TenantShardId],
         timeline_id: TimelineId,
         force_repartition=False,
+        force_image_layer_creation=False,
     ):
         self.is_testing_enabled_or_skip()
         query = {}
         if force_repartition:
             query["force_repartition"] = "true"
+        if force_image_layer_creation:
+            query["force_image_layer_creation"] = "true"
         log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}")
         res = self.put(
@@ -608,11 +611,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         tenant_id: Union[TenantId, TenantShardId],
         timeline_id: TimelineId,
         force_repartition=False,
+        force_image_layer_creation=False,
     ):
         self.is_testing_enabled_or_skip()
         query = {}
         if force_repartition:
             query["force_repartition"] = "true"
+        if force_image_layer_creation:
+            query["force_image_layer_creation"] = "true"
         log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}")
         res = self.put(
```
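
On the test-fixture side, the new keyword argument defaults to `False` and only adds the query parameter when set, so existing callers are unaffected. Hypothetical usage in a test:

```python
# same behavior as before
client.timeline_checkpoint(tenant_id, timeline_id)

# flush and force image layer creation so a later GC has covered delta layers to remove
client.timeline_checkpoint(tenant_id, timeline_id, force_image_layer_creation=True)

gc_result = client.timeline_gc(tenant_id, timeline_id, 0)
```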

`test_runner/regress/test_remote_storage.py`

```diff
@@ -382,6 +382,7 @@ def test_remote_timeline_client_calls_started_metric(
         initial_tenant_conf={
             # small checkpointing and compaction targets to ensure we generate many upload operations
             "checkpoint_distance": f"{128 * 1024}",
+            # ensure each timeline_checkpoint() call creates L1s
             "compaction_threshold": "1",
             "compaction_target_size": f"{128 * 1024}",
             # no PITR horizon, we specify the horizon when we request on-demand GC
@@ -389,8 +390,6 @@ def test_remote_timeline_client_calls_started_metric(
             # disable background compaction and GC. We invoke it manually when we want it to happen.
             "gc_period": "0s",
             "compaction_period": "0s",
-            # create image layers eagerly, so that GC can remove some layers
-            "image_creation_threshold": "1",
         }
     )
```
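
For context, `initial_tenant_conf` is the per-tenant config the test passes when bootstrapping the environment: with `compaction_threshold` at 1, every `timeline_checkpoint()` can produce L1 layers, and eager image creation is now requested per call instead of via `image_creation_threshold`. A minimal sketch of how such a config is applied (assuming the `NeonEnvBuilder.init_start` fixture API used elsewhere in this test suite):

```python
from fixtures.neon_fixtures import NeonEnvBuilder  # assumed fixture import path

def test_example(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start(
        initial_tenant_conf={
            "checkpoint_distance": f"{128 * 1024}",
            "compaction_threshold": "1",    # each checkpoint can create L1s
            "compaction_target_size": f"{128 * 1024}",
            "pitr_interval": "0s",          # no PITR horizon
            "gc_period": "0s",              # GC only when invoked manually
            "compaction_period": "0s",      # compaction only when invoked manually
        }
    )
```

The `churn` helper in the same test changes as follows: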
```diff
@@ -449,12 +448,17 @@ def test_remote_timeline_client_calls_started_metric(
         ), f"observations for {file_kind} {op_kind} did not grow monotonically: {observations}"

     def churn(data_pass1, data_pass2):
+        # overwrite the same data in place, vacuum inbetween, and
+        # create image layers; then run a gc().
+        # this should
+        # - create new layers
+        # - delete some layers
         overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass1)
         client.timeline_checkpoint(tenant_id, timeline_id)
-        client.timeline_compact(tenant_id, timeline_id)
         overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass2)
-        client.timeline_checkpoint(tenant_id, timeline_id)
-        client.timeline_compact(tenant_id, timeline_id)
+        client.timeline_checkpoint(tenant_id, timeline_id, force_image_layer_creation=True)
+        overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass1)
+        overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass2)
+        client.timeline_checkpoint(tenant_id, timeline_id, force_image_layer_creation=True)
         gc_result = client.timeline_gc(tenant_id, timeline_id, 0)
         print_gc_result(gc_result)
         assert gc_result["layers_removed"] > 0
```