test_remote_timeline_client_calls_started_metric: fix flakiness (#6911)
fixes https://github.com/neondatabase/neon/issues/6889
# Problem
The failure in the last 3 flaky runs on `main` is
```
test_runner/regress/test_remote_storage.py:460: in test_remote_timeline_client_calls_started_metric
churn("a", "b")
test_runner/regress/test_remote_storage.py:457: in churn
assert gc_result["layers_removed"] > 0
E assert 0 > 0
```
That's this code: `test_runner/regress/test_remote_storage.py`, lines 448-460 at commit `cd449d66ea`.
So, the test expects GC to remove some layers, but GC doesn't.
# Fix
My impression is that the VACUUM isn't re-using pages aggressively enough, but I can't really prove that. I tried to analyze the layer map dump, but it's too complex.

So, this PR:
- Creates more churn by doing the overwrite twice.
- Forces image layer creation.

As a drive-by, it also removes the redundant call to `timeline_compact`, because `timeline_checkpoint` already does that internally. A condensed sketch of the resulting test flow follows; the full diff is below.
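A condensed sketch of the new churn step (using the fixture and helper names from the test diff further below; the loop is just shorthand for the two passes):

```python
def churn(data_pass1, data_pass2):
    # Two passes of overwriting the same data, each ending in a checkpoint
    # that forces image layer creation, so that GC ends up with covered
    # delta layers it can actually remove.
    for _ in range(2):
        overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass1)
        overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass2)
        client.timeline_checkpoint(tenant_id, timeline_id, force_image_layer_creation=True)
    # PITR horizon 0: GC may remove any layer shadowed by newer image layers.
    gc_result = client.timeline_gc(tenant_id, timeline_id, 0)
    print_gc_result(gc_result)
    assert gc_result["layers_removed"] > 0
```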
Commit 62d77e263f (parent b2bbc20311), committed by GitHub.
```diff
@@ -1616,6 +1616,10 @@ async fn timeline_compact_handler(
     if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
         flags |= CompactFlags::ForceRepartition;
     }
+    if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
+        flags |= CompactFlags::ForceImageLayerCreation;
+    }
+
     async {
         let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
         let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
@@ -1642,6 +1646,10 @@ async fn timeline_checkpoint_handler(
     if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
         flags |= CompactFlags::ForceRepartition;
     }
+    if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
+        flags |= CompactFlags::ForceImageLayerCreation;
+    }
+
     async {
         let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
         let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
```
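For reference, the new query parameter can also be exercised directly against the pageserver management API. A minimal sketch, assuming the usual `/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint` route and the default local management address (both are assumptions here, not part of this diff):

```python
import requests

PAGESERVER_HTTP = "http://localhost:9898"  # assumed local mgmt API address

def checkpoint_forcing_image_layers(tenant_id: str, timeline_id: str) -> None:
    # The handlers above parse these query params into CompactFlags:
    #   force_repartition          -> CompactFlags::ForceRepartition
    #   force_image_layer_creation -> CompactFlags::ForceImageLayerCreation
    res = requests.put(
        f"{PAGESERVER_HTTP}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint",
        params={"force_image_layer_creation": "true"},
    )
    res.raise_for_status()
```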
```diff
@@ -503,6 +503,7 @@ pub enum GetLogicalSizePriority {
 #[derive(enumset::EnumSetType)]
 pub(crate) enum CompactFlags {
     ForceRepartition,
+    ForceImageLayerCreation,
 }
 
 impl std::fmt::Debug for Timeline {
```
```diff
@@ -1157,7 +1158,12 @@ impl Timeline {
         // 3. Create new image layers for partitions that have been modified
         // "enough".
         let layers = self
-            .create_image_layers(&partitioning, lsn, false, &image_ctx)
+            .create_image_layers(
+                &partitioning,
+                lsn,
+                flags.contains(CompactFlags::ForceImageLayerCreation),
+                &image_ctx,
+            )
             .await
             .map_err(anyhow::Error::from)?;
         if let Some(remote_client) = &self.remote_client {
```
```diff
@@ -549,11 +549,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         tenant_id: Union[TenantId, TenantShardId],
         timeline_id: TimelineId,
         force_repartition=False,
+        force_image_layer_creation=False,
     ):
         self.is_testing_enabled_or_skip()
         query = {}
         if force_repartition:
             query["force_repartition"] = "true"
+        if force_image_layer_creation:
+            query["force_image_layer_creation"] = "true"
 
         log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}")
         res = self.put(
@@ -608,11 +611,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         tenant_id: Union[TenantId, TenantShardId],
         timeline_id: TimelineId,
         force_repartition=False,
+        force_image_layer_creation=False,
     ):
         self.is_testing_enabled_or_skip()
         query = {}
         if force_repartition:
             query["force_repartition"] = "true"
+        if force_image_layer_creation:
+            query["force_image_layer_creation"] = "true"
 
         log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}")
         res = self.put(
```
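With these fixture changes, a test opts into forced image layer creation per call. A usage sketch (assuming the standard `NeonEnv` fixtures providing `env`, `tenant_id`, and `timeline_id`):

```python
client = env.pageserver.http_client()

# Compact with both new knobs enabled.
client.timeline_compact(
    tenant_id,
    timeline_id,
    force_repartition=True,
    force_image_layer_creation=True,
)

# Checkpoint (flush + compact) that also forces image layer creation.
client.timeline_checkpoint(tenant_id, timeline_id, force_image_layer_creation=True)
```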
```diff
@@ -382,6 +382,7 @@ def test_remote_timeline_client_calls_started_metric(
         initial_tenant_conf={
             # small checkpointing and compaction targets to ensure we generate many upload operations
             "checkpoint_distance": f"{128 * 1024}",
+            # ensure each timeline_checkpoint() call creates L1s
             "compaction_threshold": "1",
             "compaction_target_size": f"{128 * 1024}",
             # no PITR horizon, we specify the horizon when we request on-demand GC
@@ -389,8 +390,6 @@ def test_remote_timeline_client_calls_started_metric(
             # disable background compaction and GC. We invoke it manually when we want it to happen.
             "gc_period": "0s",
             "compaction_period": "0s",
-            # create image layers eagerly, so that GC can remove some layers
-            "image_creation_threshold": "1",
         }
     )
 
```
```diff
@@ -449,12 +448,17 @@ def test_remote_timeline_client_calls_started_metric(
         ), f"observations for {file_kind} {op_kind} did not grow monotonically: {observations}"
 
     def churn(data_pass1, data_pass2):
+        # overwrite the same data in place, vacuum in between, and
+        # create image layers; then run a gc().
+        # this should
+        # - create new layers
+        # - delete some layers
         overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass1)
-        client.timeline_checkpoint(tenant_id, timeline_id)
-        client.timeline_compact(tenant_id, timeline_id)
         overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass2)
-        client.timeline_checkpoint(tenant_id, timeline_id)
-        client.timeline_compact(tenant_id, timeline_id)
+        client.timeline_checkpoint(tenant_id, timeline_id, force_image_layer_creation=True)
+        overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass1)
+        overwrite_data_and_wait_for_it_to_arrive_at_pageserver(data_pass2)
+        client.timeline_checkpoint(tenant_id, timeline_id, force_image_layer_creation=True)
         gc_result = client.timeline_gc(tenant_id, timeline_id, 0)
         print_gc_result(gc_result)
         assert gc_result["layers_removed"] > 0
```