Mirror of https://github.com/neondatabase/neon.git, synced 2026-01-04 12:02:55 +00:00
storcon: ignore deleted timelines on new location catch-up (#9244)
## Problem

If a timeline was deleted right before the pre-cut-over wait for LSNs to catch up, the migration would wait forever.

## Summary of changes

Fix the issue and add a test for timeline deletions mid-migration.

Related: https://github.com/neondatabase/neon/issues/9144
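To make the fix concrete, here is a minimal sketch of the catch-up rule this change adopts. It is not the actual `Reconciler` code: the `any_behind` helper and the `u64` stand-ins for `TimelineId`/`Lsn` are illustrative assumptions. The idea is that a timeline from the baseline snapshot which the migration destination no longer reports is treated as deleted and skipped, rather than keeping the wait loop spinning forever.

```rust
use std::collections::HashMap;

// Stand-ins for the real TimelineId / Lsn types (assumption for this sketch).
type TimelineId = u64;
type Lsn = u64;

/// Returns true if any timeline still present on the destination lags its baseline LSN.
/// Timelines missing from `destination` (e.g. deleted mid-migration) are ignored.
fn any_behind(
    baseline: &HashMap<TimelineId, Lsn>,
    destination: &HashMap<TimelineId, Lsn>,
) -> bool {
    baseline.iter().any(|(timeline, baseline_lsn)| {
        match destination.get(timeline) {
            Some(dest_lsn) => dest_lsn < baseline_lsn,
            // Timeline was deleted in the meantime - ignore it
            None => false,
        }
    })
}

fn main() {
    let baseline = HashMap::from([(1, 100), (2, 200)]);
    // Timeline 2 was deleted on the destination; only timeline 1 is considered.
    let destination = HashMap::from([(1, 150)]);
    assert!(!any_behind(&baseline, &destination));
    println!("caught up");
}
```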
@@ -454,7 +454,7 @@ impl Reconciler {
                 Ok(l) => l,
                 Err(e) => {
                     tracing::info!("🕑 Can't get LSNs on node {node} yet, waiting ({e})",);
-                    std::thread::sleep(Duration::from_millis(500));
+                    tokio::time::sleep(Duration::from_millis(500)).await;
                     continue;
                 }
             };
@@ -469,10 +469,7 @@ impl Reconciler {
                         }
                     }
                     None => {
-                        // Expected timeline isn't yet visible on migration destination.
-                        // (IRL we would have to account for timeline deletion, but this
-                        // is just test helper)
-                        any_behind = true;
+                        // Timeline was deleted in the meantime - ignore it
                     }
                 }
             }
@@ -481,7 +478,7 @@ impl Reconciler {
                 tracing::info!("✅ LSN caught up. Proceeding...");
                 break;
             } else {
-                std::thread::sleep(Duration::from_millis(500));
+                tokio::time::sleep(Duration::from_millis(500)).await;
             }
         }

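The two sleep changes above are part of the same cleanup: the catch-up loop runs inside an async task, so a blocking `std::thread::sleep` would park a tokio worker thread for the whole poll interval, while `tokio::time::sleep(...).await` suspends only this task. A minimal, standalone sketch of the difference (assuming the `tokio` crate with its runtime, timer, and macro features enabled, e.g. `features = ["full"]`):

```rust
use std::time::Duration;

#[tokio::main]
async fn main() {
    // Blocking: parks the executor thread for 500 ms; other tasks scheduled on it stall.
    // std::thread::sleep(Duration::from_millis(500));

    // Non-blocking: yields to the runtime and resumes this task after 500 ms.
    tokio::time::sleep(Duration::from_millis(500)).await;
    println!("slept without blocking a runtime worker");
}
```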
@@ -562,6 +559,8 @@ impl Reconciler {
         self.location_config(&dest_ps, dest_conf, None, false)
             .await?;

+        pausable_failpoint!("reconciler-live-migrate-pre-await-lsn");
+
         if let Some(baseline) = baseline_lsns {
             tracing::info!("🕑 Waiting for LSN to catch up...");
             self.await_lsn(self.tenant_shard_id, &dest_ps, baseline)
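The new `pausable_failpoint!` hook is what lets the regression test below freeze the reconciler right before the LSN wait. As a rough illustration only (the macro is project-specific; this sketch substitutes the `fail` crate's `fail_point!`/`cfg` API, compiled with its `failpoints` feature, and shortens the failpoint name), the pause/off pattern looks like this:

```rust
use std::{thread, time::Duration};

fn migrate_step() {
    println!("about to await LSN catch-up");
    fail::fail_point!("pre-await-lsn"); // with "pause" configured, this blocks here
    println!("proceeding with cut-over");
}

fn main() {
    // Park any thread that reaches the failpoint until the config changes.
    fail::cfg("pre-await-lsn", "pause").unwrap();
    let worker = thread::spawn(migrate_step);

    // While the worker is paused, a test can mutate state mid-migration,
    // e.g. delete the timeline, as test_timeline_delete_mid_live_migration does.
    thread::sleep(Duration::from_millis(100));
    println!("test does its mid-migration work here");

    fail::cfg("pre-await-lsn", "off").unwrap(); // release the paused worker
    worker.join().unwrap();
}
```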
@@ -2613,6 +2613,9 @@ def test_storage_controller_validate_during_migration(neon_env_builder: NeonEnvB
 class MigrationFailpoints(Enum):
     # While only the origin is attached
     PRE_GENERATION_INC = "reconciler-live-migrate-pre-generation-inc"
+    # While only the origin is attached and the db was updated to
+    # point to the new location
+    PRE_AWAIT_LSN = "reconciler-live-migrate-pre-await-lsn"
     # While both locations are attached
     POST_NOTIFY = "reconciler-live-migrate-post-notify"
     # While only the destination is attached
@@ -2807,3 +2810,77 @@ def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder):
         attached_to = shard["node_attached"]
         expected_az = env.get_pageserver(attached_to).az_id
         assert shard["preferred_az_id"] == expected_az
+
+
+@run_only_on_default_postgres("Postgres version makes no difference here")
+@pytest.mark.parametrize(
+    "migration_failpoint",
+    [
+        MigrationFailpoints.PRE_GENERATION_INC,
+        MigrationFailpoints.PRE_AWAIT_LSN,
+        MigrationFailpoints.POST_NOTIFY,
+        MigrationFailpoints.POST_DETACH,
+    ],
+)
+def test_timeline_delete_mid_live_migration(neon_env_builder: NeonEnvBuilder, migration_failpoint):
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    tenant_id = TenantId.generate()
+    timeline_id = TimelineId.generate()
+    env.storage_controller.tenant_create(tenant_id, placement_policy={"Attached": 1})
+    env.storage_controller.pageserver_api().timeline_create(
+        pg_version=PgVersion.NOT_SET, tenant_id=tenant_id, new_timeline_id=timeline_id
+    )
+
+    shard_zero = TenantShardId(tenant_id, 0, 0)
+    locations = env.storage_controller.get_tenants_placement()[str(shard_zero)]
+
+    assert locations["observed"] == locations["intent"]
+    assert locations["observed"]["attached"] is not None
+    assert len(locations["observed"]["secondary"]) > 0
+
+    attached_location = locations["observed"]["attached"]
+    secondary_location = locations["observed"]["secondary"][0]
+
+    env.storage_controller.configure_failpoints((migration_failpoint.value, "pause"))
+
+    try:
+        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
+            migrate_fut = executor.submit(
+                env.storage_controller.tenant_shard_migrate,
+                shard_zero,
+                secondary_location,
+            )
+
+            def has_hit_migration_failpoint():
+                expr = f"at failpoint {migration_failpoint.value}"
+                log.info(expr)
+                assert env.storage_controller.log_contains(expr)
+
+            wait_until(10, 1, has_hit_migration_failpoint)
+
+            env.storage_controller.pageserver_api().timeline_delete(
+                tenant_id=tenant_id, timeline_id=timeline_id
+            )
+
+            # Eventually migration completes
+            env.storage_controller.configure_failpoints((migration_failpoint.value, "off"))
+            migrate_fut.result()
+
+            # Ensure that we detached from the old attached location
+            with pytest.raises(PageserverApiException) as exc:
+                env.get_pageserver(attached_location).http_client().timeline_list(tenant_id)
+            assert exc.value.status_code == 404
+
+            # Ensure the timeline is not present on the new attached location
+            client = env.get_pageserver(secondary_location).http_client()
+            assert timeline_id not in {
+                TimelineId(b["timeline_id"]) for b in client.timeline_list(tenant_id)
+            }, f"deleted timeline found on {secondary_location}"
+
+    except:
+        # Always disable 'pause' failpoints, even on failure, to avoid hanging in shutdown
+        env.storage_controller.configure_failpoints((migration_failpoint.value, "off"))
+        raise