diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs
index 1e7d7adffe..4864a021fe 100644
--- a/storage_controller/src/reconciler.rs
+++ b/storage_controller/src/reconciler.rs
@@ -454,7 +454,7 @@ impl Reconciler {
                 Ok(l) => l,
                 Err(e) => {
                     tracing::info!("🕑 Can't get LSNs on node {node} yet, waiting ({e})",);
-                    std::thread::sleep(Duration::from_millis(500));
+                    tokio::time::sleep(Duration::from_millis(500)).await;
                     continue;
                 }
             };
@@ -469,10 +469,7 @@ impl Reconciler {
                         }
                     }
                     None => {
-                        // Expected timeline isn't yet visible on migration destination.
-                        // (IRL we would have to account for timeline deletion, but this
-                        // is just test helper)
-                        any_behind = true;
+                        // Timeline was deleted in the meantime - ignore it
                     }
                 }
             }
@@ -481,7 +478,7 @@ impl Reconciler {
                 tracing::info!("✅ LSN caught up. Proceeding...");
                 break;
             } else {
-                std::thread::sleep(Duration::from_millis(500));
+                tokio::time::sleep(Duration::from_millis(500)).await;
             }
         }
 
@@ -562,6 +559,8 @@ impl Reconciler {
         self.location_config(&dest_ps, dest_conf, None, false)
             .await?;
 
+        pausable_failpoint!("reconciler-live-migrate-pre-await-lsn");
+
         if let Some(baseline) = baseline_lsns {
             tracing::info!("🕑 Waiting for LSN to catch up...");
             self.await_lsn(self.tenant_shard_id, &dest_ps, baseline)
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 789623cb27..23a9ef58df 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -2613,6 +2613,9 @@ def test_storage_controller_validate_during_migration(neon_env_builder: NeonEnvB
 class MigrationFailpoints(Enum):
     # While only the origin is attached
     PRE_GENERATION_INC = "reconciler-live-migrate-pre-generation-inc"
+    # While only the origin is attached and the db was updated to
+    # point to the new location
+    PRE_AWAIT_LSN = "reconciler-live-migrate-pre-await-lsn"
     # While both locations are attached
     POST_NOTIFY = "reconciler-live-migrate-post-notify"
     # While only the destination is attached
@@ -2807,3 +2810,77 @@ def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder):
         attached_to = shard["node_attached"]
         expected_az = env.get_pageserver(attached_to).az_id
         assert shard["preferred_az_id"] == expected_az
+
+
+@run_only_on_default_postgres("Postgres version makes no difference here")
+@pytest.mark.parametrize(
+    "migration_failpoint",
+    [
+        MigrationFailpoints.PRE_GENERATION_INC,
+        MigrationFailpoints.PRE_AWAIT_LSN,
+        MigrationFailpoints.POST_NOTIFY,
+        MigrationFailpoints.POST_DETACH,
+    ],
+)
+def test_timeline_delete_mid_live_migration(neon_env_builder: NeonEnvBuilder, migration_failpoint):
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_configs()
+    env.start()
+
+    tenant_id = TenantId.generate()
+    timeline_id = TimelineId.generate()
+    env.storage_controller.tenant_create(tenant_id, placement_policy={"Attached": 1})
+    env.storage_controller.pageserver_api().timeline_create(
+        pg_version=PgVersion.NOT_SET, tenant_id=tenant_id, new_timeline_id=timeline_id
+    )
+
+    shard_zero = TenantShardId(tenant_id, 0, 0)
+    locations = env.storage_controller.get_tenants_placement()[str(shard_zero)]
+
+    assert locations["observed"] == locations["intent"]
+    assert locations["observed"]["attached"] is not None
+    assert len(locations["observed"]["secondary"]) > 0
+
+    attached_location = locations["observed"]["attached"]
+    secondary_location = locations["observed"]["secondary"][0]
+
+    env.storage_controller.configure_failpoints((migration_failpoint.value, "pause"))
+
+    try:
+        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
+            migrate_fut = executor.submit(
+                env.storage_controller.tenant_shard_migrate,
+                shard_zero,
+                secondary_location,
+            )
+
+            def has_hit_migration_failpoint():
+                expr = f"at failpoint {migration_failpoint.value}"
+                log.info(expr)
+                assert env.storage_controller.log_contains(expr)
+
+            wait_until(10, 1, has_hit_migration_failpoint)
+
+            env.storage_controller.pageserver_api().timeline_delete(
+                tenant_id=tenant_id, timeline_id=timeline_id
+            )
+
+            # Eventually migration completes
+            env.storage_controller.configure_failpoints((migration_failpoint.value, "off"))
+            migrate_fut.result()
+
+            # Ensure that we detached from the old attached location
+            with pytest.raises(PageserverApiException) as exc:
+                env.get_pageserver(attached_location).http_client().timeline_list(tenant_id)
+            assert exc.value.status_code == 404
+
+            # Ensure the timeline is not present on the new attached location
+            client = env.get_pageserver(secondary_location).http_client()
+            assert timeline_id not in {
+                TimelineId(b["timeline_id"]) for b in client.timeline_list(tenant_id)
+            }, f"deleted timeline found on {secondary_location}"
+
+    except:
+        # Always disable 'pause' failpoints, even on failure, to avoid hanging in shutdown
+        env.storage_controller.configure_failpoints((migration_failpoint.value, "off"))
+        raise