storcon: ignore deleted timelines on new location catch-up (#9244)

## Problem

If a timeline was deleted right before the reconciler started waiting for LSNs to catch up ahead of the cut-over, the wait would never complete.
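
For context, the wait compares per-timeline LSNs on the migration destination against a baseline captured from the origin. A minimal sketch of the old shape of that loop, with illustrative names and types (not the actual `Reconciler` internals):

```rust
use std::collections::HashMap;
use std::time::Duration;

// Hypothetical stand-ins for the real pageserver types.
type TimelineId = u64;
type Lsn = u64;

/// Sketch of the pre-fix behaviour: a timeline that is missing on the
/// destination (e.g. deleted mid-migration) is counted as "behind",
/// so this loop never terminates once the timeline is gone.
async fn wait_for_catch_up(
    baseline: &HashMap<TimelineId, Lsn>,
    fetch_destination_lsns: impl Fn() -> HashMap<TimelineId, Lsn>,
) {
    loop {
        let latest = fetch_destination_lsns();

        let mut any_behind = false;
        for (timeline_id, baseline_lsn) in baseline {
            match latest.get(timeline_id) {
                Some(latest_lsn) if latest_lsn < baseline_lsn => any_behind = true,
                Some(_) => {}
                // Pre-fix: absence counts as "behind", so a deleted timeline
                // keeps us spinning here forever.
                None => any_behind = true,
            }
        }

        if !any_behind {
            break;
        }
        tokio::time::sleep(Duration::from_millis(500)).await;
    }
}
```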

## Summary of changes

Fix the issue and add a test covering timeline deletions mid-migration.

Related https://github.com/neondatabase/neon/issues/9144
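
The core of the change, sketched against the same illustrative loop as above (the real edit is in the `Reconciler` LSN wait shown in the diff below):

```rust
// A timeline that the destination no longer reports is assumed to have been
// deleted concurrently; it must not keep the cut-over waiting.
match latest.get(timeline_id) {
    Some(latest_lsn) if latest_lsn < baseline_lsn => any_behind = true,
    Some(_) => {}
    None => {
        // Timeline was deleted in the meantime - ignore it
    }
}
```

The new `reconciler-live-migrate-pre-await-lsn` failpoint lets the test pause the migration right before this wait, delete the timeline, and then check that the migration still completes.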
Author: Vlad Lazar
Committed by: GitHub
Date: 2024-10-02 17:23:26 +01:00
Parent: f875e107aa
Commit: 8dbfda98d4

2 changed files with 82 additions and 6 deletions


@@ -454,7 +454,7 @@ impl Reconciler {
                Ok(l) => l,
                Err(e) => {
                    tracing::info!("🕑 Can't get LSNs on node {node} yet, waiting ({e})",);
-                   std::thread::sleep(Duration::from_millis(500));
+                   tokio::time::sleep(Duration::from_millis(500)).await;
                    continue;
                }
            };
@@ -469,10 +469,7 @@ impl Reconciler {
                        }
                    }
                    None => {
-                       // Expected timeline isn't yet visible on migration destination.
-                       // (IRL we would have to account for timeline deletion, but this
-                       // is just test helper)
-                       any_behind = true;
+                       // Timeline was deleted in the meantime - ignore it
                    }
                }
            }
@@ -481,7 +478,7 @@ impl Reconciler {
tracing::info!("✅ LSN caught up. Proceeding...");
break;
} else {
std::thread::sleep(Duration::from_millis(500));
tokio::time::sleep(Duration::from_millis(500)).await;
}
}
@@ -562,6 +559,8 @@ impl Reconciler {
        self.location_config(&dest_ps, dest_conf, None, false)
            .await?;

+       pausable_failpoint!("reconciler-live-migrate-pre-await-lsn");

        if let Some(baseline) = baseline_lsns {
            tracing::info!("🕑 Waiting for LSN to catch up...");
            self.await_lsn(self.tenant_shard_id, &dest_ps, baseline)


@@ -2613,6 +2613,9 @@ def test_storage_controller_validate_during_migration(neon_env_builder: NeonEnvB
class MigrationFailpoints(Enum):
    # While only the origin is attached
    PRE_GENERATION_INC = "reconciler-live-migrate-pre-generation-inc"
+   # While only the origin is attached and the db was updated to
+   # point to the new location
+   PRE_AWAIT_LSN = "reconciler-live-migrate-pre-await-lsn"
    # While both locations are attached
    POST_NOTIFY = "reconciler-live-migrate-post-notify"
    # While only the destination is attached
@@ -2807,3 +2810,77 @@ def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder):
        attached_to = shard["node_attached"]
        expected_az = env.get_pageserver(attached_to).az_id
        assert shard["preferred_az_id"] == expected_az


@run_only_on_default_postgres("Postgres version makes no difference here")
@pytest.mark.parametrize(
    "migration_failpoint",
    [
        MigrationFailpoints.PRE_GENERATION_INC,
        MigrationFailpoints.PRE_AWAIT_LSN,
        MigrationFailpoints.POST_NOTIFY,
        MigrationFailpoints.POST_DETACH,
    ],
)
def test_timeline_delete_mid_live_migration(neon_env_builder: NeonEnvBuilder, migration_failpoint):
    neon_env_builder.num_pageservers = 2
    env = neon_env_builder.init_configs()
    env.start()

    tenant_id = TenantId.generate()
    timeline_id = TimelineId.generate()
    env.storage_controller.tenant_create(tenant_id, placement_policy={"Attached": 1})
    env.storage_controller.pageserver_api().timeline_create(
        pg_version=PgVersion.NOT_SET, tenant_id=tenant_id, new_timeline_id=timeline_id
    )

    shard_zero = TenantShardId(tenant_id, 0, 0)
    locations = env.storage_controller.get_tenants_placement()[str(shard_zero)]

    assert locations["observed"] == locations["intent"]
    assert locations["observed"]["attached"] is not None
    assert len(locations["observed"]["secondary"]) > 0

    attached_location = locations["observed"]["attached"]
    secondary_location = locations["observed"]["secondary"][0]

    env.storage_controller.configure_failpoints((migration_failpoint.value, "pause"))

    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
            migrate_fut = executor.submit(
                env.storage_controller.tenant_shard_migrate,
                shard_zero,
                secondary_location,
            )

            def has_hit_migration_failpoint():
                expr = f"at failpoint {migration_failpoint.value}"
                log.info(expr)
                assert env.storage_controller.log_contains(expr)

            wait_until(10, 1, has_hit_migration_failpoint)

            env.storage_controller.pageserver_api().timeline_delete(
                tenant_id=tenant_id, timeline_id=timeline_id
            )

            # Eventually migration completes
            env.storage_controller.configure_failpoints((migration_failpoint.value, "off"))
            migrate_fut.result()

            # Ensure that we detached from the old attached location
            with pytest.raises(PageserverApiException) as exc:
                env.get_pageserver(attached_location).http_client().timeline_list(tenant_id)
            assert exc.value.status_code == 404

            # Ensure the timeline is not present on the new attached location
            client = env.get_pageserver(secondary_location).http_client()
            assert timeline_id not in {
                TimelineId(b["timeline_id"]) for b in client.timeline_list(tenant_id)
            }, f"deleted timeline found on {secondary_location}"
    except:
        # Always disable 'pause' failpoints, even on failure, to avoid hanging in shutdown
        env.storage_controller.configure_failpoints((migration_failpoint.value, "off"))
        raise