mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-10 15:02:56 +00:00
storcon: do timeline creation on all attached locations (#9237)
## Problem Creation of a timeline during a reconciliation can lead to unavailability if the user attempts to start a compute before the storage controller has notified cplane of the cut-over. ## Summary of changes Create timelines on all currently attached locations. For the latest location, we still look at the database (as previously). With this change we also look into the observed state to find *other* attached locations. Related https://github.com/neondatabase/neon/issues/9144
This commit is contained in:
@@ -526,6 +526,21 @@ pub(crate) enum ReconcileResultRequest {
|
||||
Stop,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct MutationLocation {
|
||||
node: Node,
|
||||
generation: Generation,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct ShardMutationLocations {
|
||||
latest: MutationLocation,
|
||||
other: Vec<MutationLocation>,
|
||||
}
|
||||
|
||||
#[derive(Default, Clone)]
|
||||
struct TenantMutationLocations(BTreeMap<TenantShardId, ShardMutationLocations>);
|
||||
|
||||
impl Service {
|
||||
pub fn get_config(&self) -> &Config {
|
||||
&self.config
|
||||
@@ -2987,38 +3002,83 @@ impl Service {
|
||||
failpoint_support::sleep_millis_async!("tenant-create-timeline-shared-lock");
|
||||
|
||||
self.tenant_remote_mutation(tenant_id, move |mut targets| async move {
|
||||
if targets.is_empty() {
|
||||
if targets.0.is_empty() {
|
||||
return Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("Tenant not found").into(),
|
||||
));
|
||||
};
|
||||
let shard_zero = targets.remove(0);
|
||||
|
||||
let (shard_zero_tid, shard_zero_locations) =
|
||||
targets.0.pop_first().expect("Must have at least one shard");
|
||||
assert!(shard_zero_tid.is_shard_zero());
|
||||
|
||||
async fn create_one(
|
||||
tenant_shard_id: TenantShardId,
|
||||
node: Node,
|
||||
locations: ShardMutationLocations,
|
||||
jwt: Option<String>,
|
||||
create_req: TimelineCreateRequest,
|
||||
) -> Result<TimelineInfo, ApiError> {
|
||||
let latest = locations.latest.node;
|
||||
|
||||
tracing::info!(
|
||||
"Creating timeline on shard {}/{}, attached to node {node}",
|
||||
"Creating timeline on shard {}/{}, attached to node {latest} in generation {:?}",
|
||||
tenant_shard_id,
|
||||
create_req.new_timeline_id,
|
||||
locations.latest.generation
|
||||
);
|
||||
let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref());
|
||||
|
||||
client
|
||||
let client =
|
||||
PageserverClient::new(latest.get_id(), latest.base_url(), jwt.as_deref());
|
||||
|
||||
let timeline_info = client
|
||||
.timeline_create(tenant_shard_id, &create_req)
|
||||
.await
|
||||
.map_err(|e| passthrough_api_error(&node, e))
|
||||
.map_err(|e| passthrough_api_error(&latest, e))?;
|
||||
|
||||
// We propagate timeline creations to all attached locations such that a compute
|
||||
// for the new timeline is able to start regardless of the current state of the
|
||||
// tenant shard reconciliation.
|
||||
for location in locations.other {
|
||||
tracing::info!(
|
||||
"Creating timeline on shard {}/{}, stale attached to node {} in generation {:?}",
|
||||
tenant_shard_id,
|
||||
create_req.new_timeline_id,
|
||||
location.node,
|
||||
location.generation
|
||||
);
|
||||
|
||||
let client = PageserverClient::new(
|
||||
location.node.get_id(),
|
||||
location.node.base_url(),
|
||||
jwt.as_deref(),
|
||||
);
|
||||
|
||||
let res = client
|
||||
.timeline_create(tenant_shard_id, &create_req)
|
||||
.await;
|
||||
|
||||
if let Err(e) = res {
|
||||
match e {
|
||||
mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _) => {
|
||||
// Tenant might have been detached from the stale location,
|
||||
// so ignore 404s.
|
||||
},
|
||||
_ => {
|
||||
return Err(passthrough_api_error(&location.node, e));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(timeline_info)
|
||||
}
|
||||
|
||||
// Because the caller might not provide an explicit LSN, we must do the creation first on a single shard, and then
|
||||
// use whatever LSN that shard picked when creating on subsequent shards. We arbitrarily use shard zero as the shard
|
||||
// that will get the first creation request, and propagate the LSN to all the >0 shards.
|
||||
let timeline_info = create_one(
|
||||
shard_zero.0,
|
||||
shard_zero.1,
|
||||
shard_zero_tid,
|
||||
shard_zero_locations,
|
||||
self.config.jwt_token.clone(),
|
||||
create_req.clone(),
|
||||
)
|
||||
@@ -3031,14 +3091,24 @@ impl Service {
|
||||
}
|
||||
|
||||
// Create timeline on remaining shards with number >0
|
||||
if !targets.is_empty() {
|
||||
if !targets.0.is_empty() {
|
||||
// If we had multiple shards, issue requests for the remainder now.
|
||||
let jwt = &self.config.jwt_token;
|
||||
self.tenant_for_shards(
|
||||
targets.iter().map(|t| (t.0, t.1.clone())).collect(),
|
||||
|tenant_shard_id: TenantShardId, node: Node| {
|
||||
targets
|
||||
.0
|
||||
.iter()
|
||||
.map(|t| (*t.0, t.1.latest.node.clone()))
|
||||
.collect(),
|
||||
|tenant_shard_id: TenantShardId, _node: Node| {
|
||||
let create_req = create_req.clone();
|
||||
Box::pin(create_one(tenant_shard_id, node, jwt.clone(), create_req))
|
||||
let mutation_locations = targets.0.remove(&tenant_shard_id).unwrap();
|
||||
Box::pin(create_one(
|
||||
tenant_shard_id,
|
||||
mutation_locations,
|
||||
jwt.clone(),
|
||||
create_req,
|
||||
))
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
@@ -3068,7 +3138,7 @@ impl Service {
|
||||
.await;
|
||||
|
||||
self.tenant_remote_mutation(tenant_id, move |targets| async move {
|
||||
if targets.is_empty() {
|
||||
if targets.0.is_empty() {
|
||||
return Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("Tenant not found").into(),
|
||||
));
|
||||
@@ -3099,8 +3169,9 @@ impl Service {
|
||||
|
||||
// no shard needs to go first/last; the operation should be idempotent
|
||||
// TODO: it would be great to ensure that all shards return the same error
|
||||
let locations = targets.0.iter().map(|t| (*t.0, t.1.latest.node.clone())).collect();
|
||||
let results = self
|
||||
.tenant_for_shards(targets, |tenant_shard_id, node| {
|
||||
.tenant_for_shards(locations, |tenant_shard_id, node| {
|
||||
futures::FutureExt::boxed(config_one(
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
@@ -3131,7 +3202,7 @@ impl Service {
|
||||
.await;
|
||||
|
||||
self.tenant_remote_mutation(tenant_id, move |targets| async move {
|
||||
if targets.is_empty() {
|
||||
if targets.0.is_empty() {
|
||||
return Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("Tenant not found").into(),
|
||||
));
|
||||
@@ -3179,8 +3250,9 @@ impl Service {
|
||||
}
|
||||
|
||||
// no shard needs to go first/last; the operation should be idempotent
|
||||
let locations = targets.0.iter().map(|t| (*t.0, t.1.latest.node.clone())).collect();
|
||||
let mut results = self
|
||||
.tenant_for_shards(targets, |tenant_shard_id, node| {
|
||||
.tenant_for_shards(locations, |tenant_shard_id, node| {
|
||||
futures::FutureExt::boxed(detach_one(
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
@@ -3227,7 +3299,7 @@ impl Service {
|
||||
.await;
|
||||
|
||||
self.tenant_remote_mutation(tenant_id, move |targets| async move {
|
||||
if targets.is_empty() {
|
||||
if targets.0.is_empty() {
|
||||
return Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("Tenant not found").into(),
|
||||
));
|
||||
@@ -3249,7 +3321,12 @@ impl Service {
|
||||
}
|
||||
|
||||
// no shard needs to go first/last; the operation should be idempotent
|
||||
self.tenant_for_shards(targets, |tenant_shard_id, node| {
|
||||
let locations = targets
|
||||
.0
|
||||
.iter()
|
||||
.map(|t| (*t.0, t.1.latest.node.clone()))
|
||||
.collect();
|
||||
self.tenant_for_shards(locations, |tenant_shard_id, node| {
|
||||
futures::FutureExt::boxed(do_one(
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
@@ -3344,11 +3421,11 @@ impl Service {
|
||||
op: O,
|
||||
) -> Result<R, ApiError>
|
||||
where
|
||||
O: FnOnce(Vec<(TenantShardId, Node)>) -> F,
|
||||
O: FnOnce(TenantMutationLocations) -> F,
|
||||
F: std::future::Future<Output = R>,
|
||||
{
|
||||
let target_gens = {
|
||||
let mut targets = Vec::new();
|
||||
let mutation_locations = {
|
||||
let mut locations = TenantMutationLocations::default();
|
||||
|
||||
// Load the currently attached pageservers for the latest generation of each shard. This can
|
||||
// run concurrently with reconciliations, and it is not guaranteed that the node we find here
|
||||
@@ -3399,14 +3476,50 @@ impl Service {
|
||||
.ok_or(ApiError::Conflict(format!(
|
||||
"Raced with removal of node {node_id}"
|
||||
)))?;
|
||||
targets.push((tenant_shard_id, node.clone(), generation));
|
||||
let generation = generation.expect("Checked above");
|
||||
|
||||
let tenant = locked.tenants.get(&tenant_shard_id);
|
||||
|
||||
// TODO(vlad): Abstract the logic that finds stale attached locations
|
||||
// from observed state into a [`Service`] method.
|
||||
let other_locations = match tenant {
|
||||
Some(tenant) => {
|
||||
let mut other = tenant.attached_locations();
|
||||
let latest_location_index =
|
||||
other.iter().position(|&l| l == (node.get_id(), generation));
|
||||
if let Some(idx) = latest_location_index {
|
||||
other.remove(idx);
|
||||
}
|
||||
|
||||
other
|
||||
}
|
||||
None => Vec::default(),
|
||||
};
|
||||
|
||||
let location = ShardMutationLocations {
|
||||
latest: MutationLocation {
|
||||
node: node.clone(),
|
||||
generation,
|
||||
},
|
||||
other: other_locations
|
||||
.into_iter()
|
||||
.filter_map(|(node_id, generation)| {
|
||||
let node = locked.nodes.get(&node_id)?;
|
||||
|
||||
Some(MutationLocation {
|
||||
node: node.clone(),
|
||||
generation,
|
||||
})
|
||||
})
|
||||
.collect(),
|
||||
};
|
||||
locations.0.insert(tenant_shard_id, location);
|
||||
}
|
||||
|
||||
targets
|
||||
locations
|
||||
};
|
||||
|
||||
let targets = target_gens.iter().map(|t| (t.0, t.1.clone())).collect();
|
||||
let result = op(targets).await;
|
||||
let result = op(mutation_locations.clone()).await;
|
||||
|
||||
// Post-check: are all the generations of all the shards the same as they were initially? This proves that
|
||||
// our remote operation executed on the latest generation and is therefore persistent.
|
||||
@@ -3422,9 +3535,10 @@ impl Service {
|
||||
}| (tenant_shard_id, generation),
|
||||
)
|
||||
.collect::<Vec<_>>()
|
||||
!= target_gens
|
||||
!= mutation_locations
|
||||
.0
|
||||
.into_iter()
|
||||
.map(|i| (i.0, i.2))
|
||||
.map(|i| (i.0, Some(i.1.latest.generation)))
|
||||
.collect::<Vec<_>>()
|
||||
{
|
||||
// We raced with something that incremented the generation, and therefore cannot be
|
||||
@@ -3454,12 +3568,14 @@ impl Service {
|
||||
.await;
|
||||
|
||||
self.tenant_remote_mutation(tenant_id, move |mut targets| async move {
|
||||
if targets.is_empty() {
|
||||
if targets.0.is_empty() {
|
||||
return Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("Tenant not found").into(),
|
||||
));
|
||||
}
|
||||
let shard_zero = targets.remove(0);
|
||||
|
||||
let (shard_zero_tid, shard_zero_locations) = targets.0.pop_first().expect("Must have at least one shard");
|
||||
assert!(shard_zero_tid.is_shard_zero());
|
||||
|
||||
async fn delete_one(
|
||||
tenant_shard_id: TenantShardId,
|
||||
@@ -3482,8 +3598,9 @@ impl Service {
|
||||
})
|
||||
}
|
||||
|
||||
let locations = targets.0.iter().map(|t| (*t.0, t.1.latest.node.clone())).collect();
|
||||
let statuses = self
|
||||
.tenant_for_shards(targets, |tenant_shard_id: TenantShardId, node: Node| {
|
||||
.tenant_for_shards(locations, |tenant_shard_id: TenantShardId, node: Node| {
|
||||
Box::pin(delete_one(
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
@@ -3501,9 +3618,9 @@ impl Service {
|
||||
// Delete shard zero last: this is not strictly necessary, but since a caller's GET on a timeline will be routed
|
||||
// to shard zero, it gives a more obvious behavior that a GET returns 404 once the deletion is done.
|
||||
let shard_zero_status = delete_one(
|
||||
shard_zero.0,
|
||||
shard_zero_tid,
|
||||
timeline_id,
|
||||
shard_zero.1,
|
||||
shard_zero_locations.latest.node,
|
||||
self.config.jwt_token.clone(),
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -17,6 +17,7 @@ use crate::{
|
||||
service::ReconcileResultRequest,
|
||||
};
|
||||
use futures::future::{self, Either};
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::controller_api::{
|
||||
AvailabilityZone, NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy,
|
||||
};
|
||||
@@ -1410,6 +1411,32 @@ impl TenantShard {
|
||||
pub(crate) fn set_preferred_az(&mut self, preferred_az_id: AvailabilityZone) {
|
||||
self.preferred_az_id = Some(preferred_az_id);
|
||||
}
|
||||
|
||||
/// Returns all the nodes to which this tenant shard is attached according to the
|
||||
/// observed state and the generations. Return vector is sorted from latest generation
|
||||
/// to earliest.
|
||||
pub(crate) fn attached_locations(&self) -> Vec<(NodeId, Generation)> {
|
||||
self.observed
|
||||
.locations
|
||||
.iter()
|
||||
.filter_map(|(node_id, observed)| {
|
||||
use LocationConfigMode::{AttachedMulti, AttachedSingle, AttachedStale};
|
||||
|
||||
let conf = observed.conf.as_ref()?;
|
||||
|
||||
match (conf.generation, conf.mode) {
|
||||
(Some(gen), AttachedMulti | AttachedSingle | AttachedStale) => {
|
||||
Some((*node_id, gen))
|
||||
}
|
||||
_ => None,
|
||||
}
|
||||
})
|
||||
.sorted_by(|(_lhs_node_id, lhs_gen), (_rhs_node_id, rhs_gen)| {
|
||||
lhs_gen.cmp(rhs_gen).reverse()
|
||||
})
|
||||
.map(|(node_id, gen)| (node_id, Generation::new(gen)))
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -2911,3 +2911,97 @@ def test_timeline_delete_mid_live_migration(neon_env_builder: NeonEnvBuilder, mi
|
||||
# Always disable 'pause' failpoints, even on failure, to avoid hanging in shutdown
|
||||
env.storage_controller.configure_failpoints((migration_failpoint.value, "off"))
|
||||
raise
|
||||
|
||||
|
||||
@run_only_on_default_postgres("Postgres version makes no difference here")
|
||||
@pytest.mark.parametrize(
|
||||
"migration_failpoint",
|
||||
[
|
||||
MigrationFailpoints.PRE_GENERATION_INC,
|
||||
MigrationFailpoints.POST_NOTIFY,
|
||||
MigrationFailpoints.POST_DETACH,
|
||||
],
|
||||
)
|
||||
def test_multi_attached_timeline_creation(neon_env_builder: NeonEnvBuilder, migration_failpoint):
|
||||
neon_env_builder.num_pageservers = 2
|
||||
env = neon_env_builder.init_configs()
|
||||
env.start()
|
||||
|
||||
tenant_id = TenantId.generate()
|
||||
env.storage_controller.tenant_create(tenant_id, placement_policy={"Attached": 1})
|
||||
|
||||
shard_zero = TenantShardId(tenant_id, 0, 0)
|
||||
locations = env.storage_controller.get_tenants_placement()[str(shard_zero)]
|
||||
|
||||
assert locations["observed"] == locations["intent"]
|
||||
assert locations["observed"]["attached"] is not None
|
||||
assert len(locations["observed"]["secondary"]) > 0
|
||||
|
||||
attached_location = locations["observed"]["attached"]
|
||||
secondary_location = locations["observed"]["secondary"][0]
|
||||
|
||||
env.storage_controller.configure_failpoints((migration_failpoint.value, "pause"))
|
||||
|
||||
try:
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
|
||||
migrate_fut = executor.submit(
|
||||
env.storage_controller.tenant_shard_migrate,
|
||||
shard_zero,
|
||||
secondary_location,
|
||||
)
|
||||
|
||||
def has_hit_migration_failpoint():
|
||||
expr = f"at failpoint {migration_failpoint.value}"
|
||||
log.info(expr)
|
||||
assert env.storage_controller.log_contains(expr)
|
||||
|
||||
wait_until(10, 1, has_hit_migration_failpoint)
|
||||
|
||||
timeline_id = TimelineId.generate()
|
||||
env.storage_controller.pageserver_api().timeline_create(
|
||||
pg_version=PgVersion.NOT_SET, tenant_id=tenant_id, new_timeline_id=timeline_id
|
||||
)
|
||||
|
||||
# Timeline creation only goes to the origin.
|
||||
if migration_failpoint == MigrationFailpoints.PRE_GENERATION_INC:
|
||||
client = env.get_pageserver(attached_location).http_client()
|
||||
assert timeline_id in {
|
||||
TimelineId(b["timeline_id"]) for b in client.timeline_list(tenant_id)
|
||||
}, f"new timeline not found on {attached_location}"
|
||||
|
||||
with pytest.raises(PageserverApiException) as exc:
|
||||
env.get_pageserver(secondary_location).http_client().timeline_list(tenant_id)
|
||||
assert exc.value.status_code == 404
|
||||
|
||||
# Timeline creations goes to both attached locations
|
||||
if migration_failpoint == MigrationFailpoints.POST_NOTIFY:
|
||||
for node_id in [attached_location, secondary_location]:
|
||||
client = env.get_pageserver(node_id).http_client()
|
||||
assert timeline_id in {
|
||||
TimelineId(b["timeline_id"]) for b in client.timeline_list(tenant_id)
|
||||
}, f"new timeline not found on {node_id}"
|
||||
|
||||
# Timeline creation goes both locations, but storcon gets a 404 from the origin
|
||||
# which it ignores.
|
||||
if migration_failpoint == MigrationFailpoints.POST_DETACH:
|
||||
client = env.get_pageserver(secondary_location).http_client()
|
||||
assert timeline_id in {
|
||||
TimelineId(b["timeline_id"]) for b in client.timeline_list(tenant_id)
|
||||
}, f"new timeline not found on {attached_location}"
|
||||
|
||||
with pytest.raises(PageserverApiException) as exc:
|
||||
env.get_pageserver(attached_location).http_client().timeline_list(tenant_id)
|
||||
assert exc.value.status_code == 404
|
||||
|
||||
# Eventually migration completes
|
||||
env.storage_controller.configure_failpoints((migration_failpoint.value, "off"))
|
||||
migrate_fut.result()
|
||||
|
||||
# Ensure that we detached from the old attached location
|
||||
with pytest.raises(PageserverApiException) as exc:
|
||||
env.get_pageserver(attached_location).http_client().timeline_list(tenant_id)
|
||||
assert exc.value.status_code == 404
|
||||
except:
|
||||
# Always disable 'pause' failpoints, even on failure, to avoid hanging in shutdown
|
||||
env.storage_controller.configure_failpoints((migration_failpoint.value, "off"))
|
||||
raise
|
||||
|
||||
Reference in New Issue
Block a user