From 02a83913eccd6488216139ddc151a06f354b4f1a Mon Sep 17 00:00:00 2001
From: Vlad Lazar
Date: Wed, 12 Mar 2025 15:31:28 +0000
Subject: [PATCH] storcon: do not update observed state on node activation (#11155)

## Problem

When a node becomes active, we query its locations and update the observed
state in-place. This can race with the observed state updates done when
processing reconcile results.

## Summary of changes

The argument for this reconciliation step is that it reduces the need for
background reconciliations. I don't think this is actually true anymore. There
are two cases:

1. Restart of a node after a drain. Usually the node does not go through the
   offline state here, so observed locations were not marked as `None`. In any
   case, there should be at most a handful of shards on the node since we've
   just drained it.
2. Node comes back online after a failure or network partition. When the node
   is marked offline, we reschedule everything away from it. When it later
   becomes active, the previous observed location is extraneous and requires a
   reconciliation anyway.

Instead of overwriting the recorded observed state with the node's report, any
mismatching location is now marked as uncertain (`None`) and left to the
background reconciliation loop; a simplified sketch of this handling follows
the patch.

Closes https://github.com/neondatabase/neon/issues/11148
---
 control_plane/src/bin/neon_local.rs     | 14 +++++++---
 control_plane/src/storage_controller.rs |  7 ++++-
 storage_controller/src/service.rs       | 26 ++++++++++++++++---
 .../regress/test_storage_controller.py  | 11 +++++---
 test_runner/regress/test_tenant_size.py |  2 +-
 5 files changed, 48 insertions(+), 12 deletions(-)

diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index ba1411b615..72ebbafd3b 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -36,7 +36,9 @@ use pageserver_api::config::{
 use pageserver_api::controller_api::{
     NodeAvailabilityWrapper, PlacementPolicy, TenantCreateRequest,
 };
-use pageserver_api::models::{ShardParameters, TimelineCreateRequest, TimelineInfo};
+use pageserver_api::models::{
+    ShardParameters, TenantConfigRequest, TimelineCreateRequest, TimelineInfo,
+};
 use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
 use postgres_backend::AuthType;
 use postgres_connection::parse_host_port;
@@ -1129,12 +1131,16 @@ async fn handle_tenant(subcmd: &TenantCmd, env: &mut local_env::LocalEnv) -> any
             let tenant_id = get_tenant_id(args.tenant_id, env)?;
             let tenant_conf: HashMap<_, _> =
                 args.config.iter().flat_map(|c| c.split_once(':')).collect();
+            let config = PageServerNode::parse_config(tenant_conf)?;

-            pageserver
-                .tenant_config(tenant_id, tenant_conf)
+            let req = TenantConfigRequest { tenant_id, config };
+
+            let storage_controller = StorageController::from_env(env);
+            storage_controller
+                .set_tenant_config(&req)
                 .await
                 .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
-            println!("tenant {tenant_id} successfully configured on the pageserver");
+            println!("tenant {tenant_id} successfully configured via storcon");
         }
     }
     Ok(())
diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs
index 439d7936a7..bbd7f67720 100644
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -14,7 +14,7 @@ use pageserver_api::controller_api::{
     NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
     TenantCreateResponse, TenantLocateResponse,
 };
-use pageserver_api::models::{TimelineCreateRequest, TimelineInfo};
+use pageserver_api::models::{TenantConfigRequest, TimelineCreateRequest, TimelineInfo};
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use postgres_backend::AuthType;
@@ -878,4 +878,9 @@ impl StorageController {
         )
         .await
     }
+
+    pub async fn set_tenant_config(&self, req: &TenantConfigRequest) -> anyhow::Result<()> {
+        self.dispatch(Method::PUT, "v1/tenant/config".to_string(), Some(req))
+            .await
+    }
 }
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 96b67fa81e..667b53b725 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -2004,21 +2004,41 @@ impl Service {
         tracing::info!("Loaded {} LocationConfigs", configs.tenant_shards.len());

         let mut cleanup = Vec::new();
+        let mut mismatched_locations = 0;
         {
             let mut locked = self.inner.write().unwrap();
-            for (tenant_shard_id, observed_loc) in configs.tenant_shards {
+            for (tenant_shard_id, reported) in configs.tenant_shards {
                 let Some(tenant_shard) = locked.tenants.get_mut(&tenant_shard_id) else {
                     cleanup.push(tenant_shard_id);
                     continue;
                 };
-                tenant_shard
+
+                let on_record = &mut tenant_shard
                     .observed
                     .locations
-                    .insert(node.get_id(), ObservedStateLocation { conf: observed_loc });
+                    .entry(node.get_id())
+                    .or_insert_with(|| ObservedStateLocation { conf: None })
+                    .conf;
+
+                // If the location reported by the node does not match our observed state,
+                // then we mark it as uncertain and let the background reconciliation loop
+                // deal with it.
+                //
+                // Note that this also covers net new locations reported by the node.
+                if *on_record != reported {
+                    mismatched_locations += 1;
+                    *on_record = None;
+                }
             }
         }

+        if mismatched_locations > 0 {
+            tracing::info!(
+                "Set observed state to None for {mismatched_locations} mismatched locations"
+            );
+        }
+
         for tenant_shard_id in cleanup {
             tracing::info!("Detaching {tenant_shard_id}");
             match node
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 29919f2fe7..5eaf69cfa1 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -1749,18 +1749,23 @@ def test_storage_controller_re_attach(neon_env_builder: NeonEnvBuilder):
     # Restart the failed pageserver
     victim_ps.start()

+    env.storage_controller.reconcile_until_idle()
+
     # We expect that the re-attach call correctly tipped off the pageserver that its locations
     # are all secondaries now.
     locations = victim_ps.http_client().tenant_list_locations()["tenant_shards"]
     assert len(locations) == 2
     assert all(loc[1]["mode"] == "Secondary" for loc in locations)

-    # We expect that this situation resulted from the re_attach call, and not any explicit
-    # Reconciler runs: assert that the reconciliation count has not gone up since we restarted.
+    # We expect that this situation resulted from background reconciliations:
+    # assert that the reconciliation count has gone up by exactly one
+    # for each shard.
     reconciles_after_restart = env.storage_controller.get_metric_value(
         "storage_controller_reconcile_complete_total", filter={"status": "ok"}
     )
-    assert reconciles_after_restart == reconciles_before_restart
+
+    assert reconciles_before_restart is not None
+    assert reconciles_after_restart == reconciles_before_restart + 2


 def test_storage_controller_shard_scheduling_policy(neon_env_builder: NeonEnvBuilder):
diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py
index 713f89c60f..81e727a3aa 100644
--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -436,7 +436,7 @@ def test_single_branch_get_tenant_size_grows(
     # when our tenant is configured with a tiny pitr interval, dropping a table should
     # cause synthetic size to go down immediately
     tenant_config["pitr_interval"] = "0s"
-    env.pageserver.http_client().set_tenant_config(tenant_id, tenant_config)
+    env.storage_controller.pageserver_api().set_tenant_config(tenant_id, tenant_config)
     (current_lsn, size) = get_current_consistent_size(
         env, endpoint, size_debug_file, http_client, tenant_id, timeline_id
     )
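
Below is the simplified sketch referred to in the summary of changes: a standalone Rust program, not the storage controller code itself. `TenantShardId`, `LocationConf`, and `ObservedStateLocation` are stand-ins for the real types; the point is only to show the new behavior at node activation, where a mismatching (or previously unknown) reported location is downgraded to `None` rather than overwriting the recorded observed state, leaving the background reconciliation loop to converge it.

```rust
use std::collections::HashMap;

// Stand-ins for the real storage controller types.
type TenantShardId = u64;

#[derive(Clone, PartialEq, Debug)]
struct LocationConf {
    mode: String,
}

#[derive(Default, Debug)]
struct ObservedStateLocation {
    // `None` means "uncertain": a Reconciler run is needed to re-establish it.
    conf: Option<LocationConf>,
}

/// Merge the locations reported by a newly activated node into the recorded
/// observed state. Any disagreement (including a brand-new location) is
/// marked as uncertain instead of being trusted, so the background
/// reconciliation loop picks it up. Returns the number of mismatches.
fn apply_node_activation_report(
    recorded: &mut HashMap<TenantShardId, ObservedStateLocation>,
    reported: HashMap<TenantShardId, Option<LocationConf>>,
) -> usize {
    let mut mismatched = 0;
    for (shard, reported_conf) in reported {
        let entry = recorded.entry(shard).or_default();
        if entry.conf != reported_conf {
            entry.conf = None;
            mismatched += 1;
        }
    }
    mismatched
}

fn main() {
    let mut recorded = HashMap::new();
    recorded.insert(
        1,
        ObservedStateLocation {
            conf: Some(LocationConf { mode: "AttachedSingle".to_string() }),
        },
    );

    // The node reports a different mode for shard 1 and a location for shard 2
    // that we have no record of: both end up marked as uncertain.
    let reported = HashMap::from([
        (1, Some(LocationConf { mode: "Secondary".to_string() })),
        (2, Some(LocationConf { mode: "Secondary".to_string() })),
    ]);

    let mismatched = apply_node_activation_report(&mut recorded, reported);
    assert_eq!(mismatched, 2);
    assert!(recorded[&1].conf.is_none());
    assert!(recorded[&2].conf.is_none());
    println!("{mismatched} location(s) marked uncertain");
}
```

The design point mirrors the patch: the controller never overwrites its own record with the activation-time report, it only lowers its confidence, which avoids racing with observed-state updates made while processing reconcile results.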