From d91d018afa2f402473690beb5c2157a63b86879d Mon Sep 17 00:00:00 2001
From: Vlad Lazar
Date: Tue, 22 Jul 2025 12:04:03 +0100
Subject: [PATCH] storcon: handle pageserver disk loss (#12667)

NB: effectively a no-op in the neon env since the handling is config-gated in storcon

## Problem

When a pageserver suffers from a local disk/node failure and restarts, the storage controller will receive a re-attach call and return all the tenants the pageserver is supposed to attach, but the pageserver will not act on any tenants that it doesn't know about locally. As a result, the pageserver will not rehydrate any tenants from remote storage if it restarted following a local disk loss, while the storage controller still thinks that the pageserver has all the tenants attached. This leaves the system in a bad state, and the symptom is that PG's pageserver connections will fail with "tenant not found" errors.

## Summary of changes

Made a slight change to the storage controller's `re_attach` API:
* The pageserver will set an additional flag `empty_local_disk` in the re-attach request, indicating whether it has started with an empty disk or does not know about any tenants.
* Upon receiving the re-attach request, if this `empty_local_disk` flag is set, the storage controller will clear all observed locations referencing the pageserver. The reconciler will then discover the discrepancy between the intended state and the observed state of the tenant and take care of the situation.

To facilitate rollouts, this extra behavior in the `re_attach` API is guarded by the `handle_ps_local_disk_loss` command line flag of the storage controller.

---------

Co-authored-by: William Huang
---
 control_plane/src/bin/neon_local.rs | 7 +++
 control_plane/src/storage_controller.rs | 6 +++
 libs/pageserver_api/src/upcall_api.rs | 8 ++++
 pageserver/src/controller_upcall_client.rs | 3 ++
 pageserver/src/deletion_queue.rs | 1 +
 pageserver/src/tenant/mgr.rs | 3 +-
 storage_controller/src/main.rs | 5 ++
 storage_controller/src/service.rs | 30 ++++++++++++
 test_runner/fixtures/neon_cli.py | 5 ++
 test_runner/fixtures/neon_fixtures.py | 10 +++-
 .../regress/test_hcc_handling_ps_data_loss.py | 47 +++++++++++++++++++
 11 files changed, 122 insertions(+), 3 deletions(-)
 create mode 100644 test_runner/regress/test_hcc_handling_ps_data_loss.py

diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index efc135ed91..e036e9d44b 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -407,6 +407,12 @@ struct StorageControllerStartCmdArgs { help = "Base port for the storage controller instance idenfified by instance-id (defaults to pageserver cplane api)" )] base_port: Option<u16>, + + #[clap( + long, + help = "Whether the storage controller should handle pageserver-reported local disk loss events."
+ )] + handle_ps_local_disk_loss: Option<bool>, } #[derive(clap::Args)]
@@ -1809,6 +1815,7 @@ async fn handle_storage_controller( instance_id: args.instance_id, base_port: args.base_port, start_timeout: args.start_timeout, + handle_ps_local_disk_loss: args.handle_ps_local_disk_loss, }; if let Err(e) = svc.start(start_args).await {
diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs
index f996f39967..35a197112e 100644
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -56,6 +56,7 @@ pub struct NeonStorageControllerStartArgs { pub instance_id: u8, pub base_port: Option<u16>, pub start_timeout: humantime::Duration, + pub handle_ps_local_disk_loss: Option<bool>, } impl NeonStorageControllerStartArgs {
@@ -64,6 +65,7 @@ impl NeonStorageControllerStartArgs { instance_id: 1, base_port: None, start_timeout, + handle_ps_local_disk_loss: None, } } }
@@ -669,6 +671,10 @@ impl StorageController { println!("Starting storage controller at {scheme}://{host}:{listen_port}"); + if start_args.handle_ps_local_disk_loss.unwrap_or_default() { + args.push("--handle-ps-local-disk-loss".to_string()); + } + background_process::start_process( COMMAND, &instance_dir,
diff --git a/libs/pageserver_api/src/upcall_api.rs b/libs/pageserver_api/src/upcall_api.rs
index 07cada2eb1..fa2c896edb 100644
--- a/libs/pageserver_api/src/upcall_api.rs
+++ b/libs/pageserver_api/src/upcall_api.rs
@@ -21,6 +21,14 @@ pub struct ReAttachRequest { /// if the node already has a node_id set. #[serde(skip_serializing_if = "Option::is_none", default)] pub register: Option<NodeRegisterRequest>, + + /// Hadron: Optional flag to indicate whether the node is starting with an empty local disk. /// Will be set to true if the node couldn't find any local tenant data on startup, which could be /// due to the node starting for the first time or due to a local SSD failure/disk wipe event. /// The flag may be used by the storage controller to update its observed state of the world /// to make sure that it sends explicit location_config calls to the node following the /// re-attach request.
+ pub empty_local_disk: Option<bool>, } #[derive(Serialize, Deserialize, Debug)]
diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs
index 8da4cee4b9..96829bd6ea 100644
--- a/pageserver/src/controller_upcall_client.rs
+++ b/pageserver/src/controller_upcall_client.rs
@@ -42,6 +42,7 @@ pub trait StorageControllerUpcallApi { fn re_attach( &self, conf: &PageServerConf, + empty_local_disk: bool, ) -> impl Future< Output = Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError>, > + Send;
@@ -155,6 +156,7 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient { async fn re_attach( &self, conf: &PageServerConf, + empty_local_disk: bool, ) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> { let url = self .base_url
@@ -226,6 +228,7 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient { let request = ReAttachRequest { node_id: self.node_id, register: register.clone(), + empty_local_disk: Some(empty_local_disk), }; let response: ReAttachResponse = self
diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs
index 7854fd9e36..51581ccc2c 100644
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -768,6 +768,7 @@ mod test { async fn re_attach( &self, _conf: &PageServerConf, + _empty_local_disk: bool, ) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> { unimplemented!() }
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 9b196ae393..b47bab16d8 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -352,7 +352,8 @@ async fn init_load_generations( let client = StorageControllerUpcallClient::new(conf, cancel); info!("Calling {} API to re-attach tenants", client.base_url()); // If we are configured to use the control plane API, then it is the source of truth for what tenants to load. - match client.re_attach(conf).await { + let empty_local_disk = tenant_confs.is_empty(); + match client.re_attach(conf, empty_local_disk).await { Ok(tenants) => tenants .into_iter() .flat_map(|(id, rart)| {
diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs
index 5d21feeb10..34d4ac6fba 100644
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -225,6 +225,10 @@ struct Cli { #[arg(long)] shard_split_request_timeout: Option<humantime::Duration>, + + /// **Feature Flag** Whether the storage controller should act to rectify pageserver-reported local disk loss. + #[arg(long, default_value = "false")] + handle_ps_local_disk_loss: bool, } enum StrictMode {
@@ -477,6 +481,7 @@ async fn async_main() -> anyhow::Result<()> { .shard_split_request_timeout .map(humantime::Duration::into) .unwrap_or(Duration::MAX), + handle_ps_local_disk_loss: args.handle_ps_local_disk_loss, }; // Validate that we can connect to the database
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 71186076ec..8f5efe8ac4 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -487,6 +487,9 @@ pub struct Config { /// Timeout used for HTTP client of split requests. [`Duration::MAX`] if None. pub shard_split_request_timeout: Duration, + + // Feature flag: Whether the storage controller should act to rectify pageserver-reported local disk loss. + pub handle_ps_local_disk_loss: bool, } impl From for ApiError {
@@ -2388,6 +2391,33 @@ impl Service { tenants: Vec::new(), }; + // [Hadron] If the pageserver reports in the reattach message that it has an empty disk, it's possible that it just + recovered from a local disk failure.
The response of the reattach request will contain a list of tenants but it + will not be honored by the pageserver in this case (disk failure). We should make sure we clear any observed + locations of tenants attached to the node so that the reconciler will discover the discrepancy and reconfigure the + missing tenants on the node properly. + if self.config.handle_ps_local_disk_loss && reattach_req.empty_local_disk.unwrap_or(false) { + tracing::info!( + "Pageserver {node_id} reports empty local disk, clearing observed locations referencing the pageserver for all tenants", + node_id = reattach_req.node_id + ); + let mut num_tenant_shards_affected = 0; + for (tenant_shard_id, shard) in tenants.iter_mut() { + if shard + .observed + .locations + .remove(&reattach_req.node_id) + .is_some() + { + tracing::info!("Cleared observed location for tenant shard {tenant_shard_id}"); + num_tenant_shards_affected += 1; + } + } + tracing::info!( + "Cleared observed locations for {num_tenant_shards_affected} tenant shards" + ); + } + // TODO: cancel/restart any running reconciliation for this tenant, it might be trying // to call location_conf API with an old generation. Wait for cancellation to complete // before responding to this request. Requires well implemented CancellationToken logic
diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py
index f33d4a0d22..5ad00d155e 100644
--- a/test_runner/fixtures/neon_cli.py
+++ b/test_runner/fixtures/neon_cli.py
@@ -400,6 +400,7 @@ class NeonLocalCli(AbstractNeonCli): timeout_in_seconds: int | None = None, instance_id: int | None = None, base_port: int | None = None, + handle_ps_local_disk_loss: bool | None = None, ): cmd = ["storage_controller", "start"] if timeout_in_seconds is not None:
@@ -408,6 +409,10 @@ class NeonLocalCli(AbstractNeonCli): cmd.append(f"--instance-id={instance_id}") if base_port is not None: cmd.append(f"--base-port={base_port}") + if handle_ps_local_disk_loss is not None: + cmd.append( + f"--handle-ps-local-disk-loss={'true' if handle_ps_local_disk_loss else 'false'}" + ) return self.raw_cli(cmd) def storage_controller_stop(self, immediate: bool, instance_id: int | None = None):
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 1ce34a2c4e..eb7f826873 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1938,9 +1938,12 @@ class NeonStorageController(MetricsGetter, LogUtils): timeout_in_seconds: int | None = None, instance_id: int | None = None, base_port: int | None = None, + handle_ps_local_disk_loss: bool | None = None, ) -> Self: assert not self.running - self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port) + self.env.neon_cli.storage_controller_start( + timeout_in_seconds, instance_id, base_port, handle_ps_local_disk_loss + ) self.running = True return self
@@ -2838,10 +2841,13 @@ class NeonProxiedStorageController(NeonStorageController): timeout_in_seconds: int | None = None, instance_id: int | None = None, base_port: int | None = None, + handle_ps_local_disk_loss: bool | None = None, ) -> Self: assert instance_id is not None and base_port is not None - self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port) + self.env.neon_cli.storage_controller_start( + timeout_in_seconds, instance_id, base_port, handle_ps_local_disk_loss + ) self.instances[instance_id] = {"running": True} self.running = True
diff --git a/test_runner/regress/test_hcc_handling_ps_data_loss.py b/test_runner/regress/test_hcc_handling_ps_data_loss.py
new file mode 100644
index 0000000000..35d3b72923
--- /dev/null
+++ b/test_runner/regress/test_hcc_handling_ps_data_loss.py
@@ -0,0 +1,47 @@ +import shutil + +from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.utils import query_scalar + + +def test_hcc_handling_ps_data_loss( + neon_env_builder: NeonEnvBuilder, +): + """ + Test that following a pageserver local data loss event, the system can recover automatically (i.e. + rehydrating the restarted pageserver from remote storage) without manual intervention. The + pageserver indicates to the storage controller that it has restarted without any local tenant + data in its "reattach" request and the storage controller uses this information to detect the + data loss condition and reconfigure the pageserver as necessary. + """ + env = neon_env_builder.init_configs() + env.broker.start() + env.storage_controller.start(handle_ps_local_disk_loss=True) + env.pageserver.start() + for sk in env.safekeepers: + sk.start() + + # create a new tenant + tenant_id, _ = env.create_tenant(shard_count=4) + + endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) + with endpoint.cursor() as cur: + cur.execute("SELECT pg_logical_emit_message(false, 'neon-test', 'between inserts')") + cur.execute("CREATE DATABASE testdb") + + with endpoint.cursor(dbname="testdb") as cur: + cur.execute("CREATE TABLE tbl_one_hundred_rows AS SELECT generate_series(1,100)") + endpoint.stop() + + # Kill the pageserver, remove the `tenants/` directory, and restart. This simulates a pageserver + # that restarted with the same ID but has lost all its local disk data. + env.pageserver.stop(immediate=True) + shutil.rmtree(env.pageserver.tenant_dir()) + env.pageserver.start() + + # Test that the endpoint can start and query the database after the pageserver restarts. This + # indirectly tests that the pageserver was able to rehydrate the tenant data it lost from remote + # storage automatically. + endpoint.start() + with endpoint.cursor(dbname="testdb") as cur: + assert query_scalar(cur, "SELECT count(*) FROM tbl_one_hundred_rows") == 100
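
For illustration, with the `ReAttachRequest` change above, a pageserver that comes back up with a wiped local disk would send a re-attach body roughly like the sketch below. This is a sketch, not part of the patch: the field names come from the struct in `upcall_api.rs`, the `node_id` value is invented for the example, and `register` is omitted because the struct skips `None` values when serializing.

```python
import json

# Rough shape of the re-attach request body after this change (illustrative only).
body = {
    "node_id": 1,              # example value, not taken from the patch
    "empty_local_disk": True,  # new flag: no local tenant data was found on startup
    # "register" is omitted here: it is skipped when None
    # (#[serde(skip_serializing_if = "Option::is_none")]).
}
print(json.dumps(body, indent=2))
```

When the storage controller is started with `--handle-ps-local-disk-loss`, a request like this causes it to drop every observed location that references the reporting node, and the reconciler then re-issues location_config calls so the pageserver rehydrates its tenants from remote storage.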