diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index efc135ed91..e036e9d44b 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -407,6 +407,12 @@ struct StorageControllerStartCmdArgs {
         help = "Base port for the storage controller instance idenfified by instance-id (defaults to pageserver cplane api)"
     )]
     base_port: Option<u16>,
+
+    #[clap(
+        long,
+        help = "Whether the storage controller should handle pageserver-reported local disk loss events."
+    )]
+    handle_ps_local_disk_loss: Option<bool>,
 }
 
 #[derive(clap::Args)]
@@ -1809,6 +1815,7 @@ async fn handle_storage_controller(
         instance_id: args.instance_id,
         base_port: args.base_port,
         start_timeout: args.start_timeout,
+        handle_ps_local_disk_loss: args.handle_ps_local_disk_loss,
     };
 
     if let Err(e) = svc.start(start_args).await {
diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs
index f996f39967..35a197112e 100644
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -56,6 +56,7 @@ pub struct NeonStorageControllerStartArgs {
     pub instance_id: u8,
     pub base_port: Option<u16>,
     pub start_timeout: humantime::Duration,
+    pub handle_ps_local_disk_loss: Option<bool>,
 }
 
 impl NeonStorageControllerStartArgs {
@@ -64,6 +65,7 @@ impl NeonStorageControllerStartArgs {
             instance_id: 1,
             base_port: None,
             start_timeout,
+            handle_ps_local_disk_loss: None,
         }
     }
 }
@@ -669,6 +671,10 @@ impl StorageController {
 
         println!("Starting storage controller at {scheme}://{host}:{listen_port}");
 
+        if start_args.handle_ps_local_disk_loss.unwrap_or_default() {
+            args.push("--handle-ps-local-disk-loss".to_string());
+        }
+
         background_process::start_process(
             COMMAND,
             &instance_dir,
diff --git a/libs/pageserver_api/src/upcall_api.rs b/libs/pageserver_api/src/upcall_api.rs
index 07cada2eb1..fa2c896edb 100644
--- a/libs/pageserver_api/src/upcall_api.rs
+++ b/libs/pageserver_api/src/upcall_api.rs
@@ -21,6 +21,14 @@ pub struct ReAttachRequest {
     /// if the node already has a node_id set.
     #[serde(skip_serializing_if = "Option::is_none", default)]
    pub register: Option<NodeRegisterRequest>,
+
+    /// Hadron: Optional flag to indicate whether the node is starting with an empty local disk.
+    /// Will be set to true if the node couldn't find any local tenant data on startup, which could
+    /// be due to the node starting for the first time or a local SSD failure/disk wipe event.
+    /// The flag may be used by the storage controller to update its observed state of the world
+    /// to make sure that it sends explicit location_config calls to the node following the
+    /// re-attach request.
+    pub empty_local_disk: Option<bool>,
 }
 
 #[derive(Serialize, Deserialize, Debug)]
diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs
index 8da4cee4b9..96829bd6ea 100644
--- a/pageserver/src/controller_upcall_client.rs
+++ b/pageserver/src/controller_upcall_client.rs
@@ -42,6 +42,7 @@ pub trait StorageControllerUpcallApi {
     fn re_attach(
         &self,
         conf: &PageServerConf,
+        empty_local_disk: bool,
     ) -> impl Future<
         Output = Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError>,
     > + Send;
@@ -155,6 +156,7 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
     async fn re_attach(
         &self,
         conf: &PageServerConf,
+        empty_local_disk: bool,
     ) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> {
         let url = self
             .base_url
@@ -226,6 +228,7 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
         let request = ReAttachRequest {
             node_id: self.node_id,
             register: register.clone(),
+            empty_local_disk: Some(empty_local_disk),
         };
 
         let response: ReAttachResponse = self
diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs
index 7854fd9e36..51581ccc2c 100644
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -768,6 +768,7 @@ mod test {
     async fn re_attach(
         &self,
         _conf: &PageServerConf,
+        _empty_local_disk: bool,
     ) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> {
         unimplemented!()
     }
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 9b196ae393..b47bab16d8 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -352,7 +352,8 @@ async fn init_load_generations(
     let client = StorageControllerUpcallClient::new(conf, cancel);
     info!("Calling {} API to re-attach tenants", client.base_url());
     // If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
-    match client.re_attach(conf).await {
+    let empty_local_disk = tenant_confs.is_empty();
+    match client.re_attach(conf, empty_local_disk).await {
         Ok(tenants) => tenants
             .into_iter()
             .flat_map(|(id, rart)| {
diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs
index 5d21feeb10..34d4ac6fba 100644
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -225,6 +225,10 @@ struct Cli {
 
     #[arg(long)]
     shard_split_request_timeout: Option<humantime::Duration>,
+
+    /// **Feature Flag** Whether the storage controller should act to rectify pageserver-reported local disk loss.
+    #[arg(long, default_value = "false")]
+    handle_ps_local_disk_loss: bool,
 }
 
 enum StrictMode {
@@ -477,6 +481,7 @@ async fn async_main() -> anyhow::Result<()> {
             .shard_split_request_timeout
             .map(humantime::Duration::into)
             .unwrap_or(Duration::MAX),
+        handle_ps_local_disk_loss: args.handle_ps_local_disk_loss,
     };
 
     // Validate that we can connect to the database
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 71186076ec..8f5efe8ac4 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -487,6 +487,9 @@ pub struct Config {
     /// Timeout used for HTTP client of split requests. [`Duration::MAX`] if None.
     pub shard_split_request_timeout: Duration,
+
+    // Feature flag: Whether the storage controller should act to rectify pageserver-reported local disk loss.
+    pub handle_ps_local_disk_loss: bool,
 }
 
 impl From for ApiError {
@@ -2388,6 +2391,33 @@ impl Service {
             tenants: Vec::new(),
         };
 
+        // [Hadron] If the pageserver reports in the reattach message that it has an empty disk, it's possible that it just
+        // recovered from a local disk failure. The response of the reattach request will contain a list of tenants but it
+        // will not be honored by the pageserver in this case (disk failure). We should make sure we clear any observed
+        // locations of tenants attached to the node so that the reconciler will discover the discrepancy and reconfigure the
+        // missing tenants on the node properly.
+        if self.config.handle_ps_local_disk_loss && reattach_req.empty_local_disk.unwrap_or(false) {
+            tracing::info!(
+                "Pageserver {node_id} reports empty local disk, clearing observed locations referencing the pageserver for all tenants",
+                node_id = reattach_req.node_id
+            );
+            let mut num_tenant_shards_affected = 0;
+            for (tenant_shard_id, shard) in tenants.iter_mut() {
+                if shard
+                    .observed
+                    .locations
+                    .remove(&reattach_req.node_id)
+                    .is_some()
+                {
+                    tracing::info!("Cleared observed location for tenant shard {tenant_shard_id}");
+                    num_tenant_shards_affected += 1;
+                }
+            }
+            tracing::info!(
+                "Cleared observed locations for {num_tenant_shards_affected} tenant shards"
+            );
+        }
+
         // TODO: cancel/restart any running reconciliation for this tenant, it might be trying
         // to call location_conf API with an old generation. Wait for cancellation to complete
         // before responding to this request. Requires well implemented CancellationToken logic
diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py
index f33d4a0d22..5ad00d155e 100644
--- a/test_runner/fixtures/neon_cli.py
+++ b/test_runner/fixtures/neon_cli.py
@@ -400,6 +400,7 @@ class NeonLocalCli(AbstractNeonCli):
         timeout_in_seconds: int | None = None,
         instance_id: int | None = None,
         base_port: int | None = None,
+        handle_ps_local_disk_loss: bool | None = None,
     ):
         cmd = ["storage_controller", "start"]
         if timeout_in_seconds is not None:
@@ -408,6 +409,10 @@ class NeonLocalCli(AbstractNeonCli):
             cmd.append(f"--instance-id={instance_id}")
         if base_port is not None:
             cmd.append(f"--base-port={base_port}")
+        if handle_ps_local_disk_loss is not None:
+            cmd.append(
+                f"--handle-ps-local-disk-loss={'true' if handle_ps_local_disk_loss else 'false'}"
+            )
         return self.raw_cli(cmd)
 
     def storage_controller_stop(self, immediate: bool, instance_id: int | None = None):
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 1ce34a2c4e..eb7f826873 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1938,9 +1938,12 @@ class NeonStorageController(MetricsGetter, LogUtils):
         timeout_in_seconds: int | None = None,
         instance_id: int | None = None,
         base_port: int | None = None,
+        handle_ps_local_disk_loss: bool | None = None,
     ) -> Self:
         assert not self.running
-        self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port)
+        self.env.neon_cli.storage_controller_start(
+            timeout_in_seconds, instance_id, base_port, handle_ps_local_disk_loss
+        )
         self.running = True
         return self
 
@@ -2838,10 +2841,13 @@ class NeonProxiedStorageController(NeonStorageController):
         timeout_in_seconds: int | None = None,
         instance_id: int | None = None,
         base_port: int | None = None,
+        handle_ps_local_disk_loss: bool | None = None,
     ) -> Self:
         assert instance_id is not None and base_port is not None
 
-        self.env.neon_cli.storage_controller_start(timeout_in_seconds, instance_id, base_port)
+        self.env.neon_cli.storage_controller_start(
+            timeout_in_seconds, instance_id, base_port, handle_ps_local_disk_loss
+        )
         self.instances[instance_id] = {"running": True}
 
         self.running = True
diff --git a/test_runner/regress/test_hcc_handling_ps_data_loss.py b/test_runner/regress/test_hcc_handling_ps_data_loss.py
new file mode 100644
index 0000000000..35d3b72923
--- /dev/null
+++ b/test_runner/regress/test_hcc_handling_ps_data_loss.py
@@ -0,0 +1,47 @@
+import shutil
+
+from fixtures.neon_fixtures import NeonEnvBuilder
+from fixtures.utils import query_scalar
+
+
+def test_hcc_handling_ps_data_loss(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    Test that following a pageserver local data loss event, the system can recover automatically (i.e.
+    rehydrating the restarted pageserver from remote storage) without manual intervention. The
+    pageserver indicates to the storage controller that it has restarted without any local tenant
+    data in its "reattach" request, and the storage controller uses this information to detect the
+    data loss condition and reconfigure the pageserver as necessary.
+    """
+    env = neon_env_builder.init_configs()
+    env.broker.start()
+    env.storage_controller.start(handle_ps_local_disk_loss=True)
+    env.pageserver.start()
+    for sk in env.safekeepers:
+        sk.start()
+
+    # Create a new tenant.
+    tenant_id, _ = env.create_tenant(shard_count=4)
+
+    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
+    with endpoint.cursor() as cur:
+        cur.execute("SELECT pg_logical_emit_message(false, 'neon-test', 'between inserts')")
+        cur.execute("CREATE DATABASE testdb")
+
+    with endpoint.cursor(dbname="testdb") as cur:
+        cur.execute("CREATE TABLE tbl_one_hundred_rows AS SELECT generate_series(1,100)")
+    endpoint.stop()
+
+    # Kill the pageserver, remove the `tenants/` directory, and restart. This simulates a pageserver
+    # that restarted with the same ID but has lost all its local disk data.
+    env.pageserver.stop(immediate=True)
+    shutil.rmtree(env.pageserver.tenant_dir())
+    env.pageserver.start()
+
+    # Test that the endpoint can start and query the database after the pageserver restarts. This
+    # indirectly tests that the pageserver was able to rehydrate the tenant data it lost from remote
+    # storage automatically.
+    endpoint.start()
+    with endpoint.cursor(dbname="testdb") as cur:
+        assert query_scalar(cur, "SELECT count(*) FROM tbl_one_hundred_rows") == 100
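Reviewer note: for anyone who wants to poke at the new field by hand, below is a minimal sketch of the re-attach payload as a pageserver would serialize it after finding no local tenant data. The JSON keys follow the serde derive on `ReAttachRequest` above; the storage controller URL and upcall path in the sketch are illustrative assumptions, not something defined in this diff.

```python
import json

import requests  # assumption: any HTTP client works; requests is used for brevity

# Payload shape implied by the serde-derived ReAttachRequest in upcall_api.rs:
# `register` is omitted when None (skip_serializing_if = "Option::is_none"),
# and `empty_local_disk` is the new optional flag added by this change.
re_attach_request = {
    "node_id": 1,
    "empty_local_disk": True,
}

# Illustrative endpoint only: the base URL and path below are assumptions made
# for this sketch, not taken from this diff.
resp = requests.post(
    "http://127.0.0.1:1234/upcall/v1/re-attach",
    json=re_attach_request,
    timeout=10,
)
print(json.dumps(resp.json(), indent=2))
```

Keeping the field an `Option<bool>` means a pageserver that never sends it behaves as before: the controller treats a missing flag as `false` via `unwrap_or(false)`, and the clearing of observed locations only happens when `--handle-ps-local-disk-loss` is explicitly enabled.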