diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index fb7b4356d1..33b3d88c25 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -598,7 +598,10 @@ async fn handle_tenant_timeline_passthrough( let _timer = latency.start_timer(labels.clone()); - let client = mgmt_api::Client::new(node.base_url(), service.get_config().jwt_token.as_deref()); + let client = mgmt_api::Client::new( + node.base_url(), + service.get_config().pageserver_jwt_token.as_deref(), + ); let resp = client.get_raw(path).await.map_err(|e| // We return 503 here because if we can't successfully send a request to the pageserver, // either we aren't available or the pageserver is unavailable. diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index be074d269d..18922b9e05 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -53,6 +53,10 @@ struct Cli { #[arg(long)] jwt_token: Option, + /// Token for authenticating this service with the safekeepers it controls + #[arg(long)] + safekeeper_jwt_token: Option, + /// Token for authenticating this service with the control plane, when calling /// the compute notification endpoint #[arg(long)] @@ -153,7 +157,8 @@ impl Default for StrictMode { struct Secrets { database_url: String, public_key: Option, - jwt_token: Option, + pageserver_jwt_token: Option, + safekeeper_jwt_token: Option, control_plane_jwt_token: Option, peer_jwt_token: Option, } @@ -161,6 +166,7 @@ struct Secrets { impl Secrets { const DATABASE_URL_ENV: &'static str = "DATABASE_URL"; const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN"; + const SAFEKEEPER_JWT_TOKEN_ENV: &'static str = "SAFEKEEPER_JWT_TOKEN"; const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN"; const PEER_JWT_TOKEN_ENV: &'static str = "PEER_JWT_TOKEN"; const PUBLIC_KEY_ENV: &'static str = "PUBLIC_KEY"; @@ -184,7 +190,14 @@ impl Secrets { let this = Self { database_url, public_key, - jwt_token: Self::load_secret(&args.jwt_token, Self::PAGESERVER_JWT_TOKEN_ENV), + pageserver_jwt_token: Self::load_secret( + &args.jwt_token, + Self::PAGESERVER_JWT_TOKEN_ENV, + ), + safekeeper_jwt_token: Self::load_secret( + &args.safekeeper_jwt_token, + Self::SAFEKEEPER_JWT_TOKEN_ENV, + ), control_plane_jwt_token: Self::load_secret( &args.control_plane_jwt_token, Self::CONTROL_PLANE_JWT_TOKEN_ENV, @@ -264,11 +277,17 @@ async fn async_main() -> anyhow::Result<()> { let secrets = Secrets::load(&args).await?; + // TODO: once we've rolled out the safekeeper JWT token everywhere, put it into the validation code below + tracing::info!( + "safekeeper_jwt_token set: {:?}", + secrets.safekeeper_jwt_token.is_some() + ); + // Validate required secrets and arguments are provided in strict mode match strict_mode { StrictMode::Strict if (secrets.public_key.is_none() - || secrets.jwt_token.is_none() + || secrets.pageserver_jwt_token.is_none() || secrets.control_plane_jwt_token.is_none()) => { // Production systems should always have secrets configured: if public_key was not set @@ -293,7 +312,8 @@ async fn async_main() -> anyhow::Result<()> { } let config = Config { - jwt_token: secrets.jwt_token, + pageserver_jwt_token: secrets.pageserver_jwt_token, + safekeeper_jwt_token: secrets.safekeeper_jwt_token, control_plane_jwt_token: secrets.control_plane_jwt_token, peer_jwt_token: secrets.peer_jwt_token, compute_hook_url: args.compute_hook_url, diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 48f0804926..4fda7338e5 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -296,7 +296,7 @@ impl Reconciler { .location_config(tenant_shard_id, config.clone(), flush_ms, lazy) .await }, - &self.service_config.jwt_token, + &self.service_config.pageserver_jwt_token, 1, 3, timeout, @@ -417,7 +417,7 @@ impl Reconciler { let client = PageserverClient::new( node.get_id(), node.base_url(), - self.service_config.jwt_token.as_deref(), + self.service_config.pageserver_jwt_token.as_deref(), ); client @@ -440,7 +440,7 @@ impl Reconciler { let client = PageserverClient::new( node.get_id(), node.base_url(), - self.service_config.jwt_token.as_deref(), + self.service_config.pageserver_jwt_token.as_deref(), ); let timelines = client.timeline_list(&tenant_shard_id).await?; @@ -478,7 +478,7 @@ impl Reconciler { ) .await }, - &self.service_config.jwt_token, + &self.service_config.pageserver_jwt_token, 1, 3, request_download_timeout * 2, @@ -771,7 +771,7 @@ impl Reconciler { let observed_conf = match attached_node .with_client_retries( |client| async move { client.get_location_config(tenant_shard_id).await }, - &self.service_config.jwt_token, + &self.service_config.pageserver_jwt_token, 1, 1, Duration::from_secs(5), @@ -1099,7 +1099,7 @@ impl Reconciler { match origin .with_client_retries( |client| async move { client.get_location_config(tenant_shard_id).await }, - &self.service_config.jwt_token, + &self.service_config.pageserver_jwt_token, 1, 3, Duration::from_secs(5), diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 25a1cb4252..1bff5a37db 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -348,7 +348,12 @@ pub struct Config { // All pageservers managed by one instance of this service must have // the same public key. This JWT token will be used to authenticate // this service to the pageservers it manages. - pub jwt_token: Option, + pub pageserver_jwt_token: Option, + + // All safekeepers managed by one instance of this service must have + // the same public key. This JWT token will be used to authenticate + // this service to the safekeepers it manages. + pub safekeeper_jwt_token: Option, // This JWT token will be used to authenticate this service to the control plane. pub control_plane_jwt_token: Option, @@ -882,7 +887,7 @@ impl Service { let response = node .with_client_retries( |client| async move { client.list_location_config().await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 1, 5, timeout, @@ -983,7 +988,7 @@ impl Service { let client = PageserverClient::new( node.get_id(), node.base_url(), - self.config.jwt_token.as_deref(), + self.config.pageserver_jwt_token.as_deref(), ); match client .location_config( @@ -1553,14 +1558,14 @@ impl Service { let reconcilers_cancel = cancel.child_token(); let heartbeater_ps = Heartbeater::new( - config.jwt_token.clone(), + config.pageserver_jwt_token.clone(), config.max_offline_interval, config.max_warming_up_interval, cancel.clone(), ); let heartbeater_sk = Heartbeater::new( - config.jwt_token.clone(), + config.safekeeper_jwt_token.clone(), config.max_offline_interval, config.max_warming_up_interval, cancel.clone(), @@ -1907,7 +1912,7 @@ impl Service { let configs = match node .with_client_retries( |client| async move { client.list_location_config().await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 1, 5, SHORT_RECONCILE_TIMEOUT, @@ -1965,7 +1970,7 @@ impl Service { .location_config(tenant_shard_id, config, None, false) .await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 1, 5, SHORT_RECONCILE_TIMEOUT, @@ -3100,7 +3105,7 @@ impl Service { let client = PageserverClient::new( node.get_id(), node.base_url(), - self.config.jwt_token.as_deref(), + self.config.pageserver_jwt_token.as_deref(), ); tracing::info!("Doing time travel recovery for shard {tenant_shard_id}",); @@ -3161,7 +3166,7 @@ impl Service { let client = PageserverClient::new( node.get_id(), node.base_url(), - self.config.jwt_token.as_deref(), + self.config.pageserver_jwt_token.as_deref(), ); futs.push(async move { let result = client @@ -3284,7 +3289,7 @@ impl Service { .tenant_delete(TenantShardId::unsharded(tenant_id)) .await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 1, 3, RECONCILE_TIMEOUT, @@ -3503,7 +3508,7 @@ impl Service { let timeline_info = create_one( shard_zero_tid, shard_zero_locations, - self.config.jwt_token.clone(), + self.config.pageserver_jwt_token.clone(), create_req.clone(), ) .await?; @@ -3519,7 +3524,7 @@ impl Service { // Create timeline on remaining shards with number >0 if !targets.0.is_empty() { // If we had multiple shards, issue requests for the remainder now. - let jwt = &self.config.jwt_token; + let jwt = &self.config.pageserver_jwt_token; self.tenant_for_shards( targets .0 @@ -3602,7 +3607,7 @@ impl Service { tenant_shard_id, timeline_id, node, - self.config.jwt_token.clone(), + self.config.pageserver_jwt_token.clone(), req.clone(), )) }) @@ -3683,7 +3688,7 @@ impl Service { tenant_shard_id, timeline_id, node, - self.config.jwt_token.clone(), + self.config.pageserver_jwt_token.clone(), )) }) .await?; @@ -3757,7 +3762,7 @@ impl Service { tenant_shard_id, timeline_id, node, - self.config.jwt_token.clone(), + self.config.pageserver_jwt_token.clone(), dir, )) }) @@ -3872,7 +3877,7 @@ impl Service { futs.push(async move { node.with_client_retries( |client| op(tenant_shard_id, client), - &self.config.jwt_token, + &self.config.pageserver_jwt_token, warn_threshold, max_retries, timeout, @@ -4121,7 +4126,7 @@ impl Service { tenant_shard_id, timeline_id, node, - self.config.jwt_token.clone(), + self.config.pageserver_jwt_token.clone(), )) }) .await?; @@ -4143,7 +4148,7 @@ impl Service { shard_zero_tid, timeline_id, shard_zero_locations.latest.node, - self.config.jwt_token.clone(), + self.config.pageserver_jwt_token.clone(), ) .await?; Ok(shard_zero_status) @@ -4542,7 +4547,7 @@ impl Service { client.location_config(child_id, config, None, false).await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 1, 10, Duration::from_secs(5), @@ -5142,7 +5147,7 @@ impl Service { let client = PageserverClient::new( node.get_id(), node.base_url(), - self.config.jwt_token.as_deref(), + self.config.pageserver_jwt_token.as_deref(), ); let response = client .tenant_shard_split( @@ -5468,7 +5473,7 @@ impl Service { let client = PageserverClient::new( node.get_id(), node.base_url(), - self.config.jwt_token.as_deref(), + self.config.pageserver_jwt_token.as_deref(), ); let scan_result = client @@ -7094,7 +7099,7 @@ impl Service { match attached_node .with_client_retries( |client| async move { client.tenant_heatmap_upload(tenant_shard_id).await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 3, 10, SHORT_RECONCILE_TIMEOUT, @@ -7130,7 +7135,7 @@ impl Service { ) .await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 3, 10, SHORT_RECONCILE_TIMEOUT, @@ -7185,7 +7190,7 @@ impl Service { let request = request_ref.clone(); client.top_tenant_shards(request.clone()).await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 3, 3, Duration::from_secs(5), @@ -7358,7 +7363,7 @@ impl Service { match node .with_client_retries( |client| async move { client.tenant_secondary_status(tenant_shard_id).await }, - &self.config.jwt_token, + &self.config.pageserver_jwt_token, 1, 3, Duration::from_millis(250),