storcon: infrastructure for safekeeper specific JWT tokens (#10905)

Safekeepers only respond to requests with the per-tenant scope, or the
`safekeeperdata` JWT scope. Therefore, add infrastructure in the storage
controller for safekeeper JWTs. Also, rename the ambiguous `jwt_token`
to `pageserver_jwt_token`.

Part of #9011
Related: https://github.com/neondatabase/cloud/issues/24727
This commit is contained in:
Arpad Müller
2025-02-21 12:02:02 +01:00
committed by GitHub
parent f927ae6e15
commit ff3819efc7
4 changed files with 64 additions and 36 deletions

View File

@@ -598,7 +598,10 @@ async fn handle_tenant_timeline_passthrough(
let _timer = latency.start_timer(labels.clone());
let client = mgmt_api::Client::new(node.base_url(), service.get_config().jwt_token.as_deref());
let client = mgmt_api::Client::new(
node.base_url(),
service.get_config().pageserver_jwt_token.as_deref(),
);
let resp = client.get_raw(path).await.map_err(|e|
// We return 503 here because if we can't successfully send a request to the pageserver,
// either we aren't available or the pageserver is unavailable.

View File

@@ -53,6 +53,10 @@ struct Cli {
#[arg(long)]
jwt_token: Option<String>,
/// Token for authenticating this service with the safekeepers it controls
#[arg(long)]
safekeeper_jwt_token: Option<String>,
/// Token for authenticating this service with the control plane, when calling
/// the compute notification endpoint
#[arg(long)]
@@ -153,7 +157,8 @@ impl Default for StrictMode {
struct Secrets {
database_url: String,
public_key: Option<JwtAuth>,
jwt_token: Option<String>,
pageserver_jwt_token: Option<String>,
safekeeper_jwt_token: Option<String>,
control_plane_jwt_token: Option<String>,
peer_jwt_token: Option<String>,
}
@@ -161,6 +166,7 @@ struct Secrets {
impl Secrets {
const DATABASE_URL_ENV: &'static str = "DATABASE_URL";
const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN";
const SAFEKEEPER_JWT_TOKEN_ENV: &'static str = "SAFEKEEPER_JWT_TOKEN";
const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN";
const PEER_JWT_TOKEN_ENV: &'static str = "PEER_JWT_TOKEN";
const PUBLIC_KEY_ENV: &'static str = "PUBLIC_KEY";
@@ -184,7 +190,14 @@ impl Secrets {
let this = Self {
database_url,
public_key,
jwt_token: Self::load_secret(&args.jwt_token, Self::PAGESERVER_JWT_TOKEN_ENV),
pageserver_jwt_token: Self::load_secret(
&args.jwt_token,
Self::PAGESERVER_JWT_TOKEN_ENV,
),
safekeeper_jwt_token: Self::load_secret(
&args.safekeeper_jwt_token,
Self::SAFEKEEPER_JWT_TOKEN_ENV,
),
control_plane_jwt_token: Self::load_secret(
&args.control_plane_jwt_token,
Self::CONTROL_PLANE_JWT_TOKEN_ENV,
@@ -264,11 +277,17 @@ async fn async_main() -> anyhow::Result<()> {
let secrets = Secrets::load(&args).await?;
// TODO: once we've rolled out the safekeeper JWT token everywhere, put it into the validation code below
tracing::info!(
"safekeeper_jwt_token set: {:?}",
secrets.safekeeper_jwt_token.is_some()
);
// Validate required secrets and arguments are provided in strict mode
match strict_mode {
StrictMode::Strict
if (secrets.public_key.is_none()
|| secrets.jwt_token.is_none()
|| secrets.pageserver_jwt_token.is_none()
|| secrets.control_plane_jwt_token.is_none()) =>
{
// Production systems should always have secrets configured: if public_key was not set
@@ -293,7 +312,8 @@ async fn async_main() -> anyhow::Result<()> {
}
let config = Config {
jwt_token: secrets.jwt_token,
pageserver_jwt_token: secrets.pageserver_jwt_token,
safekeeper_jwt_token: secrets.safekeeper_jwt_token,
control_plane_jwt_token: secrets.control_plane_jwt_token,
peer_jwt_token: secrets.peer_jwt_token,
compute_hook_url: args.compute_hook_url,

View File

@@ -296,7 +296,7 @@ impl Reconciler {
.location_config(tenant_shard_id, config.clone(), flush_ms, lazy)
.await
},
&self.service_config.jwt_token,
&self.service_config.pageserver_jwt_token,
1,
3,
timeout,
@@ -417,7 +417,7 @@ impl Reconciler {
let client = PageserverClient::new(
node.get_id(),
node.base_url(),
self.service_config.jwt_token.as_deref(),
self.service_config.pageserver_jwt_token.as_deref(),
);
client
@@ -440,7 +440,7 @@ impl Reconciler {
let client = PageserverClient::new(
node.get_id(),
node.base_url(),
self.service_config.jwt_token.as_deref(),
self.service_config.pageserver_jwt_token.as_deref(),
);
let timelines = client.timeline_list(&tenant_shard_id).await?;
@@ -478,7 +478,7 @@ impl Reconciler {
)
.await
},
&self.service_config.jwt_token,
&self.service_config.pageserver_jwt_token,
1,
3,
request_download_timeout * 2,
@@ -771,7 +771,7 @@ impl Reconciler {
let observed_conf = match attached_node
.with_client_retries(
|client| async move { client.get_location_config(tenant_shard_id).await },
&self.service_config.jwt_token,
&self.service_config.pageserver_jwt_token,
1,
1,
Duration::from_secs(5),
@@ -1099,7 +1099,7 @@ impl Reconciler {
match origin
.with_client_retries(
|client| async move { client.get_location_config(tenant_shard_id).await },
&self.service_config.jwt_token,
&self.service_config.pageserver_jwt_token,
1,
3,
Duration::from_secs(5),

View File

@@ -348,7 +348,12 @@ pub struct Config {
// All pageservers managed by one instance of this service must have
// the same public key. This JWT token will be used to authenticate
// this service to the pageservers it manages.
pub jwt_token: Option<String>,
pub pageserver_jwt_token: Option<String>,
// All safekeepers managed by one instance of this service must have
// the same public key. This JWT token will be used to authenticate
// this service to the safekeepers it manages.
pub safekeeper_jwt_token: Option<String>,
// This JWT token will be used to authenticate this service to the control plane.
pub control_plane_jwt_token: Option<String>,
@@ -882,7 +887,7 @@ impl Service {
let response = node
.with_client_retries(
|client| async move { client.list_location_config().await },
&self.config.jwt_token,
&self.config.pageserver_jwt_token,
1,
5,
timeout,
@@ -983,7 +988,7 @@ impl Service {
let client = PageserverClient::new(
node.get_id(),
node.base_url(),
self.config.jwt_token.as_deref(),
self.config.pageserver_jwt_token.as_deref(),
);
match client
.location_config(
@@ -1553,14 +1558,14 @@ impl Service {
let reconcilers_cancel = cancel.child_token();
let heartbeater_ps = Heartbeater::new(
config.jwt_token.clone(),
config.pageserver_jwt_token.clone(),
config.max_offline_interval,
config.max_warming_up_interval,
cancel.clone(),
);
let heartbeater_sk = Heartbeater::new(
config.jwt_token.clone(),
config.safekeeper_jwt_token.clone(),
config.max_offline_interval,
config.max_warming_up_interval,
cancel.clone(),
@@ -1907,7 +1912,7 @@ impl Service {
let configs = match node
.with_client_retries(
|client| async move { client.list_location_config().await },
&self.config.jwt_token,
&self.config.pageserver_jwt_token,
1,
5,
SHORT_RECONCILE_TIMEOUT,
@@ -1965,7 +1970,7 @@ impl Service {
.location_config(tenant_shard_id, config, None, false)
.await
},
&self.config.jwt_token,
&self.config.pageserver_jwt_token,
1,
5,
SHORT_RECONCILE_TIMEOUT,
@@ -3100,7 +3105,7 @@ impl Service {
let client = PageserverClient::new(
node.get_id(),
node.base_url(),
self.config.jwt_token.as_deref(),
self.config.pageserver_jwt_token.as_deref(),
);
tracing::info!("Doing time travel recovery for shard {tenant_shard_id}",);
@@ -3161,7 +3166,7 @@ impl Service {
let client = PageserverClient::new(
node.get_id(),
node.base_url(),
self.config.jwt_token.as_deref(),
self.config.pageserver_jwt_token.as_deref(),
);
futs.push(async move {
let result = client
@@ -3284,7 +3289,7 @@ impl Service {
.tenant_delete(TenantShardId::unsharded(tenant_id))
.await
},
&self.config.jwt_token,
&self.config.pageserver_jwt_token,
1,
3,
RECONCILE_TIMEOUT,
@@ -3503,7 +3508,7 @@ impl Service {
let timeline_info = create_one(
shard_zero_tid,
shard_zero_locations,
self.config.jwt_token.clone(),
self.config.pageserver_jwt_token.clone(),
create_req.clone(),
)
.await?;
@@ -3519,7 +3524,7 @@ impl Service {
// Create timeline on remaining shards with number >0
if !targets.0.is_empty() {
// If we had multiple shards, issue requests for the remainder now.
let jwt = &self.config.jwt_token;
let jwt = &self.config.pageserver_jwt_token;
self.tenant_for_shards(
targets
.0
@@ -3602,7 +3607,7 @@ impl Service {
tenant_shard_id,
timeline_id,
node,
self.config.jwt_token.clone(),
self.config.pageserver_jwt_token.clone(),
req.clone(),
))
})
@@ -3683,7 +3688,7 @@ impl Service {
tenant_shard_id,
timeline_id,
node,
self.config.jwt_token.clone(),
self.config.pageserver_jwt_token.clone(),
))
})
.await?;
@@ -3757,7 +3762,7 @@ impl Service {
tenant_shard_id,
timeline_id,
node,
self.config.jwt_token.clone(),
self.config.pageserver_jwt_token.clone(),
dir,
))
})
@@ -3872,7 +3877,7 @@ impl Service {
futs.push(async move {
node.with_client_retries(
|client| op(tenant_shard_id, client),
&self.config.jwt_token,
&self.config.pageserver_jwt_token,
warn_threshold,
max_retries,
timeout,
@@ -4121,7 +4126,7 @@ impl Service {
tenant_shard_id,
timeline_id,
node,
self.config.jwt_token.clone(),
self.config.pageserver_jwt_token.clone(),
))
})
.await?;
@@ -4143,7 +4148,7 @@ impl Service {
shard_zero_tid,
timeline_id,
shard_zero_locations.latest.node,
self.config.jwt_token.clone(),
self.config.pageserver_jwt_token.clone(),
)
.await?;
Ok(shard_zero_status)
@@ -4542,7 +4547,7 @@ impl Service {
client.location_config(child_id, config, None, false).await
},
&self.config.jwt_token,
&self.config.pageserver_jwt_token,
1,
10,
Duration::from_secs(5),
@@ -5142,7 +5147,7 @@ impl Service {
let client = PageserverClient::new(
node.get_id(),
node.base_url(),
self.config.jwt_token.as_deref(),
self.config.pageserver_jwt_token.as_deref(),
);
let response = client
.tenant_shard_split(
@@ -5468,7 +5473,7 @@ impl Service {
let client = PageserverClient::new(
node.get_id(),
node.base_url(),
self.config.jwt_token.as_deref(),
self.config.pageserver_jwt_token.as_deref(),
);
let scan_result = client
@@ -7094,7 +7099,7 @@ impl Service {
match attached_node
.with_client_retries(
|client| async move { client.tenant_heatmap_upload(tenant_shard_id).await },
&self.config.jwt_token,
&self.config.pageserver_jwt_token,
3,
10,
SHORT_RECONCILE_TIMEOUT,
@@ -7130,7 +7135,7 @@ impl Service {
)
.await
},
&self.config.jwt_token,
&self.config.pageserver_jwt_token,
3,
10,
SHORT_RECONCILE_TIMEOUT,
@@ -7185,7 +7190,7 @@ impl Service {
let request = request_ref.clone();
client.top_tenant_shards(request.clone()).await
},
&self.config.jwt_token,
&self.config.pageserver_jwt_token,
3,
3,
Duration::from_secs(5),
@@ -7358,7 +7363,7 @@ impl Service {
match node
.with_client_retries(
|client| async move { client.tenant_secondary_status(tenant_shard_id).await },
&self.config.jwt_token,
&self.config.pageserver_jwt_token,
1,
3,
Duration::from_millis(250),