From 0099dfa56b1b24519b4948fe8705006c79b484a7 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 25 Mar 2024 11:52:33 +0000 Subject: [PATCH] storage controller: tighten up secrets handling (#7105) - Remove code for using AWS secrets manager, as we're deploying with k8s->env vars instead - Load each secret independently, so that one can mix CLI args with environment variables, rather than requiring that all secrets are loaded with the same mechanism. - Add a 'strict mode', enabled by default, which will refuse to start if secrets are not loaded. This avoids the risk of accidentally disabling auth by omitting the public key, for example --- Cargo.lock | 24 --- Cargo.toml | 1 - control_plane/attachment_service/Cargo.toml | 1 - control_plane/attachment_service/src/main.rs | 180 ++++++++---------- control_plane/src/storage_controller.rs | 1 + .../fixtures/pageserver/allowed_errors.py | 2 + 6 files changed, 81 insertions(+), 128 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6409c79ef9..45b802c54f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -276,7 +276,6 @@ version = "0.1.0" dependencies = [ "anyhow", "aws-config", - "aws-sdk-secretsmanager", "bytes", "camino", "clap", @@ -433,29 +432,6 @@ dependencies = [ "url", ] -[[package]] -name = "aws-sdk-secretsmanager" -version = "1.14.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a0b64e61e7d632d9df90a2e0f32630c68c24960cab1d27d848718180af883d3" -dependencies = [ - "aws-credential-types", - "aws-runtime", - "aws-smithy-async", - "aws-smithy-http", - "aws-smithy-json", - "aws-smithy-runtime", - "aws-smithy-runtime-api", - "aws-smithy-types", - "aws-types", - "bytes", - "fastrand 2.0.0", - "http 0.2.9", - "once_cell", - "regex-lite", - "tracing", -] - [[package]] name = "aws-sdk-sso" version = "1.12.0" diff --git a/Cargo.toml b/Cargo.toml index 4dda63ff58..309ebbe119 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -52,7 +52,6 @@ async-stream = "0.3" async-trait = "0.1" aws-config = { 
version = "1.1.4", default-features = false, features=["rustls"] } aws-sdk-s3 = "1.14" -aws-sdk-secretsmanager = { version = "1.14.0" } aws-sdk-iam = "1.15.0" aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] } aws-smithy-types = "1.1.4" diff --git a/control_plane/attachment_service/Cargo.toml b/control_plane/attachment_service/Cargo.toml index 34882659e3..0201e0ed86 100644 --- a/control_plane/attachment_service/Cargo.toml +++ b/control_plane/attachment_service/Cargo.toml @@ -16,7 +16,6 @@ testing = [] [dependencies] anyhow.workspace = true aws-config.workspace = true -aws-sdk-secretsmanager.workspace = true bytes.workspace = true camino.workspace = true clap.workspace = true diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs index 0a925a63f6..bd8d7f5c59 100644 --- a/control_plane/attachment_service/src/main.rs +++ b/control_plane/attachment_service/src/main.rs @@ -3,7 +3,6 @@ use attachment_service::http::make_router; use attachment_service::metrics::preinitialize_metrics; use attachment_service::persistence::Persistence; use attachment_service::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT}; -use aws_config::{BehaviorVersion, Region}; use camino::Utf8PathBuf; use clap::Parser; use diesel::Connection; @@ -55,11 +54,31 @@ struct Cli { #[arg(long)] database_url: Option, + /// Flag to enable dev mode, which permits running without auth + #[arg(long, default_value = "false")] + dev: bool, + /// Grace period before marking unresponsive pageserver offline #[arg(long)] max_unavailable_interval: Option, } +enum StrictMode { + /// In strict mode, we will require that all secrets are loaded, i.e. security features + /// may not be implicitly turned off by omitting secrets in the environment. + Strict, + /// In dev mode, secrets are optional, and omitting a particular secret will implicitly + /// disable the auth related to it (e.g. 
no pageserver jwt key -> send unauthenticated + /// requests, no public key -> don't authenticate incoming requests). + Dev, +} + +impl Default for StrictMode { + fn default() -> Self { + Self::Strict + } +} + /// Secrets may either be provided on the command line (for testing), or loaded from AWS SecretManager: this /// type encapsulates the logic to decide which and do the loading. struct Secrets { @@ -70,13 +89,6 @@ struct Secrets { } impl Secrets { - const DATABASE_URL_SECRET: &'static str = "rds-neon-storage-controller-url"; - const PAGESERVER_JWT_TOKEN_SECRET: &'static str = - "neon-storage-controller-pageserver-jwt-token"; - const CONTROL_PLANE_JWT_TOKEN_SECRET: &'static str = - "neon-storage-controller-control-plane-jwt-token"; - const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key"; - const DATABASE_URL_ENV: &'static str = "DATABASE_URL"; const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN"; const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN"; @@ -87,111 +99,41 @@ impl Secrets { /// - Environment variables if DATABASE_URL is set. 
/// - AWS Secrets Manager secrets async fn load(args: &Cli) -> anyhow::Result { - match &args.database_url { - Some(url) => Self::load_cli(url, args), - None => match std::env::var(Self::DATABASE_URL_ENV) { - Ok(database_url) => Self::load_env(database_url), - Err(_) => Self::load_aws_sm().await, - }, - } - } - - fn load_env(database_url: String) -> anyhow::Result { - let public_key = match std::env::var(Self::PUBLIC_KEY_ENV) { - Ok(public_key) => Some(JwtAuth::from_key(public_key).context("Loading public key")?), - Err(_) => None, - }; - Ok(Self { - database_url, - public_key, - jwt_token: std::env::var(Self::PAGESERVER_JWT_TOKEN_ENV).ok(), - control_plane_jwt_token: std::env::var(Self::CONTROL_PLANE_JWT_TOKEN_ENV).ok(), - }) - } - - async fn load_aws_sm() -> anyhow::Result { - let Ok(region) = std::env::var("AWS_REGION") else { - anyhow::bail!("AWS_REGION is not set, cannot load secrets automatically: either set this, or use CLI args to supply secrets"); - }; - let config = aws_config::defaults(BehaviorVersion::v2023_11_09()) - .region(Region::new(region.clone())) - .load() - .await; - - let asm = aws_sdk_secretsmanager::Client::new(&config); - - let Some(database_url) = asm - .get_secret_value() - .secret_id(Self::DATABASE_URL_SECRET) - .send() - .await? - .secret_string() - .map(str::to_string) + let Some(database_url) = + Self::load_secret(&args.database_url, Self::DATABASE_URL_ENV).await else { anyhow::bail!( - "Database URL secret not found at {region}/{}", - Self::DATABASE_URL_SECRET + "Database URL is not set (set `--database-url`, or `DATABASE_URL` environment)" ) }; - let jwt_token = asm - .get_secret_value() - .secret_id(Self::PAGESERVER_JWT_TOKEN_SECRET) - .send() - .await? 
- .secret_string() - .map(str::to_string); - if jwt_token.is_none() { - tracing::warn!("No pageserver JWT token set: this will only work if authentication is disabled on the pageserver"); - } - - let control_plane_jwt_token = asm - .get_secret_value() - .secret_id(Self::CONTROL_PLANE_JWT_TOKEN_SECRET) - .send() - .await? - .secret_string() - .map(str::to_string); - if jwt_token.is_none() { - tracing::warn!("No control plane JWT token set: this will only work if authentication is disabled on the pageserver"); - } - - let public_key = asm - .get_secret_value() - .secret_id(Self::PUBLIC_KEY_SECRET) - .send() - .await? - .secret_string() - .map(str::to_string); - let public_key = match public_key { - Some(key) => Some(JwtAuth::from_key(key)?), - None => { - tracing::warn!( - "No public key set: inccoming HTTP requests will not be authenticated" - ); - None - } + let public_key = match Self::load_secret(&args.public_key, Self::PUBLIC_KEY_ENV).await { + Some(v) => Some(JwtAuth::from_key(v).context("Loading public key")?), + None => None, }; - Ok(Self { + let this = Self { database_url, public_key, - jwt_token, - control_plane_jwt_token, - }) + jwt_token: Self::load_secret(&args.jwt_token, Self::PAGESERVER_JWT_TOKEN_ENV).await, + control_plane_jwt_token: Self::load_secret( + &args.control_plane_jwt_token, + Self::CONTROL_PLANE_JWT_TOKEN_ENV, + ) + .await, + }; + + Ok(this) } - fn load_cli(database_url: &str, args: &Cli) -> anyhow::Result { - let public_key = match &args.public_key { - None => None, - Some(key) => Some(JwtAuth::from_key(key.clone()).context("Loading public key")?), - }; - Ok(Self { - database_url: database_url.to_owned(), - public_key, - jwt_token: args.jwt_token.clone(), - control_plane_jwt_token: args.control_plane_jwt_token.clone(), - }) + async fn load_secret(cli: &Option, env_name: &str) -> Option { + if let Some(v) = cli { + Some(v.clone()) + } else if let Ok(v) = std::env::var(env_name) { + Some(v) + } else { + None + } } } @@ -247,8 +189,42 @@ 
async fn async_main() -> anyhow::Result<()> { args.listen ); + let strict_mode = if args.dev { + StrictMode::Dev + } else { + StrictMode::Strict + }; + let secrets = Secrets::load(&args).await?; + // Validate required secrets and arguments are provided in strict mode + match strict_mode { + StrictMode::Strict + if (secrets.public_key.is_none() + || secrets.jwt_token.is_none() + || secrets.control_plane_jwt_token.is_none()) => + { + // Production systems should always have secrets configured: if public_key was not set + // then we would implicitly disable auth. + anyhow::bail!( + "Insecure config! One or more secrets is not set. This is only permitted in `--dev` mode" + ); + } + StrictMode::Strict if args.compute_hook_url.is_none() => { + // Production systems should always have a compute hook set, to prevent falling + // back to trying to use neon_local. + anyhow::bail!( + "`--compute-hook-url` is not set: this is only permitted in `--dev` mode" + ); + } + StrictMode::Strict => { + tracing::info!("Starting in strict mode: configuration is OK.") + } + StrictMode::Dev => { + tracing::warn!("Starting in dev mode: this may be an insecure configuration.") + } + } + let config = Config { jwt_token: secrets.jwt_token, control_plane_jwt_token: secrets.control_plane_jwt_token, diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index e7697ecac8..7f2b973391 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -279,6 +279,7 @@ impl StorageController { &self.listen, "-p", self.path.as_ref(), + "--dev", "--database-url", &database_url, "--max-unavailable-interval", diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index ec0f81b380..d7f682dad3 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -96,6 +96,8 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [ 
".*Call to node.*management API.*failed.*ReceiveBody.*", # Many tests will start up with a node offline ".*startup_reconcile: Could not scan node.*", + # Tests run in dev mode + ".*Starting in dev mode.*", ]