From d2d9946bab867e111434861e2c3facec0d62f417 Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Tue, 1 Jul 2025 16:47:16 +0400 Subject: [PATCH 01/22] tests: override safekeeper ports in storcon DB (#12410) ## Problem We persist safekeeper host/port in the storcon DB after https://github.com/neondatabase/neon/pull/11712, so the storcon fails to ping safekeepers in the compatibility tests, where we start the cluster from the snapshot. PR also adds some small code improvements related to the test failure. - Closes: https://github.com/neondatabase/neon/issues/12339 ## Summary of changes - Update safekeeper ports in the storcon DB when starting the neon from the dir (snapshot) - Fail the response on all not-success codes (e.g. 3xx). Should not happen, but just to be more safe. - Add `neon_previous/` to .gitignore to make it easier to run compat tests. - Add missing EXPORT to the instruction for running compat tests --- .gitignore | 1 + safekeeper/client/src/mgmt_api.rs | 2 +- test_runner/fixtures/neon_fixtures.py | 12 +++++++++--- test_runner/regress/test_compatibility.py | 2 ++ 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 70c7e96303..6574d7b9de 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ /tmp_check_cli __pycache__/ test_output/ +neon_previous/ .vscode .idea *.swp diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs index 2e46a7b529..b4bb193a4b 100644 --- a/safekeeper/client/src/mgmt_api.rs +++ b/safekeeper/client/src/mgmt_api.rs @@ -52,7 +52,7 @@ pub trait ResponseErrorMessageExt: Sized { impl ResponseErrorMessageExt for reqwest::Response { async fn error_from_body(self) -> Result { let status = self.status(); - if !(status.is_client_error() || status.is_server_error()) { + if status.is_success() { return Ok(self); } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 508e3d8dd2..2031ec132e 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -724,15 +724,21 @@ class NeonEnvBuilder: shutil.copytree(storcon_db_from_dir, storcon_db_to_dir, ignore=ignore_postgres_log) assert not (storcon_db_to_dir / "postgres.log").exists() + # NB: neon_local rewrites postgresql.conf on each start based on neon_local config. No need to patch it. - # However, in this new NeonEnv, the pageservers listen on different ports, and the storage controller - # will currently reject re-attach requests from them because the NodeMetadata isn't identical. + # However, in this new NeonEnv, the pageservers and safekeepers listen on different ports, and the storage + # controller will currently reject re-attach requests from them because the NodeMetadata isn't identical. # So, from_repo_dir patches up the the storcon database. 
patch_script_path = self.repo_dir / "storage_controller_db.startup.sql" assert not patch_script_path.exists() patch_script = "" + for ps in self.env.pageservers: - patch_script += f"UPDATE nodes SET listen_http_port={ps.service_port.http}, listen_pg_port={ps.service_port.pg} WHERE node_id = '{ps.id}';" + patch_script += f"UPDATE nodes SET listen_http_port={ps.service_port.http}, listen_pg_port={ps.service_port.pg} WHERE node_id = '{ps.id}';\n" + + for sk in self.env.safekeepers: + patch_script += f"UPDATE safekeepers SET http_port={sk.port.http}, port={sk.port.pg} WHERE id = '{sk.id}';\n" + patch_script_path.write_text(patch_script) # Update the config with info about tenants and timelines diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 16ab2bb359..a4d2bf8d9b 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -76,6 +76,7 @@ if TYPE_CHECKING: # export CHECK_ONDISK_DATA_COMPATIBILITY=true # export COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE} # export COMPATIBILITY_POSTGRES_DISTRIB_DIR=neon_previous/pg_install +# export COMPATIBILITY_SNAPSHOT_DIR=test_output/compatibility_snapshot_pgv${DEFAULT_PG_VERSION} # # # Build previous version of binaries and store them somewhere: # rm -rf pg_install target @@ -102,6 +103,7 @@ if TYPE_CHECKING: # export CHECK_ONDISK_DATA_COMPATIBILITY=true # export COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE} # export COMPATIBILITY_POSTGRES_DISTRIB_DIR=neon_previous/pg_install +# export COMPATIBILITY_SNAPSHOT_DIR=test_output/compatibility_snapshot_pgv${DEFAULT_PG_VERSION} # export NEON_BIN=target/${BUILD_TYPE} # export POSTGRES_DISTRIB_DIR=pg_install # From 6d73cfa6085b321348a1defeb1f29c76238f3ca6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lassi=20P=C3=B6l=C3=B6nen?= Date: Tue, 1 Jul 2025 15:53:46 +0300 Subject: [PATCH 02/22] Support audit syslog over TLS (#12124) Add support to transport syslogs over TLS. Since TLS params essentially require passing host and port separately, add a boolean flag to the configuration template and also use the same `action` format for plaintext logs. This allows seamless transition. The plaintext host:port is picked from `AUDIT_LOGGING_ENDPOINT` (as earlier) and from `AUDIT_LOGGING_TLS_ENDPOINT`. The TLS host:port is used when defined and non-empty. `remote_endpoint` is split separately to hostname and port as required by `omfwd` module. Also the address parsing and config content generation are split to more testable functions with basic tests added. 
--- Cargo.lock | 7 + compute/compute-node.Dockerfile | 2 +- compute_tools/Cargo.toml | 1 + compute_tools/src/compute.rs | 14 +- .../compute_audit_rsyslog_template.conf | 36 ++- compute_tools/src/rsyslog.rs | 224 +++++++++++++++++- 6 files changed, 270 insertions(+), 14 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e640e62909..4c9cfa97e1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1305,6 +1305,7 @@ dependencies = [ "fail", "flate2", "futures", + "hostname-validator", "http 1.1.0", "indexmap 2.9.0", "itertools 0.10.5", @@ -2771,6 +2772,12 @@ dependencies = [ "windows", ] +[[package]] +name = "hostname-validator" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f558a64ac9af88b5ba400d99b579451af0d39c6d360980045b91aac966d705e2" + [[package]] name = "http" version = "0.2.9" diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index bce2a28b8b..111e64d5d1 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1983,7 +1983,7 @@ RUN apt update && \ locales \ lsof \ procps \ - rsyslog \ + rsyslog-gnutls \ screen \ tcpdump \ $VERSION_INSTALLS && \ diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 0a071c1ad1..1a03022d89 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -27,6 +27,7 @@ fail.workspace = true flate2.workspace = true futures.workspace = true http.workspace = true +hostname-validator = "1.1" indexmap.workspace = true itertools.workspace = true jsonwebtoken.workspace = true diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 9dcd4fc17c..fae76579d8 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -759,10 +759,15 @@ impl ComputeNode { // Configure and start rsyslog for compliance audit logging match pspec.spec.audit_log_level { ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => { - let remote_endpoint = + let remote_tls_endpoint = + std::env::var("AUDIT_LOGGING_TLS_ENDPOINT").unwrap_or("".to_string()); + let remote_plain_endpoint = std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string()); - if remote_endpoint.is_empty() { - anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty"); + + if remote_plain_endpoint.is_empty() && remote_tls_endpoint.is_empty() { + anyhow::bail!( + "AUDIT_LOGGING_ENDPOINT and AUDIT_LOGGING_TLS_ENDPOINT are both empty" + ); } let log_directory_path = Path::new(&self.params.pgdata).join("log"); @@ -778,7 +783,8 @@ impl ComputeNode { log_directory_path.clone(), endpoint_id, project_id, - &remote_endpoint, + &remote_plain_endpoint, + &remote_tls_endpoint, )?; // Launch a background task to clean up the audit logs diff --git a/compute_tools/src/config_template/compute_audit_rsyslog_template.conf b/compute_tools/src/config_template/compute_audit_rsyslog_template.conf index 48b1a6f5c3..f072f652cf 100644 --- a/compute_tools/src/config_template/compute_audit_rsyslog_template.conf +++ b/compute_tools/src/config_template/compute_audit_rsyslog_template.conf @@ -10,7 +10,13 @@ input(type="imfile" File="{log_directory}/*.log" startmsg.regex="^[[:digit:]]{{4}}-[[:digit:]]{{2}}-[[:digit:]]{{2}} [[:digit:]]{{2}}:[[:digit:]]{{2}}:[[:digit:]]{{2}}.[[:digit:]]{{3}} GMT,") # the directory to store rsyslog state files -global(workDirectory="/var/log/rsyslog") +global( + workDirectory="/var/log/rsyslog" + DefaultNetstreamDriverCAFile="/etc/ssl/certs/ca-certificates.crt" +) + +# Whether the remote syslog receiver uses tls +set $.remote_syslog_tls = 
"{remote_syslog_tls}"; # Construct json, endpoint_id and project_id as additional metadata set $.json_log!endpoint_id = "{endpoint_id}"; @@ -21,5 +27,29 @@ set $.json_log!msg = $msg; template(name="PgAuditLog" type="string" string="<%PRI%>1 %TIMESTAMP:::date-rfc3339% %HOSTNAME% - - - - %$.json_log%") -# Forward to remote syslog receiver (@@:;format -local5.info @@{remote_endpoint};PgAuditLog +# Forward to remote syslog receiver (over TLS) +if ( $syslogtag == 'pgaudit_log' ) then {{ + if ( $.remote_syslog_tls == 'true' ) then {{ + action(type="omfwd" target="{remote_syslog_host}" port="{remote_syslog_port}" protocol="tcp" + template="PgAuditLog" + queue.type="linkedList" + queue.size="1000" + action.ResumeRetryCount="10" + StreamDriver="gtls" + StreamDriverMode="1" + StreamDriverAuthMode="x509/name" + StreamDriverPermittedPeers="{remote_syslog_host}" + StreamDriver.CheckExtendedKeyPurpose="on" + StreamDriver.PermitExpiredCerts="off" + ) + stop + }} else {{ + action(type="omfwd" target="{remote_syslog_host}" port="{remote_syslog_port}" protocol="tcp" + template="PgAuditLog" + queue.type="linkedList" + queue.size="1000" + action.ResumeRetryCount="10" + ) + stop + }} +}} diff --git a/compute_tools/src/rsyslog.rs b/compute_tools/src/rsyslog.rs index 3bc2e72b19..3ced0a5654 100644 --- a/compute_tools/src/rsyslog.rs +++ b/compute_tools/src/rsyslog.rs @@ -4,8 +4,10 @@ use std::path::Path; use std::process::Command; use std::time::Duration; use std::{fs::OpenOptions, io::Write}; +use url::{Host, Url}; use anyhow::{Context, Result, anyhow}; +use hostname_validator; use tracing::{error, info, instrument, warn}; const POSTGRES_LOGS_CONF_PATH: &str = "/etc/rsyslog.d/postgres_logs.conf"; @@ -82,18 +84,84 @@ fn restart_rsyslog() -> Result<()> { Ok(()) } +fn parse_audit_syslog_address( + remote_plain_endpoint: &str, + remote_tls_endpoint: &str, +) -> Result<(String, u16, String)> { + let tls; + let remote_endpoint = if !remote_tls_endpoint.is_empty() { + tls = "true".to_string(); + remote_tls_endpoint + } else { + tls = "false".to_string(); + remote_plain_endpoint + }; + // Urlify the remote_endpoint, so parsing can be done with url::Url. 
+ let url_str = format!("http://{remote_endpoint}"); + let url = Url::parse(&url_str).map_err(|err| { + anyhow!("Error parsing {remote_endpoint}, expected host:port, got {err:?}") + })?; + + let is_valid = url.scheme() == "http" + && url.path() == "/" + && url.query().is_none() + && url.fragment().is_none() + && url.username() == "" + && url.password().is_none(); + + if !is_valid { + return Err(anyhow!( + "Invalid address format {remote_endpoint}, expected host:port" + )); + } + let host = match url.host() { + Some(Host::Domain(h)) if hostname_validator::is_valid(h) => h.to_string(), + Some(Host::Ipv4(ip4)) => ip4.to_string(), + Some(Host::Ipv6(ip6)) => ip6.to_string(), + _ => return Err(anyhow!("Invalid host")), + }; + let port = url + .port() + .ok_or_else(|| anyhow!("Invalid port in {remote_endpoint}"))?; + + Ok((host, port, tls)) +} + +fn generate_audit_rsyslog_config( + log_directory: String, + endpoint_id: &str, + project_id: &str, + remote_syslog_host: &str, + remote_syslog_port: u16, + remote_syslog_tls: &str, +) -> String { + format!( + include_str!("config_template/compute_audit_rsyslog_template.conf"), + log_directory = log_directory, + endpoint_id = endpoint_id, + project_id = project_id, + remote_syslog_host = remote_syslog_host, + remote_syslog_port = remote_syslog_port, + remote_syslog_tls = remote_syslog_tls + ) +} + pub fn configure_audit_rsyslog( log_directory: String, endpoint_id: &str, project_id: &str, remote_endpoint: &str, + remote_tls_endpoint: &str, ) -> Result<()> { - let config_content: String = format!( - include_str!("config_template/compute_audit_rsyslog_template.conf"), - log_directory = log_directory, - endpoint_id = endpoint_id, - project_id = project_id, - remote_endpoint = remote_endpoint + let (remote_syslog_host, remote_syslog_port, remote_syslog_tls) = + parse_audit_syslog_address(remote_endpoint, remote_tls_endpoint).unwrap(); + let config_content = generate_audit_rsyslog_config( + log_directory, + endpoint_id, + project_id, + &remote_syslog_host, + remote_syslog_port, + &remote_syslog_tls, ); info!("rsyslog config_content: {}", config_content); @@ -258,6 +326,8 @@ pub fn launch_pgaudit_gc(log_directory: String) { mod tests { use crate::rsyslog::PostgresLogsRsyslogConfig; + use super::{generate_audit_rsyslog_config, parse_audit_syslog_address}; + #[test] fn test_postgres_logs_config() { { @@ -287,4 +357,146 @@ mod tests { assert!(res.is_err()); } } + + #[test] + fn test_parse_audit_syslog_address() { + { + // host:port format (plaintext) + let parsed = parse_audit_syslog_address("collector.host.tld:5555", ""); + assert!(parsed.is_ok()); + assert_eq!( + parsed.unwrap(), + ( + String::from("collector.host.tld"), + 5555, + String::from("false") + ) + ); + } + + { + // host:port format with ipv4 ip address (plaintext) + let parsed = parse_audit_syslog_address("10.0.0.1:5555", ""); + assert!(parsed.is_ok()); + assert_eq!( + parsed.unwrap(), + (String::from("10.0.0.1"), 5555, String::from("false")) + ); + } + + { + // host:port format with ipv6 ip address (plaintext) + let parsed = + parse_audit_syslog_address("[7e60:82ed:cb2e:d617:f904:f395:aaca:e252]:5555", ""); + assert_eq!( + parsed.unwrap(), + ( + String::from("7e60:82ed:cb2e:d617:f904:f395:aaca:e252"), + 5555, + String::from("false") + ) + ); + } + + { + // Only TLS host:port defined + let parsed = parse_audit_syslog_address("", "tls.host.tld:5556"); + assert_eq!( + parsed.unwrap(), + (String::from("tls.host.tld"), 5556, String::from("true")) + ); + } + + { + // tls host should take precedence, when 
both defined + let parsed = parse_audit_syslog_address("plaintext.host.tld:5555", "tls.host.tld:5556"); + assert_eq!( + parsed.unwrap(), + (String::from("tls.host.tld"), 5556, String::from("true")) + ); + } + + { + // host without port (plaintext) + let parsed = parse_audit_syslog_address("collector.host.tld", ""); + assert!(parsed.is_err()); + } + + { + // port without host + let parsed = parse_audit_syslog_address(":5555", ""); + assert!(parsed.is_err()); + } + + { + // valid host with invalid port + let parsed = parse_audit_syslog_address("collector.host.tld:90001", ""); + assert!(parsed.is_err()); + } + + { + // invalid hostname with valid port + let parsed = parse_audit_syslog_address("-collector.host.tld:5555", ""); + assert!(parsed.is_err()); + } + + { + // parse error + let parsed = parse_audit_syslog_address("collector.host.tld:::5555", ""); + assert!(parsed.is_err()); + } + } + + #[test] + fn test_generate_audit_rsyslog_config() { + { + // plaintext version + let log_directory = "/tmp/log".to_string(); + let endpoint_id = "ep-test-endpoint-id"; + let project_id = "test-project-id"; + let remote_syslog_host = "collector.host.tld"; + let remote_syslog_port = 5555; + let remote_syslog_tls = "false"; + + let conf_str = generate_audit_rsyslog_config( + log_directory, + endpoint_id, + project_id, + remote_syslog_host, + remote_syslog_port, + remote_syslog_tls, + ); + + assert!(conf_str.contains(r#"set $.remote_syslog_tls = "false";"#)); + assert!(conf_str.contains(r#"type="omfwd""#)); + assert!(conf_str.contains(r#"target="collector.host.tld""#)); + assert!(conf_str.contains(r#"port="5555""#)); + assert!(conf_str.contains(r#"StreamDriverPermittedPeers="collector.host.tld""#)); + } + + { + // TLS version + let log_directory = "/tmp/log".to_string(); + let endpoint_id = "ep-test-endpoint-id"; + let project_id = "test-project-id"; + let remote_syslog_host = "collector.host.tld"; + let remote_syslog_port = 5556; + let remote_syslog_tls = "true"; + + let conf_str = generate_audit_rsyslog_config( + log_directory, + endpoint_id, + project_id, + remote_syslog_host, + remote_syslog_port, + remote_syslog_tls, + ); + + assert!(conf_str.contains(r#"set $.remote_syslog_tls = "true";"#)); + assert!(conf_str.contains(r#"type="omfwd""#)); + assert!(conf_str.contains(r#"target="collector.host.tld""#)); + assert!(conf_str.contains(r#"port="5556""#)); + assert!(conf_str.contains(r#"StreamDriverPermittedPeers="collector.host.tld""#)); + } + } } From 4932963bac31bac1f76ec6b39002e39c98292047 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 1 Jul 2025 14:03:34 +0100 Subject: [PATCH 03/22] [proxy]: dont log user errors from postgres (#12412) ## Problem #8843 User initiated sql queries are being classified as "postgres" errors, whereas they're really user errors. ## Summary of changes Classify user-initiated postgres errors as user errors if they are related to a sql query that we ran on their behalf. Do not log those errors. 
--- proxy/src/error.rs | 10 ------ proxy/src/serverless/backend.rs | 10 +++++- proxy/src/serverless/sql_over_http.rs | 48 ++++++++++++++++++++++----- 3 files changed, 49 insertions(+), 19 deletions(-) diff --git a/proxy/src/error.rs b/proxy/src/error.rs index aa02b211d9..e880d63075 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -78,16 +78,6 @@ pub(crate) trait ReportableError: fmt::Display + Send + 'static { fn get_error_kind(&self) -> ErrorKind; } -impl ReportableError for postgres_client::error::Error { - fn get_error_kind(&self) -> ErrorKind { - if self.as_db_error().is_some() { - ErrorKind::Postgres - } else { - ErrorKind::Compute - } - } -} - /// Flattens `Result>` into `Result`. pub fn flatten_err(r: Result, JoinError>) -> anyhow::Result { r.context("join error").and_then(|x| x) diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 26269d0a6e..7708342ae3 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -404,7 +404,15 @@ impl ReportableError for HttpConnError { fn get_error_kind(&self) -> ErrorKind { match self { HttpConnError::ConnectionClosedAbruptly(_) => ErrorKind::Compute, - HttpConnError::PostgresConnectionError(p) => p.get_error_kind(), + HttpConnError::PostgresConnectionError(p) => { + if p.as_db_error().is_some() { + // postgres rejected the connection + ErrorKind::Postgres + } else { + // couldn't even reach postgres + ErrorKind::Compute + } + } HttpConnError::LocalProxyConnectionError(_) => ErrorKind::Compute, HttpConnError::ComputeCtl(_) => ErrorKind::Service, HttpConnError::JwtPayloadError(_) => ErrorKind::User, diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index b2eb801f5c..5d5e7bf83e 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -22,7 +22,7 @@ use serde_json::Value; use serde_json::value::RawValue; use tokio::time::{self, Instant}; use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info}; +use tracing::{Level, debug, error, info}; use typed_json::json; use url::Url; use uuid::Uuid; @@ -390,12 +390,35 @@ pub(crate) async fn handle( let line = get(db_error, |db| db.line().map(|l| l.to_string())); let routine = get(db_error, |db| db.routine()); - tracing::info!( - kind=error_kind.to_metric_label(), - error=%e, - msg=message, - "forwarding error to user" - ); + match &e { + SqlOverHttpError::Postgres(e) + if e.as_db_error().is_some() && error_kind == ErrorKind::User => + { + // this error contains too much info, and it's not an error we care about. + if tracing::enabled!(Level::DEBUG) { + tracing::debug!( + kind=error_kind.to_metric_label(), + error=%e, + msg=message, + "forwarding error to user" + ); + } else { + tracing::info!( + kind = error_kind.to_metric_label(), + error = "bad query", + "forwarding error to user" + ); + } + } + _ => { + tracing::info!( + kind=error_kind.to_metric_label(), + error=%e, + msg=message, + "forwarding error to user" + ); + } + } json_response( e.get_http_status_code(), @@ -460,7 +483,15 @@ impl ReportableError for SqlOverHttpError { SqlOverHttpError::ConnInfo(e) => e.get_error_kind(), SqlOverHttpError::ResponseTooLarge(_) => ErrorKind::User, SqlOverHttpError::InvalidIsolationLevel => ErrorKind::User, - SqlOverHttpError::Postgres(p) => p.get_error_kind(), + // customer initiated SQL errors. + SqlOverHttpError::Postgres(p) => { + if p.as_db_error().is_some() { + ErrorKind::User + } else { + ErrorKind::Compute + } + } + // proxy initiated SQL errors. 
SqlOverHttpError::InternalPostgres(p) => { if p.as_db_error().is_some() { ErrorKind::Service @@ -468,6 +499,7 @@ impl ReportableError for SqlOverHttpError { ErrorKind::Compute } } + // postgres returned a bad row format that we couldn't parse. SqlOverHttpError::JsonConversion(_) => ErrorKind::Postgres, SqlOverHttpError::Cancelled(c) => c.get_error_kind(), } From 0934ce9bcecd21c180ad7dae3f3f4028e9f90127 Mon Sep 17 00:00:00 2001 From: Suhas Thalanki <54014218+thesuhas@users.noreply.github.com> Date: Tue, 1 Jul 2025 11:33:23 -0400 Subject: [PATCH 04/22] compute: metrics for autovacuum (mxid, postgres) (#12294) ## Problem Currently we do not have metrics for autovacuum. ## Summary of changes Added a metric that extracts the top 5 DBs with oldest mxid and frozen xid. Tables that were vacuumed recently should have younger value (or younger age). Related Issue: https://github.com/neondatabase/cloud/issues/27296 --- compute/etc/neon_collector.jsonnet | 2 ++ .../compute_pg_oldest_frozen_xid_age.libsonnet | 13 +++++++++++++ .../compute_pg_oldest_frozen_xid_age.sql | 4 ++++ .../compute_pg_oldest_mxid_age.libsonnet | 13 +++++++++++++ .../etc/sql_exporter/compute_pg_oldest_mxid_age.sql | 4 ++++ 5 files changed, 36 insertions(+) create mode 100644 compute/etc/sql_exporter/compute_pg_oldest_frozen_xid_age.libsonnet create mode 100644 compute/etc/sql_exporter/compute_pg_oldest_frozen_xid_age.sql create mode 100644 compute/etc/sql_exporter/compute_pg_oldest_mxid_age.libsonnet create mode 100644 compute/etc/sql_exporter/compute_pg_oldest_mxid_age.sql diff --git a/compute/etc/neon_collector.jsonnet b/compute/etc/neon_collector.jsonnet index e64d907fe4..b712631d71 100644 --- a/compute/etc/neon_collector.jsonnet +++ b/compute/etc/neon_collector.jsonnet @@ -8,6 +8,8 @@ import 'sql_exporter/compute_logical_snapshot_files.libsonnet', import 'sql_exporter/compute_logical_snapshots_bytes.libsonnet', import 'sql_exporter/compute_max_connections.libsonnet', + import 'sql_exporter/compute_pg_oldest_frozen_xid_age.libsonnet', + import 'sql_exporter/compute_pg_oldest_mxid_age.libsonnet', import 'sql_exporter/compute_receive_lsn.libsonnet', import 'sql_exporter/compute_subscriptions_count.libsonnet', import 'sql_exporter/connection_counts.libsonnet', diff --git a/compute/etc/sql_exporter/compute_pg_oldest_frozen_xid_age.libsonnet b/compute/etc/sql_exporter/compute_pg_oldest_frozen_xid_age.libsonnet new file mode 100644 index 0000000000..03d5cf860f --- /dev/null +++ b/compute/etc/sql_exporter/compute_pg_oldest_frozen_xid_age.libsonnet @@ -0,0 +1,13 @@ +{ + metric_name: 'compute_pg_oldest_frozen_xid_age', + type: 'gauge', + help: 'Age of oldest XIDs that have not been frozen by VACUUM. 
An indicator of how long it has been since VACUUM last ran.', + key_labels: [ + 'database_name', + ], + value_label: 'metric', + values: [ + 'frozen_xid_age', + ], + query: importstr 'sql_exporter/compute_pg_oldest_frozen_xid_age.sql', +} diff --git a/compute/etc/sql_exporter/compute_pg_oldest_frozen_xid_age.sql b/compute/etc/sql_exporter/compute_pg_oldest_frozen_xid_age.sql new file mode 100644 index 0000000000..d2281fdd42 --- /dev/null +++ b/compute/etc/sql_exporter/compute_pg_oldest_frozen_xid_age.sql @@ -0,0 +1,4 @@ +SELECT datname database_name, + age(datfrozenxid) frozen_xid_age +FROM pg_database +ORDER BY frozen_xid_age DESC LIMIT 10; diff --git a/compute/etc/sql_exporter/compute_pg_oldest_mxid_age.libsonnet b/compute/etc/sql_exporter/compute_pg_oldest_mxid_age.libsonnet new file mode 100644 index 0000000000..12063a0f71 --- /dev/null +++ b/compute/etc/sql_exporter/compute_pg_oldest_mxid_age.libsonnet @@ -0,0 +1,13 @@ +{ + metric_name: 'compute_pg_oldest_mxid_age', + type: 'gauge', + help: 'Age of oldest MXIDs that have not been replaced by VACUUM. An indicator of how long it has been since VACUUM last ran.', + key_labels: [ + 'database_name', + ], + value_label: 'metric', + values: [ + 'min_mxid_age', + ], + query: importstr 'sql_exporter/compute_pg_oldest_mxid_age.sql', +} diff --git a/compute/etc/sql_exporter/compute_pg_oldest_mxid_age.sql b/compute/etc/sql_exporter/compute_pg_oldest_mxid_age.sql new file mode 100644 index 0000000000..ed57894b3a --- /dev/null +++ b/compute/etc/sql_exporter/compute_pg_oldest_mxid_age.sql @@ -0,0 +1,4 @@ +SELECT datname database_name, + mxid_age(datminmxid) min_mxid_age +FROM pg_database +ORDER BY min_mxid_age DESC LIMIT 10; From bbcd70eab375c3d524fef74790653b995456bfd1 Mon Sep 17 00:00:00 2001 From: Suhas Thalanki <54014218+thesuhas@users.noreply.github.com> Date: Tue, 1 Jul 2025 12:50:27 -0400 Subject: [PATCH 05/22] Dynamic Masking Support for `anon` v2 (#11733) ## Problem This PR works on adding dynamic masking support for `anon` v2. It currently only supports static masking. ## Summary of changes Added a security definer function that sets the dynamic masking guc to `true` with superuser permissions. Added a security definer function that adds `anon` to `session_preload_libraries` if it's not already present. Related to: https://github.com/neondatabase/cloud/issues/20456 --- compute/patches/anon_v2.patch | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/compute/patches/anon_v2.patch b/compute/patches/anon_v2.patch index e833a6dfd3..4faf927e39 100644 --- a/compute/patches/anon_v2.patch +++ b/compute/patches/anon_v2.patch @@ -1,8 +1,8 @@ diff --git a/sql/anon.sql b/sql/anon.sql -index 0cdc769..f6cc950 100644 +index 0cdc769..b450327 100644 --- a/sql/anon.sql +++ b/sql/anon.sql -@@ -1141,3 +1141,8 @@ $$ +@@ -1141,3 +1141,15 @@ $$ -- TODO : https://en.wikipedia.org/wiki/L-diversity -- TODO : https://en.wikipedia.org/wiki/T-closeness @@ -11,6 +11,13 @@ index 0cdc769..f6cc950 100644 + +GRANT ALL ON SCHEMA anon to neon_superuser; +GRANT ALL ON ALL TABLES IN SCHEMA anon TO neon_superuser; ++ ++DO $$ ++BEGIN ++ IF current_setting('server_version_num')::int >= 150000 THEN ++ GRANT SET ON PARAMETER anon.transparent_dynamic_masking TO neon_superuser; ++ END IF; ++END $$; diff --git a/sql/init.sql b/sql/init.sql index 7da6553..9b6164b 100644 --- a/sql/init.sql From 3815e3b2b5809612ce335333134a3fd32317a0e4 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." 
<4198311+skyzh@users.noreply.github.com> Date: Tue, 1 Jul 2025 09:58:41 -0700 Subject: [PATCH 06/22] feat(pageserver): reduce lock contention in l0 compaction (#12360) ## Problem L0 compaction currently holds the read lock for a long region while it doesn't need to. ## Summary of changes This patch reduces the one long contention region into 2 short ones: gather the layers to compact at the beginning, and several short read locks when querying the image coverage. Co-Authored-By: Chen Luo --------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline/compaction.rs | 59 ++++++++++---------- 1 file changed, 28 insertions(+), 31 deletions(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 04852fb721..13a4f82607 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -9,7 +9,7 @@ use std::ops::{Deref, Range}; use std::sync::Arc; use std::time::{Duration, Instant}; -use super::layer_manager::{LayerManagerLockHolder, LayerManagerReadGuard}; +use super::layer_manager::LayerManagerLockHolder; use super::{ CompactFlags, CompactOptions, CompactionError, CreateImageLayersError, DurationRecorder, GetVectoredError, ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration, @@ -1779,20 +1779,14 @@ impl Timeline { } = { let phase1_span = info_span!("compact_level0_phase1"); let ctx = ctx.attached_child(); - let mut stats = CompactLevel0Phase1StatsBuilder { + let stats = CompactLevel0Phase1StatsBuilder { version: Some(2), tenant_id: Some(self.tenant_shard_id), timeline_id: Some(self.timeline_id), ..Default::default() }; - let begin = tokio::time::Instant::now(); - let phase1_layers_locked = self.layers.read(LayerManagerLockHolder::Compaction).await; - let now = tokio::time::Instant::now(); - stats.read_lock_acquisition_micros = - DurationRecorder::Recorded(RecordedDuration(now - begin), now); self.compact_level0_phase1( - phase1_layers_locked, stats, target_file_size, force_compaction_ignore_threshold, @@ -1813,16 +1807,19 @@ impl Timeline { } /// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment. - async fn compact_level0_phase1<'a>( - self: &'a Arc, - guard: LayerManagerReadGuard<'a>, + async fn compact_level0_phase1( + self: &Arc, mut stats: CompactLevel0Phase1StatsBuilder, target_file_size: u64, force_compaction_ignore_threshold: bool, ctx: &RequestContext, ) -> Result { - stats.read_lock_held_spawn_blocking_startup_micros = - stats.read_lock_acquisition_micros.till_now(); // set by caller + let begin = tokio::time::Instant::now(); + let guard = self.layers.read(LayerManagerLockHolder::Compaction).await; + let now = tokio::time::Instant::now(); + stats.read_lock_acquisition_micros = + DurationRecorder::Recorded(RecordedDuration(now - begin), now); + let layers = guard.layer_map()?; let level0_deltas = layers.level0_deltas(); stats.level0_deltas_count = Some(level0_deltas.len()); @@ -1857,6 +1854,12 @@ impl Timeline { .map(|x| guard.get_from_desc(x)) .collect::>(); + drop_layer_manager_rlock(guard); + + // The is the last LSN that we have seen for L0 compaction in the timeline. This LSN might be updated + // by the time we finish the compaction. So we need to get it here. + let l0_last_record_lsn = self.get_last_record_lsn(); + // Gather the files to compact in this iteration. 
// // Start with the oldest Level 0 delta file, and collect any other @@ -1944,9 +1947,7 @@ impl Timeline { // we don't accidentally use it later in the function. drop(level0_deltas); - stats.read_lock_held_prerequisites_micros = stats - .read_lock_held_spawn_blocking_startup_micros - .till_now(); + stats.compaction_prerequisites_micros = stats.read_lock_acquisition_micros.till_now(); // TODO: replace with streaming k-merge let all_keys = { @@ -1968,7 +1969,7 @@ impl Timeline { all_keys }; - stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now(); + stats.read_lock_held_key_sort_micros = stats.compaction_prerequisites_micros.till_now(); // Determine N largest holes where N is number of compacted layers. The vec is sorted by key range start. // @@ -2002,7 +2003,6 @@ impl Timeline { } } let max_holes = deltas_to_compact.len(); - let last_record_lsn = self.get_last_record_lsn(); let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128; let min_hole_coverage_size = 3; // TODO: something more flexible? // min-heap (reserve space for one more element added before eviction) @@ -2021,8 +2021,12 @@ impl Timeline { // has not so much sense, because largest holes will corresponds field1/field2 changes. // But we are mostly interested to eliminate holes which cause generation of excessive image layers. // That is why it is better to measure size of hole as number of covering image layers. - let coverage_size = - layers.image_coverage(&key_range, last_record_lsn).len(); + let coverage_size = { + // TODO: optimize this with copy-on-write layer map. + let guard = self.layers.read(LayerManagerLockHolder::Compaction).await; + let layers = guard.layer_map()?; + layers.image_coverage(&key_range, l0_last_record_lsn).len() + }; if coverage_size >= min_hole_coverage_size { heap.push(Hole { key_range, @@ -2041,7 +2045,6 @@ impl Timeline { holes }; stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now(); - drop_layer_manager_rlock(guard); if self.cancel.is_cancelled() { return Err(CompactionError::ShuttingDown); @@ -2382,9 +2385,8 @@ struct CompactLevel0Phase1StatsBuilder { tenant_id: Option, timeline_id: Option, read_lock_acquisition_micros: DurationRecorder, - read_lock_held_spawn_blocking_startup_micros: DurationRecorder, read_lock_held_key_sort_micros: DurationRecorder, - read_lock_held_prerequisites_micros: DurationRecorder, + compaction_prerequisites_micros: DurationRecorder, read_lock_held_compute_holes_micros: DurationRecorder, read_lock_drop_micros: DurationRecorder, write_layer_files_micros: DurationRecorder, @@ -2399,9 +2401,8 @@ struct CompactLevel0Phase1Stats { tenant_id: TenantShardId, timeline_id: TimelineId, read_lock_acquisition_micros: RecordedDuration, - read_lock_held_spawn_blocking_startup_micros: RecordedDuration, read_lock_held_key_sort_micros: RecordedDuration, - read_lock_held_prerequisites_micros: RecordedDuration, + compaction_prerequisites_micros: RecordedDuration, read_lock_held_compute_holes_micros: RecordedDuration, read_lock_drop_micros: RecordedDuration, write_layer_files_micros: RecordedDuration, @@ -2426,16 +2427,12 @@ impl TryFrom for CompactLevel0Phase1Stats { .read_lock_acquisition_micros .into_recorded() .ok_or_else(|| anyhow!("read_lock_acquisition_micros not set"))?, - read_lock_held_spawn_blocking_startup_micros: value - .read_lock_held_spawn_blocking_startup_micros - .into_recorded() - .ok_or_else(|| anyhow!("read_lock_held_spawn_blocking_startup_micros not set"))?, 
read_lock_held_key_sort_micros: value .read_lock_held_key_sort_micros .into_recorded() .ok_or_else(|| anyhow!("read_lock_held_key_sort_micros not set"))?, - read_lock_held_prerequisites_micros: value - .read_lock_held_prerequisites_micros + compaction_prerequisites_micros: value + .compaction_prerequisites_micros .into_recorded() .ok_or_else(|| anyhow!("read_lock_held_prerequisites_micros not set"))?, read_lock_held_compute_holes_micros: value From b254dce8a1564af3eed5fea02c2511c9b40d435f Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 1 Jul 2025 10:00:27 -0700 Subject: [PATCH 07/22] feat(pageserver): report compaction progress (#12401) ## Problem close https://github.com/neondatabase/neon/issues/11528 ## Summary of changes Gives us better observability of compaction progress. - Image creation: num of partition processed / total partition - Gc-compaction: index of the in the queue / total items for a full compaction - Shard ancestor compaction: layers to rewrite / total layers Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline.rs | 18 +++++++++++--- pageserver/src/tenant/timeline/compaction.rs | 26 ++++++++++++++------ 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 08bc6d4a59..aca44718fa 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -5308,6 +5308,7 @@ impl Timeline { ctx: &RequestContext, img_range: Range, io_concurrency: IoConcurrency, + progress: Option<(usize, usize)>, ) -> Result { let mut wrote_keys = false; @@ -5384,11 +5385,15 @@ impl Timeline { } } + let progress_report = progress + .map(|(idx, total)| format!("({idx}/{total}) ")) + .unwrap_or_default(); if wrote_keys { // Normal path: we have written some data into the new image layer for this // partition, so flush it to disk. info!( - "produced image layer for rel {}", + "{} produced image layer for rel {}", + progress_report, ImageLayerName { key_range: img_range.clone(), lsn @@ -5398,7 +5403,12 @@ impl Timeline { unfinished_image_layer: image_layer_writer, }) } else { - tracing::debug!("no data in range {}-{}", img_range.start, img_range.end); + tracing::debug!( + "{} no data in range {}-{}", + progress_report, + img_range.start, + img_range.end + ); Ok(ImageLayerCreationOutcome::Empty) } } @@ -5633,7 +5643,8 @@ impl Timeline { } } - for partition in partition_parts.iter() { + let total = partition_parts.len(); + for (idx, partition) in partition_parts.iter().enumerate() { if self.cancel.is_cancelled() { return Err(CreateImageLayersError::Cancelled); } @@ -5718,6 +5729,7 @@ impl Timeline { ctx, img_range.clone(), io_concurrency, + Some((idx, total)), ) .await? } else { diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 13a4f82607..43573c28a2 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -101,7 +101,11 @@ pub enum GcCompactionQueueItem { /// Whether the compaction is triggered automatically (determines whether we need to update L2 LSN) auto: bool, }, - SubCompactionJob(CompactOptions), + SubCompactionJob { + i: usize, + total: usize, + options: CompactOptions, + }, Notify(GcCompactionJobId, Option), } @@ -163,7 +167,7 @@ impl GcCompactionQueueItem { running, job_id: id.0, }), - GcCompactionQueueItem::SubCompactionJob(options) => Some(CompactInfoResponse { + GcCompactionQueueItem::SubCompactionJob { options, .. 
} => Some(CompactInfoResponse { compact_key_range: options.compact_key_range, compact_lsn_range: options.compact_lsn_range, sub_compaction: options.sub_compaction, @@ -489,7 +493,7 @@ impl GcCompactionQueue { .map(|job| job.compact_lsn_range.end) .max() .unwrap(); - for job in jobs { + for (i, job) in jobs.into_iter().enumerate() { // Unfortunately we need to convert the `GcCompactJob` back to `CompactionOptions` // until we do further refactors to allow directly call `compact_with_gc`. let mut flags: EnumSet = EnumSet::default(); @@ -507,7 +511,11 @@ impl GcCompactionQueue { compact_lsn_range: Some(job.compact_lsn_range.into()), sub_compaction_max_job_size_mb: None, }; - pending_tasks.push(GcCompactionQueueItem::SubCompactionJob(options)); + pending_tasks.push(GcCompactionQueueItem::SubCompactionJob { + options, + i, + total: jobs_len, + }); } if !auto { @@ -651,7 +659,7 @@ impl GcCompactionQueue { } } } - GcCompactionQueueItem::SubCompactionJob(options) => { + GcCompactionQueueItem::SubCompactionJob { options, i, total } => { // TODO: error handling, clear the queue if any task fails? let _gc_guard = match gc_block.start().await { Ok(guard) => guard, @@ -663,6 +671,7 @@ impl GcCompactionQueue { ))); } }; + info!("running gc-compaction subcompaction job {}/{}", i, total); let res = timeline.compact_with_options(cancel, options, ctx).await; let compaction_result = match res { Ok(res) => res, @@ -1591,13 +1600,15 @@ impl Timeline { let started = Instant::now(); let mut replace_image_layers = Vec::new(); + let total = layers_to_rewrite.len(); - for layer in layers_to_rewrite { + for (i, layer) in layers_to_rewrite.into_iter().enumerate() { if self.cancel.is_cancelled() { return Err(CompactionError::ShuttingDown); } - info!(layer=%layer, "rewriting layer after shard split"); + info!(layer=%layer, "rewriting layer after shard split: {}/{}", i, total); + let mut image_layer_writer = ImageLayerWriter::new( self.conf, self.timeline_id, @@ -4343,6 +4354,7 @@ impl TimelineAdaptor { ctx, key_range.clone(), IoConcurrency::sequential(), + None, ) .await?; From 5ec8881c0bda42db3a735d832171e500dfb11ad8 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 1 Jul 2025 11:11:24 -0700 Subject: [PATCH 08/22] feat(pageserver): resolve feature flag based on remote size (#12400) ## Problem Part of #11813 ## Summary of changes * Compute tenant remote size in the housekeeping loop. * Add a new `TenantFeatureResolver` struct to cache the tenant-specific properties. * Evaluate feature flag based on the remote size. 
--------- Signed-off-by: Alex Chi Z --- pageserver/src/feature_resolver.rs | 103 ++++++++++++++++++- pageserver/src/http/routes.rs | 12 ++- pageserver/src/tenant.rs | 14 ++- pageserver/src/tenant/timeline.rs | 6 +- pageserver/src/tenant/timeline/compaction.rs | 2 +- 5 files changed, 120 insertions(+), 17 deletions(-) diff --git a/pageserver/src/feature_resolver.rs b/pageserver/src/feature_resolver.rs index 3080b0db34..6ce4522080 100644 --- a/pageserver/src/feature_resolver.rs +++ b/pageserver/src/feature_resolver.rs @@ -6,12 +6,13 @@ use posthog_client_lite::{ CaptureEvent, FeatureResolverBackgroundLoop, PostHogEvaluationError, PostHogFlagFilterPropertyValue, }; +use rand::Rng; use remote_storage::RemoteStorageKind; use serde_json::json; use tokio_util::sync::CancellationToken; use utils::id::TenantId; -use crate::{config::PageServerConf, metrics::FEATURE_FLAG_EVALUATION}; +use crate::{config::PageServerConf, metrics::FEATURE_FLAG_EVALUATION, tenant::TenantShard}; const DEFAULT_POSTHOG_REFRESH_INTERVAL: Duration = Duration::from_secs(600); @@ -138,6 +139,7 @@ impl FeatureResolver { } Arc::new(properties) }; + let fake_tenants = { let mut tenants = Vec::new(); for i in 0..10 { @@ -147,9 +149,16 @@ impl FeatureResolver { conf.id, i ); + + let tenant_properties = PerTenantProperties { + remote_size_mb: Some(rand::thread_rng().gen_range(100.0..1000000.00)), + } + .into_posthog_properties(); + let properties = Self::collect_properties_inner( distinct_id.clone(), Some(&internal_properties), + &tenant_properties, ); tenants.push(CaptureEvent { event: "initial_tenant_report".to_string(), @@ -183,6 +192,7 @@ impl FeatureResolver { fn collect_properties_inner( tenant_id: String, internal_properties: Option<&HashMap>, + tenant_properties: &HashMap, ) -> HashMap { let mut properties = HashMap::new(); if let Some(internal_properties) = internal_properties { @@ -194,6 +204,9 @@ impl FeatureResolver { "tenant_id".to_string(), PostHogFlagFilterPropertyValue::String(tenant_id), ); + for (key, value) in tenant_properties.iter() { + properties.insert(key.clone(), value.clone()); + } properties } @@ -201,8 +214,13 @@ impl FeatureResolver { pub(crate) fn collect_properties( &self, tenant_id: TenantId, + tenant_properties: &HashMap, ) -> HashMap { - Self::collect_properties_inner(tenant_id.to_string(), self.internal_properties.as_deref()) + Self::collect_properties_inner( + tenant_id.to_string(), + self.internal_properties.as_deref(), + tenant_properties, + ) } /// Evaluate a multivariate feature flag. Currently, we do not support any properties. 
@@ -214,6 +232,7 @@ impl FeatureResolver { &self, flag_key: &str, tenant_id: TenantId, + tenant_properties: &HashMap, ) -> Result { let force_overrides = self.force_overrides_for_testing.load(); if let Some(value) = force_overrides.get(flag_key) { @@ -224,7 +243,7 @@ impl FeatureResolver { let res = inner.feature_store().evaluate_multivariate( flag_key, &tenant_id.to_string(), - &self.collect_properties(tenant_id), + &self.collect_properties(tenant_id, tenant_properties), ); match &res { Ok(value) => { @@ -257,6 +276,7 @@ impl FeatureResolver { &self, flag_key: &str, tenant_id: TenantId, + tenant_properties: &HashMap, ) -> Result<(), PostHogEvaluationError> { let force_overrides = self.force_overrides_for_testing.load(); if let Some(value) = force_overrides.get(flag_key) { @@ -271,7 +291,7 @@ impl FeatureResolver { let res = inner.feature_store().evaluate_boolean( flag_key, &tenant_id.to_string(), - &self.collect_properties(tenant_id), + &self.collect_properties(tenant_id, tenant_properties), ); match &res { Ok(()) => { @@ -317,3 +337,78 @@ impl FeatureResolver { .store(Arc::new(force_overrides)); } } + +struct PerTenantProperties { + pub remote_size_mb: Option, +} + +impl PerTenantProperties { + pub fn into_posthog_properties(self) -> HashMap { + let mut properties = HashMap::new(); + if let Some(remote_size_mb) = self.remote_size_mb { + properties.insert( + "tenant_remote_size_mb".to_string(), + PostHogFlagFilterPropertyValue::Number(remote_size_mb), + ); + } + properties + } +} + +#[derive(Clone)] +pub struct TenantFeatureResolver { + inner: FeatureResolver, + tenant_id: TenantId, + cached_tenant_properties: Arc>>, +} + +impl TenantFeatureResolver { + pub fn new(inner: FeatureResolver, tenant_id: TenantId) -> Self { + Self { + inner, + tenant_id, + cached_tenant_properties: Arc::new(ArcSwap::new(Arc::new(HashMap::new()))), + } + } + + pub fn evaluate_multivariate(&self, flag_key: &str) -> Result { + self.inner.evaluate_multivariate( + flag_key, + self.tenant_id, + &self.cached_tenant_properties.load(), + ) + } + + pub fn evaluate_boolean(&self, flag_key: &str) -> Result<(), PostHogEvaluationError> { + self.inner.evaluate_boolean( + flag_key, + self.tenant_id, + &self.cached_tenant_properties.load(), + ) + } + + pub fn collect_properties(&self) -> HashMap { + self.inner + .collect_properties(self.tenant_id, &self.cached_tenant_properties.load()) + } + + pub fn is_feature_flag_boolean(&self, flag_key: &str) -> Result { + self.inner.is_feature_flag_boolean(flag_key) + } + + pub fn update_cached_tenant_properties(&self, tenant_shard: &TenantShard) { + let mut remote_size_mb = None; + for timeline in tenant_shard.list_timelines() { + let size = timeline.metrics.resident_physical_size_get(); + if size == 0 { + remote_size_mb = None; + } + if let Some(ref mut remote_size_mb) = remote_size_mb { + *remote_size_mb += size as f64 / 1024.0 / 1024.0; + } + } + self.cached_tenant_properties.store(Arc::new( + PerTenantProperties { remote_size_mb }.into_posthog_properties(), + )); + } +} diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 119275f885..b18b7d6bcd 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -3697,23 +3697,25 @@ async fn tenant_evaluate_feature_flag( let tenant = state .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; - let properties = tenant.feature_resolver.collect_properties(tenant_shard_id.tenant_id); + // TODO: the properties we get here might be stale right after it is collected. 
But such races are rare (updated every 10s) + // and we don't need to worry about it for now. + let properties = tenant.feature_resolver.collect_properties(); if as_type.as_deref() == Some("boolean") { - let result = tenant.feature_resolver.evaluate_boolean(&flag, tenant_shard_id.tenant_id); + let result = tenant.feature_resolver.evaluate_boolean(&flag); let result = result.map(|_| true).map_err(|e| e.to_string()); json_response(StatusCode::OK, json!({ "result": result, "properties": properties })) } else if as_type.as_deref() == Some("multivariate") { - let result = tenant.feature_resolver.evaluate_multivariate(&flag, tenant_shard_id.tenant_id).map_err(|e| e.to_string()); + let result = tenant.feature_resolver.evaluate_multivariate(&flag).map_err(|e| e.to_string()); json_response(StatusCode::OK, json!({ "result": result, "properties": properties })) } else { // Auto infer the type of the feature flag. let is_boolean = tenant.feature_resolver.is_feature_flag_boolean(&flag).map_err(|e| ApiError::InternalServerError(anyhow::anyhow!("{e}")))?; if is_boolean { - let result = tenant.feature_resolver.evaluate_boolean(&flag, tenant_shard_id.tenant_id); + let result = tenant.feature_resolver.evaluate_boolean(&flag); let result = result.map(|_| true).map_err(|e| e.to_string()); json_response(StatusCode::OK, json!({ "result": result, "properties": properties })) } else { - let result = tenant.feature_resolver.evaluate_multivariate(&flag, tenant_shard_id.tenant_id).map_err(|e| e.to_string()); + let result = tenant.feature_resolver.evaluate_multivariate(&flag).map_err(|e| e.to_string()); json_response(StatusCode::OK, json!({ "result": result, "properties": properties })) } } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index fcb18e8553..3756ebfad9 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -86,7 +86,7 @@ use crate::context; use crate::context::RequestContextBuilder; use crate::context::{DownloadBehavior, RequestContext}; use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError}; -use crate::feature_resolver::FeatureResolver; +use crate::feature_resolver::{FeatureResolver, TenantFeatureResolver}; use crate::l0_flush::L0FlushGlobalState; use crate::metrics::{ BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, CONCURRENT_INITDBS, @@ -386,7 +386,7 @@ pub struct TenantShard { l0_flush_global_state: L0FlushGlobalState, - pub(crate) feature_resolver: FeatureResolver, + pub(crate) feature_resolver: TenantFeatureResolver, } impl std::fmt::Debug for TenantShard { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { @@ -3263,7 +3263,7 @@ impl TenantShard { }; let gc_compaction_strategy = self .feature_resolver - .evaluate_multivariate("gc-comapction-strategy", self.tenant_shard_id.tenant_id) + .evaluate_multivariate("gc-comapction-strategy") .ok(); let span = if let Some(gc_compaction_strategy) = gc_compaction_strategy { info_span!("gc_compact_timeline", timeline_id = %timeline.timeline_id, strategy = %gc_compaction_strategy) @@ -3408,6 +3408,9 @@ impl TenantShard { if let Some(ref walredo_mgr) = self.walredo_mgr { walredo_mgr.maybe_quiesce(WALREDO_IDLE_TIMEOUT); } + + // Update the feature resolver with the latest tenant-spcific data. 
+ self.feature_resolver.update_cached_tenant_properties(self); } pub fn timeline_has_no_attached_children(&self, timeline_id: TimelineId) -> bool { @@ -4490,7 +4493,10 @@ impl TenantShard { gc_block: Default::default(), l0_flush_global_state, basebackup_cache, - feature_resolver, + feature_resolver: TenantFeatureResolver::new( + feature_resolver, + tenant_shard_id.tenant_id, + ), } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index aca44718fa..443fb7fafb 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -106,7 +106,7 @@ use crate::context::{ DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder, }; use crate::disk_usage_eviction_task::{DiskUsageEvictionInfo, EvictionCandidate, finite_f32}; -use crate::feature_resolver::FeatureResolver; +use crate::feature_resolver::TenantFeatureResolver; use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::l0_flush::{self, L0FlushGlobalState}; use crate::metrics::{ @@ -202,7 +202,7 @@ pub struct TimelineResources { pub l0_compaction_trigger: Arc, pub l0_flush_global_state: l0_flush::L0FlushGlobalState, pub basebackup_cache: Arc, - pub feature_resolver: FeatureResolver, + pub feature_resolver: TenantFeatureResolver, } pub struct Timeline { @@ -450,7 +450,7 @@ pub struct Timeline { /// A channel to send async requests to prepare a basebackup for the basebackup cache. basebackup_cache: Arc, - feature_resolver: FeatureResolver, + feature_resolver: TenantFeatureResolver, } pub(crate) enum PreviousHeatmap { diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 43573c28a2..9b64938b3e 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1319,7 +1319,7 @@ impl Timeline { || cfg!(feature = "testing") || self .feature_resolver - .evaluate_boolean("image-compaction-boundary", self.tenant_shard_id.tenant_id) + .evaluate_boolean("image-compaction-boundary") .is_ok() { let last_repartition_lsn = self.partitioning.read().1; From 8e7ce4222968a75cbaafa390797d638996b659ac Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Wed, 2 Jul 2025 09:41:17 +0400 Subject: [PATCH 09/22] tests: start primary compute on not-readonly branches (#12408) ## Problem https://github.com/neondatabase/neon/pull/11712 changed how computes are started in the test: the lsn is specified, making them read-only static replicas. Lsn is `last_record_lsn` from pageserver. It works fine with read-only branches (because their `last_record_lsn` is equal to `start_lsn` and always valid). But with writable timelines, the `last_record_lsn` on the pageserver might be stale. Particularly in this test, after the `detach_branch` operation, the tenant is reset on the pagesever. It leads to `last_record_lsn` going back to `disk_consistent_lsn`, so basically rolling back some recent writes. If we start a primary compute, it will start at safekeepers' commit Lsn, which is the correct one , and will wait till pageserver catches up with this Lsn after reset. - Closes: https://github.com/neondatabase/neon/issues/12365 ## Summary of changes - Start `primary` compute for writable timelines. 
--- .../regress/test_timeline_detach_ancestor.py | 33 +++++++++++-------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index b5cc431afe..22be3d61ba 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -324,7 +324,7 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder): # it is to be in line with the deletion timestamp.. well, almost. when = original_ancestor[2][:26] when_ts = datetime.datetime.fromisoformat(when).replace(tzinfo=datetime.UTC) - now = datetime.datetime.utcnow().replace(tzinfo=datetime.UTC) + now = datetime.datetime.now(datetime.UTC) assert when_ts < now assert len(lineage.get("reparenting_history", [])) == 0 elif expected_ancestor == timeline_id: @@ -458,19 +458,20 @@ def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder, snapshots env.pageserver.quiesce_tenants() - # checking the ancestor after is much faster than waiting for the endpoint not start + # checking the ancestor after is much faster than waiting for the endpoint to start expected_result = [ - ("main", env.initial_timeline, None, 24576, 1), - ("after", after, env.initial_timeline, 24576, 1), - ("snapshot_branchpoint_old", snapshot_branchpoint_old, env.initial_timeline, 8192, 1), - ("snapshot_branchpoint", snapshot_branchpoint, env.initial_timeline, 16384, 1), - ("branch_to_detach", branch_to_detach, None, 16384, 1), - ("earlier", earlier, env.initial_timeline, 0, 1), + # (branch_name, queried_timeline, expected_ancestor, rows, starts, read_only) + ("main", env.initial_timeline, None, 24576, 1, False), + ("after", after, env.initial_timeline, 24576, 1, False), + ("snapshot_branchpoint_old", snapshot_branchpoint_old, env.initial_timeline, 8192, 1, True), + ("snapshot_branchpoint", snapshot_branchpoint, env.initial_timeline, 16384, 1, False), + ("branch_to_detach", branch_to_detach, None, 16384, 1, False), + ("earlier", earlier, env.initial_timeline, 0, 1, False), ] assert isinstance(env.pageserver_remote_storage, LocalFsStorage) - for branch_name, queried_timeline, expected_ancestor, _, _ in expected_result: + for branch_name, queried_timeline, expected_ancestor, _, _, _ in expected_result: details = client.timeline_detail(env.initial_tenant, queried_timeline) ancestor_timeline_id = details["ancestor_timeline_id"] if expected_ancestor is None: @@ -508,13 +509,17 @@ def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder, snapshots assert len(lineage.get("original_ancestor", [])) == 0 assert len(lineage.get("reparenting_history", [])) == 0 - for branch_name, queried_timeline, _, rows, starts in expected_result: - details = client.timeline_detail(env.initial_tenant, queried_timeline) - log.info(f"reading data from branch {branch_name}") - # specifying the lsn makes the endpoint read-only and not connect to safekeepers + for branch_name, queried_timeline, _, rows, starts, read_only in expected_result: + last_record_lsn = None + if read_only: + # specifying the lsn makes the endpoint read-only and not connect to safekeepers + details = client.timeline_detail(env.initial_tenant, queried_timeline) + last_record_lsn = Lsn(details["last_record_lsn"]) + + log.info(f"reading data from branch {branch_name} at {last_record_lsn}") with env.endpoints.create( branch_name, - lsn=Lsn(details["last_record_lsn"]), + lsn=last_record_lsn, ) as ep: ep.start(safekeeper_generation=1) 
assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows From 0f879a2e8f9dc703a9a7b53741ef1e7324d459c9 Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Wed, 2 Jul 2025 10:55:44 +0200 Subject: [PATCH 10/22] [proxy]: Fix redis IRSA expiration failure errors (#12430) Relates to the [#30688](https://github.com/neondatabase/cloud/issues/30688) --- proxy/src/cancellation.rs | 15 +++++++++++++ .../connection_with_credentials_provider.rs | 22 ++++++++++++++----- proxy/src/redis/kv_ops.rs | 6 ++++- 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index ffc0cf43f1..74413f1a7d 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -64,6 +64,13 @@ impl Pipeline { let responses = self.replies; let batch_size = self.inner.len(); + if !client.credentials_refreshed() { + tracing::debug!( + "Redis credentials are not refreshed. Sleeping for 5 seconds before retrying..." + ); + tokio::time::sleep(Duration::from_secs(5)).await; + } + match client.query(&self.inner).await { // for each reply, we expect that many values. Ok(Value::Array(values)) if values.len() == responses => { @@ -127,6 +134,14 @@ impl QueueProcessing for CancellationProcessor { } async fn apply(&mut self, batch: Vec) -> Vec { + if !self.client.credentials_refreshed() { + // this will cause a timeout for cancellation operations + tracing::debug!( + "Redis credentials are not refreshed. Sleeping for 5 seconds before retrying..." + ); + tokio::time::sleep(Duration::from_secs(5)).await; + } + let mut pipeline = Pipeline::with_capacity(batch.len()); let batch_size = batch.len(); diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs index fe656557ac..510701cb27 100644 --- a/proxy/src/redis/connection_with_credentials_provider.rs +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -1,4 +1,4 @@ -use std::sync::Arc; +use std::sync::{Arc, atomic::AtomicBool, atomic::Ordering}; use std::time::Duration; use futures::FutureExt; @@ -33,6 +33,7 @@ pub struct ConnectionWithCredentialsProvider { con: Option, refresh_token_task: Option>, mutex: tokio::sync::Mutex<()>, + credentials_refreshed: Arc, } impl Clone for ConnectionWithCredentialsProvider { @@ -42,6 +43,7 @@ impl Clone for ConnectionWithCredentialsProvider { con: None, refresh_token_task: None, mutex: tokio::sync::Mutex::new(()), + credentials_refreshed: Arc::new(AtomicBool::new(false)), } } } @@ -65,6 +67,7 @@ impl ConnectionWithCredentialsProvider { con: None, refresh_token_task: None, mutex: tokio::sync::Mutex::new(()), + credentials_refreshed: Arc::new(AtomicBool::new(false)), } } @@ -78,6 +81,7 @@ impl ConnectionWithCredentialsProvider { con: None, refresh_token_task: None, mutex: tokio::sync::Mutex::new(()), + credentials_refreshed: Arc::new(AtomicBool::new(true)), } } @@ -85,6 +89,10 @@ impl ConnectionWithCredentialsProvider { redis::cmd("PING").query_async(con).await } + pub(crate) fn credentials_refreshed(&self) -> bool { + self.credentials_refreshed.load(Ordering::Relaxed) + } + pub(crate) async fn connect(&mut self) -> anyhow::Result<()> { let _guard = self.mutex.lock().await; if let Some(con) = self.con.as_mut() { @@ -112,11 +120,15 @@ impl ConnectionWithCredentialsProvider { if let Credentials::Dynamic(credentials_provider, _) = &self.credentials { let credentials_provider = credentials_provider.clone(); let con2 = con.clone(); + let credentials_refreshed = self.credentials_refreshed.clone(); let f = 
tokio::spawn(async move { - Self::keep_connection(con2, credentials_provider) - .await - .inspect_err(|e| debug!("keep_connection failed: {e}")) - .ok(); + let result = Self::keep_connection(con2, credentials_provider).await; + if let Err(e) = result { + credentials_refreshed.store(false, Ordering::Release); + debug!("keep_connection failed: {e}"); + } else { + credentials_refreshed.store(true, Ordering::Release); + } }); self.refresh_token_task = Some(f); } diff --git a/proxy/src/redis/kv_ops.rs b/proxy/src/redis/kv_ops.rs index 671fe09b0b..cfdbc21839 100644 --- a/proxy/src/redis/kv_ops.rs +++ b/proxy/src/redis/kv_ops.rs @@ -40,6 +40,10 @@ impl RedisKVClient { .inspect_err(|e| tracing::error!("failed to connect to redis: {e}")) } + pub(crate) fn credentials_refreshed(&self) -> bool { + self.client.credentials_refreshed() + } + pub(crate) async fn query( &mut self, q: &impl Queryable, @@ -49,7 +53,7 @@ impl RedisKVClient { Err(e) => e, }; - tracing::error!("failed to run query: {e}"); + tracing::debug!("failed to run query: {e}"); match e.retry_method() { redis::RetryMethod::Reconnect => { tracing::info!("Redis client is disconnected. Reconnecting..."); From efd7e528128f664d2aeabaf62f6657cd2824e341 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 2 Jul 2025 14:06:55 +0200 Subject: [PATCH 11/22] Don't error if timeline offload is already in progress (#12428) Don't print errors like: ``` Compaction failed 1 times, retrying in 2s: Failed to offload timeline: Unexpected offload error: Timeline deletion is already in progress ``` Print it at info log level instead. https://github.com/neondatabase/cloud/issues/30666 --- pageserver/src/http/routes.rs | 1 + pageserver/src/tenant.rs | 1 + pageserver/src/tenant/timeline/offload.rs | 30 ++++++++++++++--------- 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index b18b7d6bcd..02094e6aa9 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2438,6 +2438,7 @@ async fn timeline_offload_handler( .map_err(|e| { match e { OffloadError::Cancelled => ApiError::ResourceUnavailable("Timeline shutting down".into()), + OffloadError::AlreadyInProgress => ApiError::Conflict("Timeline already being offloaded or deleted".into()), _ => ApiError::InternalServerError(anyhow!(e)) } })?; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 3756ebfad9..f4877fd763 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3285,6 +3285,7 @@ impl TenantShard { .or_else(|err| match err { // Ignore this, we likely raced with unarchival. 
OffloadError::NotArchived => Ok(()), + OffloadError::AlreadyInProgress => Ok(()), err => Err(err), })?; } diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 5920315917..9464f034c7 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -19,6 +19,8 @@ pub(crate) enum OffloadError { NotArchived, #[error(transparent)] RemoteStorage(anyhow::Error), + #[error("Offload or deletion already in progress")] + AlreadyInProgress, #[error("Unexpected offload error: {0}")] Other(anyhow::Error), } @@ -44,20 +46,26 @@ pub(crate) async fn offload_timeline( timeline.timeline_id, TimelineDeleteGuardKind::Offload, ); - if let Err(DeleteTimelineError::HasChildren(children)) = delete_guard_res { - let is_archived = timeline.is_archived(); - if is_archived == Some(true) { - tracing::error!("timeline is archived but has non-archived children: {children:?}"); + let (timeline, guard) = match delete_guard_res { + Ok(timeline_and_guard) => timeline_and_guard, + Err(DeleteTimelineError::HasChildren(children)) => { + let is_archived = timeline.is_archived(); + if is_archived == Some(true) { + tracing::error!("timeline is archived but has non-archived children: {children:?}"); + return Err(OffloadError::NotArchived); + } + tracing::info!( + ?is_archived, + "timeline is not archived and has unarchived children" + ); return Err(OffloadError::NotArchived); } - tracing::info!( - ?is_archived, - "timeline is not archived and has unarchived children" - ); - return Err(OffloadError::NotArchived); + Err(DeleteTimelineError::AlreadyInProgress(_)) => { + tracing::info!("timeline offload or deletion already in progress"); + return Err(OffloadError::AlreadyInProgress); + } + Err(e) => return Err(OffloadError::Other(anyhow::anyhow!(e))), }; - let (timeline, guard) = - delete_guard_res.map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?; let TimelineOrOffloaded::Timeline(timeline) = timeline else { tracing::error!("timeline already offloaded, but given timeline object"); From d6beb3ffbb8631931c8bae922bec955088354ee5 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 2 Jul 2025 13:46:11 +0100 Subject: [PATCH 12/22] [proxy] rewrite pg-text to json routines (#12413) We would like to move towards an arena system for JSON encoding the responses. This change pushes an "out" parameter into the pg-test to json routines to make swapping in an arena system easier in the future. (see #11992) This additionally removes the redundant `column: &[Type]` argument, as well as rewriting the pg_array parser. --- I rewrote the pg_array parser since while making these changes I found it hard to reason about. I went back to the specification and rewrote it from scratch. There's 4 separate routines: 1. pg_array_parse - checks for any prelude (multidimensional array ranges) 2. pg_array_parse_inner - only deals with the arrays themselves 3. pg_array_parse_item - parses a single item from the array, this might be quoted, unquoted, or another nested array. 4. pg_array_parse_quoted - parses a quoted string, following the relevant string escaping rules. 
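To make the input format concrete, here is a minimal, illustrative sketch (not part of the patch) of the kinds of array literals these routines have to handle, together with the JSON the rewritten parser is expected to produce for a `TEXT` element type. The case list is purely illustrative; only `serde_json` is assumed, and the expected values follow the behaviours exercised by the unit tests in `proxy/src/serverless/json.rs`.

```rust
use serde_json::{Value, json};

fn main() {
    // (Postgres array literal, JSON the parser is expected to produce)
    let cases: Vec<(&str, Value)> = vec![
        // unquoted items; a bare NULL becomes JSON null
        (r#"{a,b,NULL}"#, json!(["a", "b", null])),
        // quoted item with an escaped quote and an escaped backslash
        (r#"{"a\"b","c\\d"}"#, json!([r#"a"b"#, r"c\d"])),
        // nested (multidimensional) array
        (r#"{{a,b},{c,d}}"#, json!([["a", "b"], ["c", "d"]])),
        // dimension-bounds decoration before `=` is skipped
        (r#"[1:2]={x,y}"#, json!(["x", "y"])),
    ];

    for (literal, expected) in cases {
        println!("{literal} -> {expected}");
    }
}
```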
--- proxy/src/serverless/json.rs | 488 ++++++++++++++++---------- proxy/src/serverless/sql_over_http.rs | 5 +- 2 files changed, 304 insertions(+), 189 deletions(-) diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs index 1afc10359f..2e67d07079 100644 --- a/proxy/src/serverless/json.rs +++ b/proxy/src/serverless/json.rs @@ -70,6 +70,34 @@ pub(crate) enum JsonConversionError { ParseJsonError(#[from] serde_json::Error), #[error("unbalanced array")] UnbalancedArray, + #[error("unbalanced quoted string")] + UnbalancedString, +} + +enum OutputMode { + Array(Vec), + Object(Map), +} + +impl OutputMode { + fn key(&mut self, key: &str) -> &mut Value { + match self { + OutputMode::Array(values) => push_entry(values, Value::Null), + OutputMode::Object(map) => map.entry(key.to_string()).or_insert(Value::Null), + } + } + + fn finish(self) -> Value { + match self { + OutputMode::Array(values) => Value::Array(values), + OutputMode::Object(map) => Value::Object(map), + } + } +} + +fn push_entry(arr: &mut Vec, t: T) -> &mut T { + arr.push(t); + arr.last_mut().expect("a value was just inserted") } // @@ -77,182 +105,277 @@ pub(crate) enum JsonConversionError { // pub(crate) fn pg_text_row_to_json( row: &Row, - columns: &[Type], raw_output: bool, array_mode: bool, ) -> Result { - let iter = row - .columns() - .iter() - .zip(columns) - .enumerate() - .map(|(i, (column, typ))| { - let name = column.name(); - let pg_value = row.as_text(i).map_err(JsonConversionError::AsTextError)?; - let json_value = if raw_output { - match pg_value { - Some(v) => Value::String(v.to_string()), - None => Value::Null, - } - } else { - pg_text_to_json(pg_value, typ)? - }; - Ok((name.to_string(), json_value)) - }); - - if array_mode { - // drop keys and aggregate into array - let arr = iter - .map(|r| r.map(|(_key, val)| val)) - .collect::, JsonConversionError>>()?; - Ok(Value::Array(arr)) + let mut entries = if array_mode { + OutputMode::Array(Vec::with_capacity(row.columns().len())) } else { - let obj = iter.collect::, JsonConversionError>>()?; - Ok(Value::Object(obj)) + OutputMode::Object(Map::with_capacity(row.columns().len())) + }; + + for (i, column) in row.columns().iter().enumerate() { + let pg_value = row.as_text(i).map_err(JsonConversionError::AsTextError)?; + + let value = entries.key(column.name()); + + match pg_value { + Some(v) if raw_output => *value = Value::String(v.to_string()), + Some(v) => pg_text_to_json(value, v, column.type_())?, + None => *value = Value::Null, + } } + + Ok(entries.finish()) } // // Convert postgres text-encoded value to JSON value // -fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result { - if let Some(val) = pg_value { - if let Kind::Array(elem_type) = pg_type.kind() { - return pg_array_parse(val, elem_type); - } +fn pg_text_to_json( + output: &mut Value, + val: &str, + pg_type: &Type, +) -> Result<(), JsonConversionError> { + if let Kind::Array(elem_type) = pg_type.kind() { + // todo: we should fetch this from postgres. 
+ let delimiter = ','; - match *pg_type { - Type::BOOL => Ok(Value::Bool(val == "t")), - Type::INT2 | Type::INT4 => { - let val = val.parse::()?; - Ok(Value::Number(serde_json::Number::from(val))) - } - Type::FLOAT4 | Type::FLOAT8 => { - let fval = val.parse::()?; - let num = serde_json::Number::from_f64(fval); - if let Some(num) = num { - Ok(Value::Number(num)) - } else { - // Pass Nan, Inf, -Inf as strings - // JS JSON.stringify() does converts them to null, but we - // want to preserve them, so we pass them as strings - Ok(Value::String(val.to_string())) - } - } - Type::JSON | Type::JSONB => Ok(serde_json::from_str(val)?), - _ => Ok(Value::String(val.to_string())), - } - } else { - Ok(Value::Null) - } -} - -// -// Parse postgres array into JSON array. -// -// This is a bit involved because we need to handle nested arrays and quoted -// values. Unlike postgres we don't check that all nested arrays have the same -// dimensions, we just return them as is. -// -fn pg_array_parse(pg_array: &str, elem_type: &Type) -> Result { - pg_array_parse_inner(pg_array, elem_type, false).map(|(v, _)| v) -} - -fn pg_array_parse_inner( - pg_array: &str, - elem_type: &Type, - nested: bool, -) -> Result<(Value, usize), JsonConversionError> { - let mut pg_array_chr = pg_array.char_indices(); - let mut level = 0; - let mut quote = false; - let mut entries: Vec = Vec::new(); - let mut entry = String::new(); - - // skip bounds decoration - if let Some('[') = pg_array.chars().next() { - for (_, c) in pg_array_chr.by_ref() { - if c == '=' { - break; - } - } + let mut array = vec![]; + pg_array_parse(&mut array, val, elem_type, delimiter)?; + *output = Value::Array(array); + return Ok(()); } - fn push_checked( - entry: &mut String, - entries: &mut Vec, - elem_type: &Type, - ) -> Result<(), JsonConversionError> { - if !entry.is_empty() { - // While in usual postgres response we get nulls as None and everything else - // as Some(&str), in arrays we get NULL as unquoted 'NULL' string (while - // string with value 'NULL' will be represented by '"NULL"'). So catch NULLs - // here while we have quotation info and convert them to None. 
- if entry == "NULL" { - entries.push(pg_text_to_json(None, elem_type)?); + match *pg_type { + Type::BOOL => *output = Value::Bool(val == "t"), + Type::INT2 | Type::INT4 => { + let val = val.parse::()?; + *output = Value::Number(serde_json::Number::from(val)); + } + Type::FLOAT4 | Type::FLOAT8 => { + let fval = val.parse::()?; + let num = serde_json::Number::from_f64(fval); + if let Some(num) = num { + *output = Value::Number(num); } else { - entries.push(pg_text_to_json(Some(entry), elem_type)?); + // Pass Nan, Inf, -Inf as strings + // JS JSON.stringify() does converts them to null, but we + // want to preserve them, so we pass them as strings + *output = Value::String(val.to_string()); } - entry.clear(); } - - Ok(()) + Type::JSON | Type::JSONB => *output = serde_json::from_str(val)?, + _ => *output = Value::String(val.to_string()), } - while let Some((mut i, mut c)) = pg_array_chr.next() { - let mut escaped = false; + Ok(()) +} - if c == '\\' { - escaped = true; - let Some(x) = pg_array_chr.next() else { - return Err(JsonConversionError::UnbalancedArray); - }; - (i, c) = x; - } - - match c { - '{' if !quote => { - level += 1; - if level > 1 { - let (res, off) = pg_array_parse_inner(&pg_array[i..], elem_type, true)?; - entries.push(res); - for _ in 0..off - 1 { - pg_array_chr.next(); - } - } - } - '}' if !quote => { - level -= 1; - if level == 0 { - push_checked(&mut entry, &mut entries, elem_type)?; - if nested { - return Ok((Value::Array(entries), i)); - } - } - } - '"' if !escaped => { - if quote { - // end of quoted string, so push it manually without any checks - // for emptiness or nulls - entries.push(pg_text_to_json(Some(&entry), elem_type)?); - entry.clear(); - } - quote = !quote; - } - ',' if !quote => { - push_checked(&mut entry, &mut entries, elem_type)?; - } - _ => { - entry.push(c); - } - } +/// Parse postgres array into JSON array. +/// +/// This is a bit involved because we need to handle nested arrays and quoted +/// values. Unlike postgres we don't check that all nested arrays have the same +/// dimensions, we just return them as is. +/// +/// +/// +/// The external text representation of an array value consists of items that are interpreted +/// according to the I/O conversion rules for the array's element type, plus decoration that +/// indicates the array structure. The decoration consists of curly braces (`{` and `}`) around +/// the array value plus delimiter characters between adjacent items. The delimiter character +/// is usually a comma (,) but can be something else: it is determined by the typdelim setting +/// for the array's element type. Among the standard data types provided in the PostgreSQL +/// distribution, all use a comma, except for type box, which uses a semicolon (;). +/// +/// In a multidimensional array, each dimension (row, plane, cube, etc.) +/// gets its own level of curly braces, and delimiters must be written between adjacent +/// curly-braced entities of the same level. +fn pg_array_parse( + elements: &mut Vec, + mut pg_array: &str, + elem: &Type, + delim: char, +) -> Result<(), JsonConversionError> { + // skip bounds decoration, eg: + // `[1:1][-2:-1][3:5]={{{1,2,3},{4,5,6}}}` + // technically these are significant, but we have no way to represent them in json. + if let Some('[') = pg_array.chars().next() { + let Some((_bounds, array)) = pg_array.split_once('=') else { + return Err(JsonConversionError::UnbalancedArray); + }; + pg_array = array; } - if level != 0 { + // whitespace might preceed a `{`. 
+ let pg_array = pg_array.trim_start(); + + let rest = pg_array_parse_inner(elements, pg_array, elem, delim)?; + if !rest.is_empty() { return Err(JsonConversionError::UnbalancedArray); } - Ok((Value::Array(entries), 0)) + Ok(()) +} + +/// reads a single array from the `pg_array` string and pushes each values to `elements`. +/// returns the rest of the `pg_array` string that was not read. +fn pg_array_parse_inner<'a>( + elements: &mut Vec, + mut pg_array: &'a str, + elem: &Type, + delim: char, +) -> Result<&'a str, JsonConversionError> { + // array should have a `{` prefix. + pg_array = pg_array + .strip_prefix('{') + .ok_or(JsonConversionError::UnbalancedArray)?; + + let mut q = String::new(); + + loop { + let value = push_entry(elements, Value::Null); + pg_array = pg_array_parse_item(value, &mut q, pg_array, elem, delim)?; + + // check for separator. + if let Some(next) = pg_array.strip_prefix(delim) { + // next item. + pg_array = next; + } else { + break; + } + } + + let Some(next) = pg_array.strip_prefix('}') else { + // missing `}` terminator. + return Err(JsonConversionError::UnbalancedArray); + }; + + // whitespace might follow a `}`. + Ok(next.trim_start()) +} + +/// reads a single item from the `pg_array` string. +/// returns the rest of the `pg_array` string that was not read. +/// +/// `quoted` is a scratch allocation that has no defined output. +fn pg_array_parse_item<'a>( + output: &mut Value, + quoted: &mut String, + mut pg_array: &'a str, + elem: &Type, + delim: char, +) -> Result<&'a str, JsonConversionError> { + // We are trying to parse an array item. + // This could be a new array, if this is a multi-dimentional array. + // This could be a quoted string representing `elem`. + // This could be an unquoted string representing `elem`. + + // whitespace might preceed an item. + pg_array = pg_array.trim_start(); + + if pg_array.starts_with('{') { + // nested array. + let mut nested = vec![]; + pg_array = pg_array_parse_inner(&mut nested, pg_array, elem, delim)?; + *output = Value::Array(nested); + return Ok(pg_array); + } + + if let Some(mut pg_array) = pg_array.strip_prefix('"') { + // the parsed string is un-escaped and written into quoted. + pg_array = pg_array_parse_quoted(quoted, pg_array)?; + + // we have un-escaped the string, parse it as pgtext. + pg_text_to_json(output, quoted, elem)?; + + return Ok(pg_array); + } + + // we need to parse an item. read until we find a delimiter or `}`. + let index = pg_array + .find([delim, '}']) + .ok_or(JsonConversionError::UnbalancedArray)?; + + let item; + (item, pg_array) = pg_array.split_at(index); + + // item might have trailing whitespace that we need to ignore. + let item = item.trim_end(); + + // we might have an item string: + // check for null + if item == "NULL" { + *output = Value::Null; + } else { + pg_text_to_json(output, item, elem)?; + } + + Ok(pg_array) +} + +/// reads a single quoted item from the `pg_array` string. +/// +/// Returns the rest of the `pg_array` string that was not read. +/// The output is written into `quoted`. +/// +/// The pg_array string must have a `"` terminator, but the `"` initial value +/// must have already been removed from the input. The terminator is removed. 
+fn pg_array_parse_quoted<'a>( + quoted: &mut String, + mut pg_array: &'a str, +) -> Result<&'a str, JsonConversionError> { + // The array output routine will put double quotes around element values if they are empty strings, + // contain curly braces, delimiter characters, double quotes, backslashes, or white space, + // or match the word `NULL`. Double quotes and backslashes embedded in element values will be backslash-escaped. + // For numeric data types it is safe to assume that double quotes will never appear, + // but for textual data types one should be prepared to cope with either the presence or absence of quotes. + + quoted.clear(); + + // We write to quoted in chunks terminated by an escape character. + // Eg if we have the input `foo\"bar"`, then we write `foo`, then `"`, then finally `bar`. + + loop { + // we need to parse an chunk. read until we find a '\\' or `"`. + let i = pg_array + .find(['\\', '"']) + .ok_or(JsonConversionError::UnbalancedString)?; + + let chunk: &str; + (chunk, pg_array) = pg_array + .split_at_checked(i) + .expect("i is guaranteed to be in-bounds of pg_array"); + + // push the chunk. + quoted.push_str(chunk); + + // consume the chunk_end character. + let chunk_end: char; + (chunk_end, pg_array) = + split_first_char(pg_array).expect("pg_array should start with either '\\\\' or '\"'"); + + // finished. + if chunk_end == '"' { + // whitespace might follow the '"'. + pg_array = pg_array.trim_start(); + + break Ok(pg_array); + } + + // consume the escaped character. + let escaped: char; + (escaped, pg_array) = + split_first_char(pg_array).ok_or(JsonConversionError::UnbalancedString)?; + + quoted.push(escaped); + } +} + +fn split_first_char(s: &str) -> Option<(char, &str)> { + let mut chars = s.chars(); + let c = chars.next()?; + Some((c, chars.as_str())) } #[cfg(test)] @@ -316,37 +439,33 @@ mod tests { ); } + fn pg_text_to_json(val: &str, pg_type: &Type) -> Value { + let mut v = Value::Null; + super::pg_text_to_json(&mut v, val, pg_type).unwrap(); + v + } + + fn pg_array_parse(pg_array: &str, pg_type: &Type) -> Value { + let mut array = vec![]; + super::pg_array_parse(&mut array, pg_array, pg_type, ',').unwrap(); + Value::Array(array) + } + #[test] fn test_atomic_types_parse() { + assert_eq!(pg_text_to_json("foo", &Type::TEXT), json!("foo")); + assert_eq!(pg_text_to_json("42", &Type::INT4), json!(42)); + assert_eq!(pg_text_to_json("42", &Type::INT2), json!(42)); + assert_eq!(pg_text_to_json("42", &Type::INT8), json!("42")); + assert_eq!(pg_text_to_json("42.42", &Type::FLOAT8), json!(42.42)); + assert_eq!(pg_text_to_json("42.42", &Type::FLOAT4), json!(42.42)); + assert_eq!(pg_text_to_json("NaN", &Type::FLOAT4), json!("NaN")); assert_eq!( - pg_text_to_json(Some("foo"), &Type::TEXT).unwrap(), - json!("foo") - ); - assert_eq!(pg_text_to_json(None, &Type::TEXT).unwrap(), json!(null)); - assert_eq!(pg_text_to_json(Some("42"), &Type::INT4).unwrap(), json!(42)); - assert_eq!(pg_text_to_json(Some("42"), &Type::INT2).unwrap(), json!(42)); - assert_eq!( - pg_text_to_json(Some("42"), &Type::INT8).unwrap(), - json!("42") - ); - assert_eq!( - pg_text_to_json(Some("42.42"), &Type::FLOAT8).unwrap(), - json!(42.42) - ); - assert_eq!( - pg_text_to_json(Some("42.42"), &Type::FLOAT4).unwrap(), - json!(42.42) - ); - assert_eq!( - pg_text_to_json(Some("NaN"), &Type::FLOAT4).unwrap(), - json!("NaN") - ); - assert_eq!( - pg_text_to_json(Some("Infinity"), &Type::FLOAT4).unwrap(), + pg_text_to_json("Infinity", &Type::FLOAT4), json!("Infinity") ); assert_eq!( - 
pg_text_to_json(Some("-Infinity"), &Type::FLOAT4).unwrap(), + pg_text_to_json("-Infinity", &Type::FLOAT4), json!("-Infinity") ); @@ -355,10 +474,9 @@ mod tests { .unwrap(); assert_eq!( pg_text_to_json( - Some(r#"{"s":"str","n":42,"f":4.2,"a":[null,3,"a"]}"#), + r#"{"s":"str","n":42,"f":4.2,"a":[null,3,"a"]}"#, &Type::JSONB - ) - .unwrap(), + ), json ); } @@ -366,7 +484,7 @@ mod tests { #[test] fn test_pg_array_parse_text() { fn pt(pg_arr: &str) -> Value { - pg_array_parse(pg_arr, &Type::TEXT).unwrap() + pg_array_parse(pg_arr, &Type::TEXT) } assert_eq!( pt(r#"{"aa\"\\\,a",cha,"bbbb"}"#), @@ -389,7 +507,7 @@ mod tests { #[test] fn test_pg_array_parse_bool() { fn pb(pg_arr: &str) -> Value { - pg_array_parse(pg_arr, &Type::BOOL).unwrap() + pg_array_parse(pg_arr, &Type::BOOL) } assert_eq!(pb(r#"{t,f,t}"#), json!([true, false, true])); assert_eq!(pb(r#"{{t,f,t}}"#), json!([[true, false, true]])); @@ -406,7 +524,7 @@ mod tests { #[test] fn test_pg_array_parse_numbers() { fn pn(pg_arr: &str, ty: &Type) -> Value { - pg_array_parse(pg_arr, ty).unwrap() + pg_array_parse(pg_arr, ty) } assert_eq!(pn(r#"{1,2,3}"#, &Type::INT4), json!([1, 2, 3])); assert_eq!(pn(r#"{1,2,3}"#, &Type::INT2), json!([1, 2, 3])); @@ -434,7 +552,7 @@ mod tests { #[test] fn test_pg_array_with_decoration() { fn p(pg_arr: &str) -> Value { - pg_array_parse(pg_arr, &Type::INT2).unwrap() + pg_array_parse(pg_arr, &Type::INT2) } assert_eq!( p(r#"[1:1][-2:-1][3:5]={{{1,2,3},{4,5,6}}}"#), @@ -445,7 +563,7 @@ mod tests { #[test] fn test_pg_array_parse_json() { fn pt(pg_arr: &str) -> Value { - pg_array_parse(pg_arr, &Type::JSONB).unwrap() + pg_array_parse(pg_arr, &Type::JSONB) } assert_eq!(pt(r#"{"{}"}"#), json!([{}])); assert_eq!( diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 5d5e7bf83e..18ce03c725 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -1135,7 +1135,6 @@ async fn query_to_json( let columns_len = row_stream.statement.columns().len(); let mut fields = Vec::with_capacity(columns_len); - let mut types = Vec::with_capacity(columns_len); for c in row_stream.statement.columns() { fields.push(json!({ @@ -1147,8 +1146,6 @@ async fn query_to_json( "dataTypeModifier": c.type_modifier(), "format": "text", })); - - types.push(c.type_().clone()); } let raw_output = parsed_headers.raw_output; @@ -1170,7 +1167,7 @@ async fn query_to_json( )); } - let row = pg_text_row_to_json(&row, &types, raw_output, array_mode)?; + let row = pg_text_row_to_json(&row, raw_output, array_mode)?; rows.push(row); // assumption: parsing pg text and converting to json takes CPU time. From 0429a0db16ff44e604fa4f3203ad1e75bb6182c0 Mon Sep 17 00:00:00 2001 From: Dmitry Savelev Date: Wed, 2 Jul 2025 18:30:47 +0200 Subject: [PATCH 13/22] Switch the billing metrics storage format to ndjson. (#12427) ## Problem The billing team wants to change the billing events pipeline and use a common events format in S3 buckets across different event producers. ## Summary of changes Change the events storage format for billing events from JSON to NDJSON. Also partition files by hours, rather than days. 
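For illustration, a minimal sketch (not the patch itself) of the NDJSON framing this switches to: each event is serialized as a standalone JSON object and terminated by a newline, so consumers can stream-decode the (gzipped) object line by line. The event fields below are placeholders, not the real consumption-metrics schema; only `serde_json` is assumed.

```rust
use serde_json::json;

fn main() {
    // Placeholder events; the real ones carry idempotency keys, timestamps, etc.
    let events = [
        json!({"metric": "example_metric", "value": 42, "idempotency_key": "k-0"}),
        json!({"metric": "example_metric", "value": 7, "idempotency_key": "k-1"}),
    ];

    let mut body: Vec<u8> = Vec::new();
    for event in &events {
        // One JSON object per line, mirroring the framing used by serialize_in_chunks_ndjson below.
        serde_json::to_writer(&mut body, event).expect("serializing json! values cannot fail");
        body.push(b'\n');
    }

    print!("{}", String::from_utf8(body).expect("serde_json emits valid UTF-8"));
}
```

The uploaded objects also move to an hour-partitioned prefix of the form
`year=YYYY/month=MM/day=DD/hour=HH/HH:MM:SSZ_<node_id>.ndjson.gz`.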
Resolves: https://github.com/neondatabase/cloud/issues/29995 --- pageserver/src/consumption_metrics/upload.rs | 121 +++++++++++++++++- .../test_pageserver_metric_collection.py | 10 +- 2 files changed, 127 insertions(+), 4 deletions(-) diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs index eba773272a..16d42b6fe4 100644 --- a/pageserver/src/consumption_metrics/upload.rs +++ b/pageserver/src/consumption_metrics/upload.rs @@ -99,7 +99,7 @@ pub(super) async fn upload_metrics_bucket( // Compose object path let datetime: DateTime = SystemTime::now().into(); - let ts_prefix = datetime.format("year=%Y/month=%m/day=%d/%H:%M:%SZ"); + let ts_prefix = datetime.format("year=%Y/month=%m/day=%d/hour=%H/%H:%M:%SZ"); let path = RemotePath::from_string(&format!("{ts_prefix}_{node_id}.ndjson.gz"))?; // Set up a gzip writer into a buffer @@ -109,7 +109,7 @@ pub(super) async fn upload_metrics_bucket( // Serialize and write into compressed buffer let started_at = std::time::Instant::now(); - for res in serialize_in_chunks(CHUNK_SIZE, metrics, idempotency_keys) { + for res in serialize_in_chunks_ndjson(CHUNK_SIZE, metrics, idempotency_keys) { let (_chunk, body) = res?; gzip_writer.write_all(&body).await?; } @@ -216,6 +216,86 @@ fn serialize_in_chunks<'a>( } } +/// Serializes the input metrics as NDJSON in chunks of chunk_size. Each event +/// is serialized as a separate JSON object on its own line. The provided +/// idempotency keys are injected into the corresponding metric events (reused +/// across different metrics sinks), and must have the same length as input. +fn serialize_in_chunks_ndjson<'a>( + chunk_size: usize, + input: &'a [NewRawMetric], + idempotency_keys: &'a [IdempotencyKey<'a>], +) -> impl ExactSizeIterator> + 'a +{ + use bytes::BufMut; + + assert_eq!(input.len(), idempotency_keys.len()); + + struct Iter<'a> { + inner: std::slice::Chunks<'a, NewRawMetric>, + idempotency_keys: std::slice::Iter<'a, IdempotencyKey<'a>>, + chunk_size: usize, + + // write to a BytesMut so that we can cheaply clone the frozen Bytes for retries + buffer: bytes::BytesMut, + // chunk amount of events are reused to produce the serialized document + scratch: Vec>, + } + + impl<'a> Iterator for Iter<'a> { + type Item = Result<(&'a [NewRawMetric], bytes::Bytes), serde_json::Error>; + + fn next(&mut self) -> Option { + let chunk = self.inner.next()?; + + if self.scratch.is_empty() { + // first round: create events with N strings + self.scratch.extend( + chunk + .iter() + .zip(&mut self.idempotency_keys) + .map(|(raw_metric, key)| raw_metric.as_event(key)), + ); + } else { + // next rounds: update_in_place to reuse allocations + assert_eq!(self.scratch.len(), self.chunk_size); + itertools::izip!(self.scratch.iter_mut(), chunk, &mut self.idempotency_keys) + .for_each(|(slot, raw_metric, key)| raw_metric.update_in_place(slot, key)); + } + + // Serialize each event as NDJSON (one JSON object per line) + for event in self.scratch[..chunk.len()].iter() { + let res = serde_json::to_writer((&mut self.buffer).writer(), event); + if let Err(e) = res { + return Some(Err(e)); + } + // Add newline after each event to follow NDJSON format + self.buffer.put_u8(b'\n'); + } + + Some(Ok((chunk, self.buffer.split().freeze()))) + } + + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } + } + + impl ExactSizeIterator for Iter<'_> {} + + let buffer = bytes::BytesMut::new(); + let inner = input.chunks(chunk_size); + let idempotency_keys = idempotency_keys.iter(); + let 
scratch = Vec::new(); + + Iter { + inner, + idempotency_keys, + chunk_size, + buffer, + scratch, + } +} + trait RawMetricExt { fn as_event(&self, key: &IdempotencyKey<'_>) -> Event; fn update_in_place(&self, event: &mut Event, key: &IdempotencyKey<'_>); @@ -479,6 +559,43 @@ mod tests { } } + #[test] + fn chunked_serialization_ndjson() { + let examples = metric_samples(); + assert!(examples.len() > 1); + + let now = Utc::now(); + let idempotency_keys = (0..examples.len()) + .map(|i| FixedGen::new(now, "1", i as u16).generate()) + .collect::>(); + + // Parse NDJSON format - each line is a separate JSON object + let parse_ndjson = |body: &[u8]| -> Vec> { + let body_str = std::str::from_utf8(body).unwrap(); + body_str + .trim_end_matches('\n') + .lines() + .filter(|line| !line.is_empty()) + .map(|line| serde_json::from_str::>(line).unwrap()) + .collect() + }; + + let correct = serialize_in_chunks_ndjson(examples.len(), &examples, &idempotency_keys) + .map(|res| res.unwrap().1) + .flat_map(|body| parse_ndjson(&body)) + .collect::>(); + + for chunk_size in 1..examples.len() { + let actual = serialize_in_chunks_ndjson(chunk_size, &examples, &idempotency_keys) + .map(|res| res.unwrap().1) + .flat_map(|body| parse_ndjson(&body)) + .collect::>(); + + // if these are equal, it means that multi-chunking version works as well + assert_eq!(correct, actual); + } + } + #[derive(Clone, Copy)] struct FixedGen<'a>(chrono::DateTime, &'a str, u16); diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py index 474258c9eb..52c33687ae 100644 --- a/test_runner/regress/test_pageserver_metric_collection.py +++ b/test_runner/regress/test_pageserver_metric_collection.py @@ -180,7 +180,7 @@ def test_metric_collection( httpserver.check() # Check that at least one bucket output object is present, and that all - # can be decompressed and decoded. + # can be decompressed and decoded as NDJSON. bucket_dumps = {} assert isinstance(env.pageserver_remote_storage, LocalFsStorage) for dirpath, _dirs, files in os.walk(env.pageserver_remote_storage.root): @@ -188,7 +188,13 @@ def test_metric_collection( file_path = os.path.join(dirpath, file) log.info(file_path) if file.endswith(".gz"): - bucket_dumps[file_path] = json.load(gzip.open(file_path)) + events = [] + with gzip.open(file_path, "rt") as f: + for line in f: + line = line.strip() + if line: + events.append(json.loads(line)) + bucket_dumps[file_path] = {"events": events} assert len(bucket_dumps) >= 1 assert all("events" in data for data in bucket_dumps.values()) From 44121cc175e4493c69c13448686a178bb136b6cd Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Wed, 2 Jul 2025 19:16:00 +0200 Subject: [PATCH 14/22] docs(compute): RFC for compute rolling restart with prewarm (#11294) ## Problem Neon currently implements several features that guarantee high uptime of compute nodes: 1. Storage high-availability (HA), i.e. each tenant shard has a secondary pageserver location, so we can quickly switch over compute to it in case of primary pageserver failure. 2. Fast compute provisioning, i.e. we have a fleet of pre-created empty computes, that are ready to serve workload, so restarting unresponsive compute is very fast. 3. Preemptive NeonVM compute provisioning in case of k8s node unavailability. This helps us to be well-within the uptime SLO of 99.95% most of the time. Problems begin when we go up to multi-TB workloads and 32-64 CU computes. 
During restart, compute looses all caches: LFC, shared buffers, file system cache. Depending on the workload, it can take a lot of time to warm up the caches, so that performance could be degraded and might be even unacceptable for certain workloads. The latter means that although current approach works well for small to medium workloads, we still have to do some additional work to avoid performance degradation after restart of large instances. [Rendered version](https://github.com/neondatabase/neon/blob/alexk/pg-prewarm-rfc/docs/rfcs/2025-03-17-compute-prewarm.md) Part of https://github.com/neondatabase/cloud/issues/19011 --- docs/rfcs/2025-03-17-compute-prewarm.md | 399 ++++++++++++++++++++++++ 1 file changed, 399 insertions(+) create mode 100644 docs/rfcs/2025-03-17-compute-prewarm.md diff --git a/docs/rfcs/2025-03-17-compute-prewarm.md b/docs/rfcs/2025-03-17-compute-prewarm.md new file mode 100644 index 0000000000..6e95b9ac39 --- /dev/null +++ b/docs/rfcs/2025-03-17-compute-prewarm.md @@ -0,0 +1,399 @@ +# Compute rolling restart with prewarm + +Created on 2025-03-17 +Implemented on _TBD_ +Author: Alexey Kondratov (@ololobus) + +## Summary + +This RFC describes an approach to reduce performance degradation due to missing caches after compute node restart, i.e.: + +1. Rolling restart of the running instance via 'warm' replica. +2. Auto-prewarm compute caches after unplanned restart or scale-to-zero. + +## Motivation + +Neon currently implements several features that guarantee high uptime of compute nodes: + +1. Storage high-availability (HA), i.e. each tenant shard has a secondary pageserver location, so we can quickly switch over compute to it in case of primary pageserver failure. +2. Fast compute provisioning, i.e. we have a fleet of pre-created empty computes, that are ready to serve workload, so restarting unresponsive compute is very fast. +3. Preemptive NeonVM compute provisioning in case of k8s node unavailability. + +This helps us to be well-within the uptime SLO of 99.95% most of the time. Problems begin when we go up to multi-TB workloads and 32-64 CU computes. +During restart, compute loses all caches: LFC, shared buffers, file system cache. Depending on the workload, it can take a lot of time to warm up the caches, +so that performance could be degraded and might be even unacceptable for certain workloads. The latter means that although current approach works well for small to +medium workloads, we still have to do some additional work to avoid performance degradation after restart of large instances. + +## Non Goals + +- Details of the persistence storage for prewarm data are out of scope, there is a separate RFC for that: . +- Complete compute/Postgres HA setup and flow. Although it was originally in scope of this RFC, during preliminary research it appeared to be a rabbit hole, so it's worth of a separate RFC. +- Low-level implementation details for Postgres replica-to-primary promotion. There are a lot of things to think and care about: how to start walproposer, [logical replication failover](https://www.postgresql.org/docs/current/logical-replication-failover.html), and so on, but it's worth of at least a separate one-pager design document if not RFC. + +## Impacted components + +Postgres, compute_ctl, Control plane, Endpoint storage for unlogged storage of compute files. +For the latter, we will need to implement a uniform abstraction layer on top of S3, ABS, etc., but +S3 is used in text interchangeably with 'endpoint storage' for simplicity. 
+ +## Proposed implementation + +### compute_ctl spec changes and auto-prewarm + +We are going to extend the current compute spec with the following attributes + +```rust +struct ComputeSpec { + /// [All existing attributes] + ... + /// Whether to do auto-prewarm at start or not. + /// Default to `false`. + pub lfc_auto_prewarm: bool + /// Interval in seconds between automatic dumps of + /// LFC state into S3. Default `None`, which means 'off'. + pub lfc_dump_interval_sec: Option +} +``` + +When `lfc_dump_interval_sec` is set to `N`, `compute_ctl` will periodically dump the LFC state +and store it in S3, so that it could be used either for auto-prewarm after restart or by replica +during the rolling restart. For enabling periodic dumping, we should consider the following value +`lfc_dump_interval_sec=300` (5 minutes), same as in the upstream's `pg_prewarm.autoprewarm_interval`. + +When `lfc_auto_prewarm` is set to `true`, `compute_ctl` will start prewarming the LFC upon restart +iif some of the previous states is present in S3. + +### compute_ctl API + +1. `POST /store_lfc_state` -- dump LFC state using Postgres SQL interface and store result in S3. + This has to be a blocking call, i.e. it will return only after the state is stored in S3. + If there is any concurrent request in progress, we should return `429 Too Many Requests`, + and let the caller to retry. + +2. `GET /dump_lfc_state` -- dump LFC state using Postgres SQL interface and return it as is + in text format suitable for the future restore/prewarm. This API is not strictly needed at + the end state, but could be useful for a faster prototyping of a complete rolling restart flow + with prewarm, as it doesn't require persistent for LFC state storage. + +3. `POST /restore_lfc_state` -- restore/prewarm LFC state with request + + ```yaml + RestoreLFCStateRequest: + oneOf: + - type: object + required: + - lfc_state + properties: + lfc_state: + type: string + description: Raw LFC content dumped with GET `/dump_lfc_state` + - type: object + required: + - lfc_cache_key + properties: + lfc_cache_key: + type: string + description: | + endpoint_id of the source endpoint on the same branch + to use as a 'donor' for LFC content. Compute will look up + LFC content dump in S3 using this key and do prewarm. + ``` + + where `lfc_state` and `lfc_cache_key` are mutually exclusive. + + The actual prewarming will happen asynchronously, so the caller need to check the + prewarm status using the compute's standard `GET /status` API. + +4. `GET /status` -- extend existing API with following attributes + + ```rust + struct ComputeStatusResponse { + // [All existing attributes] + ... + pub prewarm_state: PrewarmState + } + + /// Compute prewarm state. Will be stored in the shared Compute state + /// in compute_ctl + struct PrewarmState { + pub status: PrewarmStatus + /// Total number of pages to prewarm + pub pages_total: i64 + /// Number of pages prewarmed so far + pub pages_processed: i64 + /// Optional prewarm error + pub error: Option + } + + pub enum PrewarmStatus { + /// Prewarming was never requested on this compute + Off, + /// Prewarming was requested, but not started yet + Pending, + /// Prewarming is in progress. The caller should follow + /// `PrewarmState::progress`. + InProgress, + /// Prewarming has been successfully completed + Completed, + /// Prewarming failed. The caller should look at + /// `PrewarmState::error` for the reason. 
+ Failed, + /// It is intended to be used by auto-prewarm if none of + /// the previous LFC states is available in S3. + /// This is a distinct state from the `Failed` because + /// technically it's not a failure and could happen if + /// compute was restart before it dumped anything into S3, + /// or just after the initial rollout of the feature. + Skipped, + } + ``` + +5. `POST /promote` -- this is a **blocking** API call to promote compute replica into primary. + This API should be very similar to the existing `POST /configure` API, i.e. accept the + spec (primary spec, because originally compute was started as replica). It's a distinct + API method because semantics and response codes are different: + + - If promotion is done successfully, it will return `200 OK`. + - If compute is already primary, the call will be no-op and `compute_ctl` + will return `412 Precondition Failed`. + - If, for some reason, second request reaches compute that is in progress of promotion, + it will respond with `429 Too Many Requests`. + - If compute hit any permanent failure during promotion `500 Internal Server Error` + will be returned. + +### Control plane operations + +The complete flow will be present as a sequence diagram in the next section, but here +we just want to list some important steps that have to be done by control plane during +the rolling restart via warm replica, but without much of low-level implementation details. + +1. Register the 'intent' of the instance restart, but not yet interrupt any workload at + primary and also accept new connections. This may require some endpoint state machine + changes, e.g. introduction of the `pending_restart` state. Being in this state also + **mustn't prevent any other operations except restart**: suspend, live-reconfiguration + (e.g. due to notify-attach call from the storage controller), deletion. + +2. Start new replica compute on the same timeline and start prewarming it. This process + may take quite a while, so the same concurrency considerations as in 1. should be applied + here as well. + +3. When warm replica is ready, control plane should: + + 3.1. Terminate the primary compute. Starting from here, **this is a critical section**, + if anything goes off, the only option is to start the primary normally and proceed + with auto-prewarm. + + 3.2. Send cache invalidation message to all proxies, notifying them that all new connections + should request and wait for the new connection details. At this stage, proxy has to also + drop any existing connections to the old primary, so they didn't do stale reads. + + 3.3. Attach warm replica compute to the primary endpoint inside control plane metadata + database. + + 3.4. Promote replica to primary. + + 3.5. When everything is done, finalize the endpoint state to be just `active`. + +### Complete rolling restart flow + +```mermaid + sequenceDiagram + + autonumber + + participant proxy as Neon proxy + + participant cplane as Control plane + + participant primary as Compute (primary) + box Compute (replica) + participant ctl as compute_ctl + participant pg as Postgres + end + + box Endpoint unlogged storage + participant s3proxy as Endpoint storage service + participant s3 as S3/ABS/etc. 
+ end + + + cplane ->> primary: POST /store_lfc_state + primary -->> cplane: 200 OK + + cplane ->> ctl: POST /restore_lfc_state + activate ctl + ctl -->> cplane: 202 Accepted + + activate cplane + cplane ->> ctl: GET /status: poll prewarm status + ctl ->> s3proxy: GET /read_file + s3proxy ->> s3: read file + s3 -->> s3proxy: file content + s3proxy -->> ctl: 200 OK: file content + + proxy ->> cplane: GET /proxy_wake_compute + cplane -->> proxy: 200 OK: old primary conninfo + + ctl ->> pg: prewarm LFC + activate pg + pg -->> ctl: prewarm is completed + deactivate pg + + ctl -->> cplane: 200 OK: prewarm is completed + deactivate ctl + deactivate cplane + + cplane -->> cplane: reassign replica compute to endpoint,
start terminating the old primary compute + activate cplane + cplane ->> proxy: invalidate caches + + proxy ->> cplane: GET /proxy_wake_compute + + cplane -x primary: POST /terminate + primary -->> cplane: 200 OK + note over primary: old primary
compute terminated + + cplane ->> ctl: POST /promote + activate ctl + ctl ->> pg: pg_ctl promote + activate pg + pg -->> ctl: done + deactivate pg + ctl -->> cplane: 200 OK + deactivate ctl + + cplane -->> cplane: finalize operation + cplane -->> proxy: 200 OK: new primary conninfo + deactivate cplane +``` + +### Network bandwidth and prewarm speed + +It's currently known that pageserver can sustain about 3000 RPS per shard for a few running computes. +Large tenants are usually split into 8 shards, so the final formula may look like this: + +```text +8 shards * 3000 RPS * 8 KB =~ 190 MB/s +``` + +so depending on the LFC size, prewarming will take at least: + +- ~5s for 1 GB +- ~50s for 10 GB +- ~5m for 100 GB +- \>1h for 1 TB + +In total, one pageserver is normally capped by 30k RPS, so it obviously can't sustain many computes +doing prewarm at the same time. Later, we may need an additional mechanism for computes to throttle +the prewarming requests gracefully. + +### Reliability, failure modes and corner cases + +We consider following failures while implementing this RFC: + +1. Compute got interrupted/crashed/restarted during prewarm. The caller -- control plane -- should + detect that and start prewarm from the beginning. + +2. Control plane promotion request timed out or hit network issues. If it never reached the + compute, control plane should just repeat it. If it did reach the compute, then during + retry control plane can hit `409` as previous request triggered the promotion already. + In this case, control plane need to retry until either `200` or + permanent error `500` is returned. + +3. Compute got interrupted/crashed/restarted during promotion. At restart it will ask for + a spec from control plane, and its content should signal compute to start as **primary**, + so it's expected that control plane will continue polling for certain period of time and + will discover that compute is ready to accept connections if restart is fast enough. + +4. Any other unexpected failure or timeout during prewarming. This **failure mustn't be fatal**, + control plane has to report failure, terminate replica and keep primary running. + +5. Any other unexpected failure or timeout during promotion. Unfortunately, at this moment + we already have the primary node stopped, so the only option is to start primary again + and proceed with auto-prewarm. + +6. Any unexpected failure during auto-prewarm. This **failure mustn't be fatal**, + `compute_ctl` has to report the failure, but do not crash the compute. + +7. Control plane failed to confirm that old primary has terminated. This can happen, especially + in the future HA setup. In this case, control plane has to ensure that it sent VM deletion + and pod termination requests to k8s, so long-term we do not have two running primaries + on the same timeline. + +### Security implications + +There are two security implications to consider: + +1. Access to `compute_ctl` API. It has to be accessible from the outside of compute, so all + new API methods have to be exposed on the **external** HTTP port and **must** be authenticated + with JWT. + +2. Read/write only your own LFC state data in S3. Although it's not really a security concern, + since LFC state is just a mapping of blocks present in LFC at certain moment in time; + it still has to be highly restricted, so that i) only computes on the same timeline can + read S3 state; ii) each compute can only write to the path that contains it's `endpoint_id`. 
+ Both of this must be validated by Endpoint storage service using the JWT token provided by `compute_ctl`. + +### Unresolved questions + +#### Billing, metrics and monitoring + +Currently, we only label computes with `endpoint_id` after attaching them to the endpoint. +In this proposal, this means that temporary replica will remain unlabelled until it's promoted +to primary. We can also hide it from users in the control plane API, but what to do with +billing and monitoring is still unclear. + +We can probably mark it as 'billable' and tag with `project_id`, so it will be billed, but +not interfere in any way with the current primary monitoring. + +Another thing to consider is how logs and metrics export will switch to the new compute. +It's expected that OpenTelemetry collector will auto-discover the new compute and start +scraping metrics from it. + +#### Auto-prewarm + +It's still an open question whether we need auto-prewarm at all. The author's gut-feeling is +that yes, we need it, but might be not for all workloads, so it could end up exposed as a +user-controllable knob on the endpoint. There are two arguments for that: + +1. Auto-prewarm existing in upstream's `pg_prewarm`, _probably for a reason_. + +2. There are still could be 2 flows when we cannot perform the rolling restart via the warm + replica: i) any failure or interruption during promotion; ii) wake up after scale-to-zero. + The latter might be challenged as well, i.e. one can argue that auto-prewarm may and will + compete with user-workload for storage resources. This is correct, but it might as well + reduce the time to get warm LFC and good performance. + +#### Low-level details of the replica promotion + +There are many things to consider here, but three items just off the top of my head: + +1. How to properly start the `walproposer` inside Postgres. + +2. What to do with logical replication. Currently, we do not include logical replication slots + inside basebackup, because nobody advances them at replica, so they just prevent the WAL + deletion. Yet, we do need to have them at primary after promotion. Starting with Postgres 17, + there is a new feature called + [logical replication failover](https://www.postgresql.org/docs/current/logical-replication-failover.html) + and `synchronized_standby_slots` setting, but we need a plan for the older versions. Should we + request a new basebackup during promotion? + +3. How do we guarantee that replica will receive all the latest WAL from safekeepers? Do some + 'shallow' version of sync safekeepers without data copying? Or just a standard version of + sync safekeepers? + +## Alternative implementation + +The proposal already assumes one of the alternatives -- do not have any persistent storage for +LFC state. This is possible to implement faster with the proposed API, but it means that +we do not implement auto-prewarm yet. + +## Definition of Done + +At the end of implementing this RFC we should have two high-level settings that enable: + +1. Auto-prewarm of user computes upon restart. +2. Perform primary compute restart via the warm replica promotion. + +It also has to be decided what's the criteria for enabling one or both of these flows for +certain clients. From a9a51c038b3bb0413312d374774ca1200c2e1052 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." 
<4198311+skyzh@users.noreply.github.com> Date: Wed, 2 Jul 2025 10:41:36 -0700 Subject: [PATCH 15/22] rfc: storage feature flags (#11805) ## Problem Part of https://github.com/neondatabase/neon/issues/11813 ## Summary of changes --------- Signed-off-by: Alex Chi Z --- docs/rfcs/044-feature-flag.md | 179 ++++++++++++++++++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 docs/rfcs/044-feature-flag.md diff --git a/docs/rfcs/044-feature-flag.md b/docs/rfcs/044-feature-flag.md new file mode 100644 index 0000000000..3a0fe91a13 --- /dev/null +++ b/docs/rfcs/044-feature-flag.md @@ -0,0 +1,179 @@ +# Storage Feature Flags + +In this RFC, we will describe how we will implement per-tenant feature flags. + +## PostHog as Feature Flag Service + +Before we start, let's talk about how current feature flag services work. PostHog is the feature flag service we are currently using across multiple user-facing components in the company. PostHog has two modes of operation: HTTP evaluation and server-side local evaluation. + +Let's assume we have a storage feature flag called gc-compaction and we want to roll it out to scale-tier users with resident size >= 10GB and <= 100GB. + +### Define User Profiles + +The first step is to synchronize our user profiles to the PostHog service. We can simply assume that each tenant is a user in PostHog. Each user profile has some properties associated with it. In our case, it will be: plan type (free, scale, enterprise, etc); resident size (in bytes); primary pageserver (string); region (string). + +### Define Feature Flags + +We would create a feature flag called gc-compaction in PostHog with 4 variants: disabled, stage-1, stage-2, fully-enabled. We will flip the feature flags from disabled to fully-enabled stage by stage for some percentage of our users. + +### Option 1: HTTP Evaluation Mode + +When using PostHog's HTTP evaluation mode, the client will make request to the PostHog service, asking for the value of a feature flag for a specific user. + +* Control plane will report the plan type to PostHog each time it attaches a tenant to the storcon or when the user upgrades/downgrades. It calls the PostHog profile API to associate tenant ID with the plan type. Assume we have X active tenants and such attach or plan change event happens each week, that would be 4X profile update requests per month. +* Pageservers will report the resident size and the primary pageserver to the PostHog service. Assume we report resident size every 24 hours, that would be 30X requests per month. +* Each tenant will request the state of the feature flag every 1 hour, that's 720X requests per month. +* The Rust client would be easy to implement as we only need to call the `/decide` API on PostHog. + +Using the HTTP evaluation mode we will issue 754X requests a month. + +### Option 2: Local Evaluation Mode + +When using PostHog's HTTP evaluation mode, the client (usually the server in a browser/server architecture) will poll the feature flag configuration every 30s (default in the Python client) from PostHog. Such configuration contains data like: + +
+ +Example JSON response from the PostHog local evaluation API + +``` +[ + { + "id": 1, + "name": "Beta Feature", + "key": "person-flag", + "is_simple_flag": True, + "active": True, + "filters": { + "groups": [ + { + "properties": [ + { + "key": "location", + "operator": "exact", + "value": ["Straße"], + "type": "person", + } + ], + "rollout_percentage": 100, + }, + { + "properties": [ + { + "key": "star", + "operator": "exact", + "value": ["ſun"], + "type": "person", + } + ], + "rollout_percentage": 100, + }, + ], + }, + } +] +``` + +
+ +Note that the API only contains information like "under what condition => rollout percentage". The user is responsible to provide the properties required to the client for local evaluation, and the PostHog service (web UI) cannot know if a feature is enabled for the tenant or not until the client uses the `capture` API to report the result back. To control the rollout percentage, the user ID gets mapped to a float number in `[0, 1)` on a consistent hash ring. All values <= the percentage will get the feature enabled or set to the desired value. + +To use the local evaluation mode, the system needs: + +* Assume each pageserver will poll PostHog for the local evaluation JSON every 5 minutes (instead of the 30s default as it's too frequent). That's 8640Y per month, Y is the number of pageservers. Local evaluation requests cost 10x more than the normal decide request, so that's 86400Y request units to bill. +* Storcon needs to store the plan type in the database and pass that information to the pageserver when attaching the tenant. +* Storcon also needs to update PostHog with the active tenants, for example, when the tenant gets detached/attached. Assume each active tenant gets detached/attached every week, that would be 4X requests per month. +* We do not need to update bill type or resident size to PostHog as all these are evaluated locally. +* After each local evaluation of the feature flag, we need to call PostHog's capture event API to update the result of the evaluation that the feature is enabled. We can do this when the flag gets changed compared with the last cached state in memory. That would be at least 4X (assume we do deployment every week so the cache gets cleared) and maybe an additional multiplifier of 10 assume we have 10 active features. + +In this case, we will issue 86400Y + 40X requests per month. + +Assume X = 1,000,000 and Y = 100, + +| | HTTP Evaluation | Local Evaluation | +|---|---|---| +| Latency of propagating the conditions/properties for feature flag | 24 hours | available locally | +| Latency of applying the feature flag | 1 hour | 5 minutes | +| Can properties be reported from different services | Yes | No | +| Do we need to sync billing info etc to pageserver | No | Yes | +| Cost | 75400$ / month | 4864$ / month | + +# Our Solution + +We will use PostHog _only_ as an UI to configure the feature flags. Whether a feature is enabled or not can only be queried through storcon/pageserver instead of using the PostHog UI. (We could report it back to PostHog via `capture_event` but it costs $$$.) This allows us to ramp up the feature flag functionality fast at first. At the same time, it would also give us the option to migrate to our own solution once we want to have more properties and more complex evaluation rules in our system. + +* We will create several fake users (tenants) in PostHog that contains all the properties we will use for evaluating a feature flag (i.e., resident size, billing type, pageserver id, etc.) +* We will use PostHog's local evaluation API to poll the configuration of the feature flags and evaluate them locally on each of the pageserver. +* The evaluation result will not be reported back to PostHog. +* Storcon needs to pull some information from cplane database. +* To know if a feature is currently enabled or not, we need to call the storcon/pageserver API; and we won't be able to know if a feature has been enabled on a tenant before easily: we need to look at the Grafana logs. 
+ +We only need to pay for the 86400Y local evaluation requests (that would be setting Y=0 in solution 2 => $864/month, and even less if we proxy it through storcon). + +## Implementation + +* Pageserver: implement a PostHog local evaluation client. The client will be shared across all tenants on the pageserver with a single API: `evaluate(tenant_id, feature_flag, properties) -> json`. +* Storcon: if we need plan type as the evaluation condition, pull it from cplane database. +* Storcon/Pageserver: implement an HTTP API `:tenant_id/feature/:feature` to retrieve the current feature flag status. +* Storcon/Pageserver: a loop to update the feature flag spec on both storcon and pageserver. Pageserver loop will only be activated if storcon does not push the specs to the pageserver. + +## Difference from Tenant Config + +* Feature flags can be modified by percentage, and the default config for each feature flag can be modified in UI without going through the release process. +* Feature flags are more flexible and won't be persisted anywhere and will be passed as plain JSON over the wire so that do not need to handle backward/forward compatibility as in tenant config. +* The expectation of tenant config is that once we add a flag we cannot remove it (or it will be hard to remove), but feature flags are more flexible. + +# Final Implementation + +* We added a new crate `posthog_lite_client` that supports local feature evaluations. +* We set up two projects "Storage (staging)" and "Storage (production)" in the PostHog console. +* Each pageserver reports 10 fake tenants to PostHog so that we can get all combinations of regions (and other properties) in the PostHog UI. +* Supported properties: AZ, neon_region, pageserver, tenant_id. +* You may use "Pageserver Feature Flags" dashboard to see the evaluation status. +* The feature flag spec is polled on storcon every 30s (in each of the region) and storcon will propagate the spec to the pageservers. +* The pageserver housekeeping loop updates the tenant-specific properties (e.g., remote size) for evaluation. + +Each tenant has a `feature_resolver` object. After you add a feature flag in the PostHog console, you can retrieve it with: + +```rust +// Boolean flag +self + .feature_resolver + .evaluate_boolean("flag") + .is_ok() +// Multivariate flag +self + .feature_resolver + .evaluate_multivariate("gc-comapction-strategy") + .ok(); +``` + +The user needs to handle the case where the evaluation result is an error. This can occur in a variety of cases: + +* During the pageserver start, the feature flag spec has not been retrieved. +* No condition group is matched. +* The feature flag spec contains an operand/operation not supported by the lite PostHog library. + +For boolean flags, the return value is `Result<(), Error>`. `Ok(())` means the flag is evaluated to true. Otherwise, +there is either an error in evaluation or it does not match any groups. + +For multivariate flags, the return value is `Result`. `Ok(variant)` indicates the flag is evaluated +to a variant. Otherwise, there is either an error in evaluation or it does not match any groups. + +The evaluation logic is documented in the PostHog lite library. It compares the consistent hash of a flag key + tenant_id +with the rollout percentage and determines which tenant to roll out a specific feature. + +Users can use the feature flag evaluation API to get the flag evaluation result of a specific tenant for debugging purposes. 
+ +``` +curl http://localhost:9898/v1/tenant/:tenant_id/feature_flag?flag=:key&as=multivariate/boolean" +``` + +By default, the storcon pushes the feature flag specs to the pageservers every 30 seconds, which means that a change in feature flag in the +PostHog UI will propagate to the pageservers within 30 seconds. + +# Future Works + +* Support dynamic tenant properties like logical size as the evaluation condition. +* Support properties like `plan_type` (needs cplane to pass it down). +* Report feature flag evaluation result back to PostHog (if the cost is okay). +* Fast feature flag evaluation cache on critical paths (e.g., cache a feature flag result in `AtomicBool` and use it on the read path). \ No newline at end of file From 8b4fbefc2945c930a1778fb3ae2a7602585352fe Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 2 Jul 2025 14:54:47 -0500 Subject: [PATCH 16/22] Patch pgaudit to disable logging in parallel workers (#12325) We want to turn logging in parallel workers off to reduce log amplification in queries which use parallel workers. Part-of: https://github.com/neondatabase/cloud/issues/28483 Signed-off-by: Tristan Partin --- compute/compute-node.Dockerfile | 4 +- .../pgaudit-parallel_workers-v14.patch | 143 ++++++++++++++++++ .../pgaudit-parallel_workers-v15.patch | 143 ++++++++++++++++++ .../pgaudit-parallel_workers-v16.patch | 143 ++++++++++++++++++ .../pgaudit-parallel_workers-v17.patch | 143 ++++++++++++++++++ 5 files changed, 575 insertions(+), 1 deletion(-) create mode 100644 compute/patches/pgaudit-parallel_workers-v14.patch create mode 100644 compute/patches/pgaudit-parallel_workers-v15.patch create mode 100644 compute/patches/pgaudit-parallel_workers-v16.patch create mode 100644 compute/patches/pgaudit-parallel_workers-v17.patch diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 111e64d5d1..9f4e3e7d5e 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1572,6 +1572,7 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) && \ FROM build-deps AS pgaudit-src ARG PG_VERSION WORKDIR /ext-src +COPY "compute/patches/pgaudit-parallel_workers-${PG_VERSION}.patch" . RUN case "${PG_VERSION}" in \ "v14") \ export PGAUDIT_VERSION=1.6.3 \ @@ -1594,7 +1595,8 @@ RUN case "${PG_VERSION}" in \ esac && \ wget https://github.com/pgaudit/pgaudit/archive/refs/tags/${PGAUDIT_VERSION}.tar.gz -O pgaudit.tar.gz && \ echo "${PGAUDIT_CHECKSUM} pgaudit.tar.gz" | sha256sum --check && \ - mkdir pgaudit-src && cd pgaudit-src && tar xzf ../pgaudit.tar.gz --strip-components=1 -C . + mkdir pgaudit-src && cd pgaudit-src && tar xzf ../pgaudit.tar.gz --strip-components=1 -C . && \ + patch -p1 < "/ext-src/pgaudit-parallel_workers-${PG_VERSION}.patch" FROM pg-build AS pgaudit-build COPY --from=pgaudit-src /ext-src/ /ext-src/ diff --git a/compute/patches/pgaudit-parallel_workers-v14.patch b/compute/patches/pgaudit-parallel_workers-v14.patch new file mode 100644 index 0000000000..5517d3105b --- /dev/null +++ b/compute/patches/pgaudit-parallel_workers-v14.patch @@ -0,0 +1,143 @@ +commit 7220bb3a3f23fa27207d77562dcc286f9a123313 +Author: Tristan Partin +Date: 2025-06-23 02:09:31 +0000 + + Disable logging in parallel workers + + When a query uses parallel workers, pgaudit will log the same query for + every parallel worker. This is undesireable since it can result in log + amplification for queries that use parallel workers. 
+ + Signed-off-by: Tristan Partin + +diff --git a/expected/pgaudit.out b/expected/pgaudit.out +index baa8011..a601375 100644 +--- a/expected/pgaudit.out ++++ b/expected/pgaudit.out +@@ -2563,6 +2563,37 @@ COMMIT; + NOTICE: AUDIT: SESSION,12,4,MISC,COMMIT,,,COMMIT;, + DROP TABLE part_test; + NOTICE: AUDIT: SESSION,13,1,DDL,DROP TABLE,,,DROP TABLE part_test;, ++-- ++-- Test logging in parallel workers ++SET pgaudit.log = 'read'; ++SET pgaudit.log_client = on; ++SET pgaudit.log_level = 'notice'; ++-- Force parallel execution for testing ++SET max_parallel_workers_per_gather = 2; ++SET parallel_tuple_cost = 0; ++SET parallel_setup_cost = 0; ++SET min_parallel_table_scan_size = 0; ++SET min_parallel_index_scan_size = 0; ++-- Create table with enough data to trigger parallel execution ++CREATE TABLE parallel_test (id int, data text); ++INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data'; ++SELECT count(*) FROM parallel_test; ++NOTICE: AUDIT: SESSION,14,1,READ,SELECT,,,SELECT count(*) FROM parallel_test;, ++ count ++------- ++ 1000 ++(1 row) ++ ++-- Cleanup parallel test ++DROP TABLE parallel_test; ++RESET max_parallel_workers_per_gather; ++RESET parallel_tuple_cost; ++RESET parallel_setup_cost; ++RESET min_parallel_table_scan_size; ++RESET min_parallel_index_scan_size; ++RESET pgaudit.log; ++RESET pgaudit.log_client; ++RESET pgaudit.log_level; + -- Cleanup + -- Set client_min_messages up to warning to avoid noise + SET client_min_messages = 'warning'; +diff --git a/pgaudit.c b/pgaudit.c +index 5e6fd38..ac9ded2 100644 +--- a/pgaudit.c ++++ b/pgaudit.c +@@ -11,6 +11,7 @@ + #include "postgres.h" + + #include "access/htup_details.h" ++#include "access/parallel.h" + #include "access/sysattr.h" + #include "access/xact.h" + #include "access/relation.h" +@@ -1303,7 +1304,7 @@ pgaudit_ExecutorStart_hook(QueryDesc *queryDesc, int eflags) + { + AuditEventStackItem *stackItem = NULL; + +- if (!internalStatement) ++ if (!internalStatement && !IsParallelWorker()) + { + /* Push the audit even onto the stack */ + stackItem = stack_push(); +@@ -1384,7 +1385,7 @@ pgaudit_ExecutorCheckPerms_hook(List *rangeTabls, bool abort) + + /* Log DML if the audit role is valid or session logging is enabled */ + if ((auditOid != InvalidOid || auditLogBitmap != 0) && +- !IsAbortedTransactionBlockState()) ++ !IsAbortedTransactionBlockState() && !IsParallelWorker()) + { + /* If auditLogRows is on, wait for rows processed to be set */ + if (auditLogRows && auditEventStack != NULL) +@@ -1438,7 +1439,7 @@ pgaudit_ExecutorRun_hook(QueryDesc *queryDesc, ScanDirection direction, uint64 c + else + standard_ExecutorRun(queryDesc, direction, count, execute_once); + +- if (auditLogRows && !internalStatement) ++ if (auditLogRows && !internalStatement && !IsParallelWorker()) + { + /* Find an item from the stack by the query memory context */ + stackItem = stack_find_context(queryDesc->estate->es_query_cxt); +@@ -1458,7 +1459,7 @@ pgaudit_ExecutorEnd_hook(QueryDesc *queryDesc) + AuditEventStackItem *stackItem = NULL; + AuditEventStackItem *auditEventStackFull = NULL; + +- if (auditLogRows && !internalStatement) ++ if (auditLogRows && !internalStatement && !IsParallelWorker()) + { + /* Find an item from the stack by the query memory context */ + stackItem = stack_find_context(queryDesc->estate->es_query_cxt); +diff --git a/sql/pgaudit.sql b/sql/pgaudit.sql +index cc1374a..1870a60 100644 +--- a/sql/pgaudit.sql ++++ b/sql/pgaudit.sql +@@ -1612,6 +1612,36 @@ COMMIT; + + DROP TABLE part_test; + ++-- ++-- Test logging in 
parallel workers ++SET pgaudit.log = 'read'; ++SET pgaudit.log_client = on; ++SET pgaudit.log_level = 'notice'; ++ ++-- Force parallel execution for testing ++SET max_parallel_workers_per_gather = 2; ++SET parallel_tuple_cost = 0; ++SET parallel_setup_cost = 0; ++SET min_parallel_table_scan_size = 0; ++SET min_parallel_index_scan_size = 0; ++ ++-- Create table with enough data to trigger parallel execution ++CREATE TABLE parallel_test (id int, data text); ++INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data'; ++ ++SELECT count(*) FROM parallel_test; ++ ++-- Cleanup parallel test ++DROP TABLE parallel_test; ++RESET max_parallel_workers_per_gather; ++RESET parallel_tuple_cost; ++RESET parallel_setup_cost; ++RESET min_parallel_table_scan_size; ++RESET min_parallel_index_scan_size; ++RESET pgaudit.log; ++RESET pgaudit.log_client; ++RESET pgaudit.log_level; ++ + -- Cleanup + -- Set client_min_messages up to warning to avoid noise + SET client_min_messages = 'warning'; diff --git a/compute/patches/pgaudit-parallel_workers-v15.patch b/compute/patches/pgaudit-parallel_workers-v15.patch new file mode 100644 index 0000000000..6dfffbd0dd --- /dev/null +++ b/compute/patches/pgaudit-parallel_workers-v15.patch @@ -0,0 +1,143 @@ +commit 29dc2847f6255541992f18faf8a815dfab79631a +Author: Tristan Partin +Date: 2025-06-23 02:09:31 +0000 + + Disable logging in parallel workers + + When a query uses parallel workers, pgaudit will log the same query for + every parallel worker. This is undesireable since it can result in log + amplification for queries that use parallel workers. + + Signed-off-by: Tristan Partin + +diff --git a/expected/pgaudit.out b/expected/pgaudit.out +index b22560b..73f0327 100644 +--- a/expected/pgaudit.out ++++ b/expected/pgaudit.out +@@ -2563,6 +2563,37 @@ COMMIT; + NOTICE: AUDIT: SESSION,12,4,MISC,COMMIT,,,COMMIT;, + DROP TABLE part_test; + NOTICE: AUDIT: SESSION,13,1,DDL,DROP TABLE,,,DROP TABLE part_test;, ++-- ++-- Test logging in parallel workers ++SET pgaudit.log = 'read'; ++SET pgaudit.log_client = on; ++SET pgaudit.log_level = 'notice'; ++-- Force parallel execution for testing ++SET max_parallel_workers_per_gather = 2; ++SET parallel_tuple_cost = 0; ++SET parallel_setup_cost = 0; ++SET min_parallel_table_scan_size = 0; ++SET min_parallel_index_scan_size = 0; ++-- Create table with enough data to trigger parallel execution ++CREATE TABLE parallel_test (id int, data text); ++INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data'; ++SELECT count(*) FROM parallel_test; ++NOTICE: AUDIT: SESSION,14,1,READ,SELECT,,,SELECT count(*) FROM parallel_test;, ++ count ++------- ++ 1000 ++(1 row) ++ ++-- Cleanup parallel test ++DROP TABLE parallel_test; ++RESET max_parallel_workers_per_gather; ++RESET parallel_tuple_cost; ++RESET parallel_setup_cost; ++RESET min_parallel_table_scan_size; ++RESET min_parallel_index_scan_size; ++RESET pgaudit.log; ++RESET pgaudit.log_client; ++RESET pgaudit.log_level; + -- Cleanup + -- Set client_min_messages up to warning to avoid noise + SET client_min_messages = 'warning'; +diff --git a/pgaudit.c b/pgaudit.c +index 5e6fd38..ac9ded2 100644 +--- a/pgaudit.c ++++ b/pgaudit.c +@@ -11,6 +11,7 @@ + #include "postgres.h" + + #include "access/htup_details.h" ++#include "access/parallel.h" + #include "access/sysattr.h" + #include "access/xact.h" + #include "access/relation.h" +@@ -1303,7 +1304,7 @@ pgaudit_ExecutorStart_hook(QueryDesc *queryDesc, int eflags) + { + AuditEventStackItem *stackItem = NULL; + +- if (!internalStatement) ++ 
if (!internalStatement && !IsParallelWorker()) + { + /* Push the audit even onto the stack */ + stackItem = stack_push(); +@@ -1384,7 +1385,7 @@ pgaudit_ExecutorCheckPerms_hook(List *rangeTabls, bool abort) + + /* Log DML if the audit role is valid or session logging is enabled */ + if ((auditOid != InvalidOid || auditLogBitmap != 0) && +- !IsAbortedTransactionBlockState()) ++ !IsAbortedTransactionBlockState() && !IsParallelWorker()) + { + /* If auditLogRows is on, wait for rows processed to be set */ + if (auditLogRows && auditEventStack != NULL) +@@ -1438,7 +1439,7 @@ pgaudit_ExecutorRun_hook(QueryDesc *queryDesc, ScanDirection direction, uint64 c + else + standard_ExecutorRun(queryDesc, direction, count, execute_once); + +- if (auditLogRows && !internalStatement) ++ if (auditLogRows && !internalStatement && !IsParallelWorker()) + { + /* Find an item from the stack by the query memory context */ + stackItem = stack_find_context(queryDesc->estate->es_query_cxt); +@@ -1458,7 +1459,7 @@ pgaudit_ExecutorEnd_hook(QueryDesc *queryDesc) + AuditEventStackItem *stackItem = NULL; + AuditEventStackItem *auditEventStackFull = NULL; + +- if (auditLogRows && !internalStatement) ++ if (auditLogRows && !internalStatement && !IsParallelWorker()) + { + /* Find an item from the stack by the query memory context */ + stackItem = stack_find_context(queryDesc->estate->es_query_cxt); +diff --git a/sql/pgaudit.sql b/sql/pgaudit.sql +index 8052426..7f0667b 100644 +--- a/sql/pgaudit.sql ++++ b/sql/pgaudit.sql +@@ -1612,6 +1612,36 @@ COMMIT; + + DROP TABLE part_test; + ++-- ++-- Test logging in parallel workers ++SET pgaudit.log = 'read'; ++SET pgaudit.log_client = on; ++SET pgaudit.log_level = 'notice'; ++ ++-- Force parallel execution for testing ++SET max_parallel_workers_per_gather = 2; ++SET parallel_tuple_cost = 0; ++SET parallel_setup_cost = 0; ++SET min_parallel_table_scan_size = 0; ++SET min_parallel_index_scan_size = 0; ++ ++-- Create table with enough data to trigger parallel execution ++CREATE TABLE parallel_test (id int, data text); ++INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data'; ++ ++SELECT count(*) FROM parallel_test; ++ ++-- Cleanup parallel test ++DROP TABLE parallel_test; ++RESET max_parallel_workers_per_gather; ++RESET parallel_tuple_cost; ++RESET parallel_setup_cost; ++RESET min_parallel_table_scan_size; ++RESET min_parallel_index_scan_size; ++RESET pgaudit.log; ++RESET pgaudit.log_client; ++RESET pgaudit.log_level; ++ + -- Cleanup + -- Set client_min_messages up to warning to avoid noise + SET client_min_messages = 'warning'; diff --git a/compute/patches/pgaudit-parallel_workers-v16.patch b/compute/patches/pgaudit-parallel_workers-v16.patch new file mode 100644 index 0000000000..6b8b276b7b --- /dev/null +++ b/compute/patches/pgaudit-parallel_workers-v16.patch @@ -0,0 +1,143 @@ +commit cc708dde7ef2af2a8120d757102d2e34c0463a0f +Author: Tristan Partin +Date: 2025-06-23 02:09:31 +0000 + + Disable logging in parallel workers + + When a query uses parallel workers, pgaudit will log the same query for + every parallel worker. This is undesireable since it can result in log + amplification for queries that use parallel workers. 
+ + Signed-off-by: Tristan Partin + +diff --git a/expected/pgaudit.out b/expected/pgaudit.out +index 8772054..9b66ac6 100644 +--- a/expected/pgaudit.out ++++ b/expected/pgaudit.out +@@ -2556,6 +2556,37 @@ DROP SERVER fdw_server; + NOTICE: AUDIT: SESSION,11,1,DDL,DROP SERVER,,,DROP SERVER fdw_server;, + DROP EXTENSION postgres_fdw; + NOTICE: AUDIT: SESSION,12,1,DDL,DROP EXTENSION,,,DROP EXTENSION postgres_fdw;, ++-- ++-- Test logging in parallel workers ++SET pgaudit.log = 'read'; ++SET pgaudit.log_client = on; ++SET pgaudit.log_level = 'notice'; ++-- Force parallel execution for testing ++SET max_parallel_workers_per_gather = 2; ++SET parallel_tuple_cost = 0; ++SET parallel_setup_cost = 0; ++SET min_parallel_table_scan_size = 0; ++SET min_parallel_index_scan_size = 0; ++-- Create table with enough data to trigger parallel execution ++CREATE TABLE parallel_test (id int, data text); ++INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data'; ++SELECT count(*) FROM parallel_test; ++NOTICE: AUDIT: SESSION,13,1,READ,SELECT,,,SELECT count(*) FROM parallel_test;, ++ count ++------- ++ 1000 ++(1 row) ++ ++-- Cleanup parallel test ++DROP TABLE parallel_test; ++RESET max_parallel_workers_per_gather; ++RESET parallel_tuple_cost; ++RESET parallel_setup_cost; ++RESET min_parallel_table_scan_size; ++RESET min_parallel_index_scan_size; ++RESET pgaudit.log; ++RESET pgaudit.log_client; ++RESET pgaudit.log_level; + -- Cleanup + -- Set client_min_messages up to warning to avoid noise + SET client_min_messages = 'warning'; +diff --git a/pgaudit.c b/pgaudit.c +index 004d1f9..f061164 100644 +--- a/pgaudit.c ++++ b/pgaudit.c +@@ -11,6 +11,7 @@ + #include "postgres.h" + + #include "access/htup_details.h" ++#include "access/parallel.h" + #include "access/sysattr.h" + #include "access/xact.h" + #include "access/relation.h" +@@ -1339,7 +1340,7 @@ pgaudit_ExecutorStart_hook(QueryDesc *queryDesc, int eflags) + { + AuditEventStackItem *stackItem = NULL; + +- if (!internalStatement) ++ if (!internalStatement && !IsParallelWorker()) + { + /* Push the audit even onto the stack */ + stackItem = stack_push(); +@@ -1420,7 +1421,7 @@ pgaudit_ExecutorCheckPerms_hook(List *rangeTabls, List *permInfos, bool abort) + + /* Log DML if the audit role is valid or session logging is enabled */ + if ((auditOid != InvalidOid || auditLogBitmap != 0) && +- !IsAbortedTransactionBlockState()) ++ !IsAbortedTransactionBlockState() && !IsParallelWorker()) + { + /* If auditLogRows is on, wait for rows processed to be set */ + if (auditLogRows && auditEventStack != NULL) +@@ -1475,7 +1476,7 @@ pgaudit_ExecutorRun_hook(QueryDesc *queryDesc, ScanDirection direction, uint64 c + else + standard_ExecutorRun(queryDesc, direction, count, execute_once); + +- if (auditLogRows && !internalStatement) ++ if (auditLogRows && !internalStatement && !IsParallelWorker()) + { + /* Find an item from the stack by the query memory context */ + stackItem = stack_find_context(queryDesc->estate->es_query_cxt); +@@ -1495,7 +1496,7 @@ pgaudit_ExecutorEnd_hook(QueryDesc *queryDesc) + AuditEventStackItem *stackItem = NULL; + AuditEventStackItem *auditEventStackFull = NULL; + +- if (auditLogRows && !internalStatement) ++ if (auditLogRows && !internalStatement && !IsParallelWorker()) + { + /* Find an item from the stack by the query memory context */ + stackItem = stack_find_context(queryDesc->estate->es_query_cxt); +diff --git a/sql/pgaudit.sql b/sql/pgaudit.sql +index 6aae88b..de6d7fd 100644 +--- a/sql/pgaudit.sql ++++ b/sql/pgaudit.sql +@@ -1631,6 +1631,36 
@@ DROP USER MAPPING FOR regress_user1 SERVER fdw_server; + DROP SERVER fdw_server; + DROP EXTENSION postgres_fdw; + ++-- ++-- Test logging in parallel workers ++SET pgaudit.log = 'read'; ++SET pgaudit.log_client = on; ++SET pgaudit.log_level = 'notice'; ++ ++-- Force parallel execution for testing ++SET max_parallel_workers_per_gather = 2; ++SET parallel_tuple_cost = 0; ++SET parallel_setup_cost = 0; ++SET min_parallel_table_scan_size = 0; ++SET min_parallel_index_scan_size = 0; ++ ++-- Create table with enough data to trigger parallel execution ++CREATE TABLE parallel_test (id int, data text); ++INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data'; ++ ++SELECT count(*) FROM parallel_test; ++ ++-- Cleanup parallel test ++DROP TABLE parallel_test; ++RESET max_parallel_workers_per_gather; ++RESET parallel_tuple_cost; ++RESET parallel_setup_cost; ++RESET min_parallel_table_scan_size; ++RESET min_parallel_index_scan_size; ++RESET pgaudit.log; ++RESET pgaudit.log_client; ++RESET pgaudit.log_level; ++ + -- Cleanup + -- Set client_min_messages up to warning to avoid noise + SET client_min_messages = 'warning'; diff --git a/compute/patches/pgaudit-parallel_workers-v17.patch b/compute/patches/pgaudit-parallel_workers-v17.patch new file mode 100644 index 0000000000..f99be10c60 --- /dev/null +++ b/compute/patches/pgaudit-parallel_workers-v17.patch @@ -0,0 +1,143 @@ +commit 8d02e4c6c5e1e8676251b0717a46054267091cb4 +Author: Tristan Partin +Date: 2025-06-23 02:09:31 +0000 + + Disable logging in parallel workers + + When a query uses parallel workers, pgaudit will log the same query for + every parallel worker. This is undesireable since it can result in log + amplification for queries that use parallel workers. + + Signed-off-by: Tristan Partin + +diff --git a/expected/pgaudit.out b/expected/pgaudit.out +index d696287..4b1059a 100644 +--- a/expected/pgaudit.out ++++ b/expected/pgaudit.out +@@ -2568,6 +2568,37 @@ DROP SERVER fdw_server; + NOTICE: AUDIT: SESSION,11,1,DDL,DROP SERVER,,,DROP SERVER fdw_server, + DROP EXTENSION postgres_fdw; + NOTICE: AUDIT: SESSION,12,1,DDL,DROP EXTENSION,,,DROP EXTENSION postgres_fdw, ++-- ++-- Test logging in parallel workers ++SET pgaudit.log = 'read'; ++SET pgaudit.log_client = on; ++SET pgaudit.log_level = 'notice'; ++-- Force parallel execution for testing ++SET max_parallel_workers_per_gather = 2; ++SET parallel_tuple_cost = 0; ++SET parallel_setup_cost = 0; ++SET min_parallel_table_scan_size = 0; ++SET min_parallel_index_scan_size = 0; ++-- Create table with enough data to trigger parallel execution ++CREATE TABLE parallel_test (id int, data text); ++INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data'; ++SELECT count(*) FROM parallel_test; ++NOTICE: AUDIT: SESSION,13,1,READ,SELECT,,,SELECT count(*) FROM parallel_test, ++ count ++------- ++ 1000 ++(1 row) ++ ++-- Cleanup parallel test ++DROP TABLE parallel_test; ++RESET max_parallel_workers_per_gather; ++RESET parallel_tuple_cost; ++RESET parallel_setup_cost; ++RESET min_parallel_table_scan_size; ++RESET min_parallel_index_scan_size; ++RESET pgaudit.log; ++RESET pgaudit.log_client; ++RESET pgaudit.log_level; + -- Cleanup + -- Set client_min_messages up to warning to avoid noise + SET client_min_messages = 'warning'; +diff --git a/pgaudit.c b/pgaudit.c +index 1764af1..0e48875 100644 +--- a/pgaudit.c ++++ b/pgaudit.c +@@ -11,6 +11,7 @@ + #include "postgres.h" + + #include "access/htup_details.h" ++#include "access/parallel.h" + #include "access/sysattr.h" + #include "access/xact.h" 
+ #include "access/relation.h" +@@ -1406,7 +1407,7 @@ pgaudit_ExecutorStart_hook(QueryDesc *queryDesc, int eflags) + { + AuditEventStackItem *stackItem = NULL; + +- if (!internalStatement) ++ if (!internalStatement && !IsParallelWorker()) + { + /* Push the audit event onto the stack */ + stackItem = stack_push(); +@@ -1489,7 +1490,7 @@ pgaudit_ExecutorCheckPerms_hook(List *rangeTabls, List *permInfos, bool abort) + + /* Log DML if the audit role is valid or session logging is enabled */ + if ((auditOid != InvalidOid || auditLogBitmap != 0) && +- !IsAbortedTransactionBlockState()) ++ !IsAbortedTransactionBlockState() && !IsParallelWorker()) + { + /* If auditLogRows is on, wait for rows processed to be set */ + if (auditLogRows && auditEventStack != NULL) +@@ -1544,7 +1545,7 @@ pgaudit_ExecutorRun_hook(QueryDesc *queryDesc, ScanDirection direction, uint64 c + else + standard_ExecutorRun(queryDesc, direction, count, execute_once); + +- if (auditLogRows && !internalStatement) ++ if (auditLogRows && !internalStatement && !IsParallelWorker()) + { + /* Find an item from the stack by the query memory context */ + stackItem = stack_find_context(queryDesc->estate->es_query_cxt); +@@ -1564,7 +1565,7 @@ pgaudit_ExecutorEnd_hook(QueryDesc *queryDesc) + AuditEventStackItem *stackItem = NULL; + AuditEventStackItem *auditEventStackFull = NULL; + +- if (auditLogRows && !internalStatement) ++ if (auditLogRows && !internalStatement && !IsParallelWorker()) + { + /* Find an item from the stack by the query memory context */ + stackItem = stack_find_context(queryDesc->estate->es_query_cxt); +diff --git a/sql/pgaudit.sql b/sql/pgaudit.sql +index e161f01..c873098 100644 +--- a/sql/pgaudit.sql ++++ b/sql/pgaudit.sql +@@ -1637,6 +1637,36 @@ DROP USER MAPPING FOR regress_user1 SERVER fdw_server; + DROP SERVER fdw_server; + DROP EXTENSION postgres_fdw; + ++-- ++-- Test logging in parallel workers ++SET pgaudit.log = 'read'; ++SET pgaudit.log_client = on; ++SET pgaudit.log_level = 'notice'; ++ ++-- Force parallel execution for testing ++SET max_parallel_workers_per_gather = 2; ++SET parallel_tuple_cost = 0; ++SET parallel_setup_cost = 0; ++SET min_parallel_table_scan_size = 0; ++SET min_parallel_index_scan_size = 0; ++ ++-- Create table with enough data to trigger parallel execution ++CREATE TABLE parallel_test (id int, data text); ++INSERT INTO parallel_test SELECT generate_series(1, 1000), 'test data'; ++ ++SELECT count(*) FROM parallel_test; ++ ++-- Cleanup parallel test ++DROP TABLE parallel_test; ++RESET max_parallel_workers_per_gather; ++RESET parallel_tuple_cost; ++RESET parallel_setup_cost; ++RESET min_parallel_table_scan_size; ++RESET min_parallel_index_scan_size; ++RESET pgaudit.log; ++RESET pgaudit.log_client; ++RESET pgaudit.log_level; ++ + -- Cleanup + -- Set client_min_messages up to warning to avoid noise + SET client_min_messages = 'warning'; From 45607cbe0c1e689db02abedf7027ff96182ccd73 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 3 Jul 2025 08:35:57 +0100 Subject: [PATCH 17/22] [local_proxy]: ignore TLS for endpoint (#12316) ## Problem When local proxy is configured with TLS, the certificate does not match the endpoint string. This currently returns an error. ## Summary of changes I don't think this code is necessary anymore, taking the prefix from the hostname is good enough (and is equivalent to what `endpoint_sni` was doing) and we ignore checking the domain suffix. 
--- proxy/src/serverless/sql_over_http.rs | 34 +++++++-------------------- 1 file changed, 9 insertions(+), 25 deletions(-) diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 18ce03c725..5b348d59af 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -33,9 +33,9 @@ use super::conn_pool_lib::{self, ConnInfo}; use super::error::HttpCodeError; use super::http_util::json_response; use super::json::{JsonConversionError, json_to_pg_text, pg_text_row_to_json}; +use crate::auth::ComputeUserInfoParseError; use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; -use crate::auth::{ComputeUserInfoParseError, endpoint_sni}; -use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig, TlsConfig}; +use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig}; use crate::context::RequestContext; use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::http::{ReadBodyError, read_body_with_limit}; @@ -43,7 +43,7 @@ use crate::metrics::{HttpDirection, Metrics, SniGroup, SniKind}; use crate::pqproto::StartupMessageParams; use crate::proxy::NeonOptions; use crate::serverless::backend::HttpConnError; -use crate::types::{DbName, RoleName}; +use crate::types::{DbName, EndpointId, RoleName}; use crate::usage_metrics::{MetricCounter, MetricCounterRecorder}; use crate::util::run_until_cancelled; @@ -113,8 +113,6 @@ pub(crate) enum ConnInfoError { MissingHostname, #[error("invalid hostname: {0}")] InvalidEndpoint(#[from] ComputeUserInfoParseError), - #[error("malformed endpoint")] - MalformedEndpoint, } #[derive(Debug, thiserror::Error)] @@ -141,7 +139,6 @@ fn get_conn_info( config: &'static AuthenticationConfig, ctx: &RequestContext, headers: &HeaderMap, - tls: Option<&TlsConfig>, ) -> Result { let connection_string = headers .get(&CONN_STRING) @@ -199,17 +196,11 @@ fn get_conn_info( return Err(ConnInfoError::MissingCredentials(Credentials::Password)); }; - let endpoint = match connection_url.host() { - Some(url::Host::Domain(hostname)) => { - if let Some(tls) = tls { - endpoint_sni(hostname, &tls.common_names).ok_or(ConnInfoError::MalformedEndpoint)? - } else { - hostname - .split_once('.') - .map_or(hostname, |(prefix, _)| prefix) - .into() - } - } + let endpoint: EndpointId = match connection_url.host() { + Some(url::Host::Domain(hostname)) => hostname + .split_once('.') + .map_or(hostname, |(prefix, _)| prefix) + .into(), Some(url::Host::Ipv4(_) | url::Host::Ipv6(_)) | None => { return Err(ConnInfoError::MissingHostname); } @@ -670,14 +661,7 @@ async fn handle_inner( "handling interactive connection from client" ); - let conn_info = get_conn_info( - &config.authentication_config, - ctx, - request.headers(), - // todo: race condition? - // we're unlikely to change the common names. - config.tls_config.load().as_deref(), - )?; + let conn_info = get_conn_info(&config.authentication_config, ctx, request.headers())?; info!( user = conn_info.conn_info.user_info.user.as_str(), "credentials" From e01c8f238c2ce776b9d5eff8f62f65b0ba1fa19a Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 3 Jul 2025 08:46:48 +0100 Subject: [PATCH 18/22] [proxy] update noisy error logging (#12438) Health checks for pg-sni-router open a TCP connection and immediately close it again. This is noisy. We will filter out any EOF errors on the first message. "acquired permit" debug log is incorrect since it logs when we timedout as well. This fixes the debug log. 
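For illustration, the EOF filtering mechanism looks roughly like this (a
standalone sketch assuming the anyhow and thiserror crates; only the
`FirstMessage` wrapper and the EOF check mirror the diff below, the rest is
scaffolding for the example):

```rust
use std::io;

// Errors from the very first protocol message are wrapped in a marker type so
// the logging path can recognize an immediate EOF (an NLB health-check probe)
// and stay silent instead of logging an error.
#[derive(Debug, thiserror::Error)]
#[error(transparent)]
struct FirstMessage(io::Error);

fn is_healthcheck_disconnect(err: &anyhow::Error) -> bool {
    matches!(
        err.downcast_ref::<FirstMessage>(),
        Some(FirstMessage(io_err)) if io_err.kind() == io::ErrorKind::UnexpectedEof
    )
}

fn main() {
    let err = anyhow::Error::new(FirstMessage(io::ErrorKind::UnexpectedEof.into()));
    assert!(is_healthcheck_disconnect(&err));
}
```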
--- proxy/src/binary/pg_sni_router.rs | 19 +++++++++++++++++-- proxy/src/control_plane/client/mod.rs | 7 ++++++- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/proxy/src/binary/pg_sni_router.rs b/proxy/src/binary/pg_sni_router.rs index b877aaddef..4ac8b6a995 100644 --- a/proxy/src/binary/pg_sni_router.rs +++ b/proxy/src/binary/pg_sni_router.rs @@ -4,6 +4,7 @@ //! This allows connecting to pods/services running in the same Kubernetes cluster from //! the outside. Similar to an ingress controller for HTTPS. +use std::io; use std::net::SocketAddr; use std::path::Path; use std::sync::Arc; @@ -229,7 +230,6 @@ pub(super) async fn task_main( .set_nodelay(true) .context("failed to set socket option")?; - info!(%peer_addr, "serving"); let ctx = RequestContext::new( session_id, ConnectionInfo { @@ -241,6 +241,14 @@ pub(super) async fn task_main( handle_client(ctx, dest_suffix, tls_config, compute_tls_config, socket).await } .unwrap_or_else(|e| { + if let Some(FirstMessage(io_error)) = e.downcast_ref() { + // this is noisy. if we get EOF on the very first message that's likely + // just NLB doing a healthcheck. + if io_error.kind() == io::ErrorKind::UnexpectedEof { + return; + } + } + // Acknowledge that the task has finished with an error. error!("per-client task finished with an error: {e:#}"); }) @@ -257,12 +265,19 @@ pub(super) async fn task_main( Ok(()) } +#[derive(Debug, thiserror::Error)] +#[error(transparent)] +struct FirstMessage(io::Error); + async fn ssl_handshake( ctx: &RequestContext, raw_stream: S, tls_config: Arc, ) -> anyhow::Result> { - let (mut stream, msg) = PqStream::parse_startup(Stream::from_raw(raw_stream)).await?; + let (mut stream, msg) = PqStream::parse_startup(Stream::from_raw(raw_stream)) + .await + .map_err(FirstMessage)?; + match msg { FeStartupPacket::SslRequest { direct: None } => { let raw = stream.accept_tls().await?; diff --git a/proxy/src/control_plane/client/mod.rs b/proxy/src/control_plane/client/mod.rs index 4e5f5c7899..2ffc589df6 100644 --- a/proxy/src/control_plane/client/mod.rs +++ b/proxy/src/control_plane/client/mod.rs @@ -213,7 +213,12 @@ impl ApiLocks { self.metrics .semaphore_acquire_seconds .observe(now.elapsed().as_secs_f64()); - debug!("acquired permit {:?}", now.elapsed().as_secs_f64()); + + if permit.is_ok() { + debug!(elapsed = ?now.elapsed(), "acquired permit"); + } else { + debug!(elapsed = ?now.elapsed(), "timed out acquiring permit"); + } Ok(WakeComputePermit { permit: permit? }) } From 3415b90e8801f4fefed920c955fb91fe8bb5e4ec Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Thu, 3 Jul 2025 10:09:10 +0200 Subject: [PATCH 19/22] proxy/logging: Add "ep" and "query_id" to list of extracted fields (#12437) Extract two more interesting fields from spans: ep (endpoint) and query_id. Useful for reliable filtering in logging. 
--- proxy/src/logging.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index a58b55a704..2e444164df 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -52,7 +52,7 @@ pub async fn init() -> anyhow::Result { StderrWriter { stderr: std::io::stderr(), }, - &["request_id", "session_id", "conn_id"], + &["conn_id", "ep", "query_id", "request_id", "session_id"], )) } else { None From e12d4f356a8ddf858eaa228e6e0ac86dd07d016c Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Thu, 3 Jul 2025 11:41:09 +0200 Subject: [PATCH 20/22] Work around Clap's incorrect usage of Display for default_value_t (#12454) ## Problem #12450 ## Summary of changes Instead of `#[arg(default_value_t = typed_default_value)]`, we use `#[arg(default_value = "str that deserializes into the value")]`, because apparently you can't convince clap to _not_ deserialize from the Display implementation of an imported enum. --- control_plane/src/bin/neon_local.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 3440d8979a..c75d76260a 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -64,7 +64,9 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1); const DEFAULT_BRANCH_NAME: &str = "main"; project_git_version!(GIT_VERSION); +#[allow(dead_code)] const DEFAULT_PG_VERSION: PgMajorVersion = PgMajorVersion::PG17; +const DEFAULT_PG_VERSION_NUM: &str = "17"; const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/"; @@ -167,7 +169,7 @@ struct TenantCreateCmdArgs { #[clap(short = 'c')] config: Vec, - #[arg(default_value_t = DEFAULT_PG_VERSION)] + #[arg(default_value = DEFAULT_PG_VERSION_NUM)] #[clap(long, help = "Postgres version to use for the initial timeline")] pg_version: PgMajorVersion, @@ -290,7 +292,7 @@ struct TimelineCreateCmdArgs { #[clap(long, help = "Human-readable alias for the new timeline")] branch_name: String, - #[arg(default_value_t = DEFAULT_PG_VERSION)] + #[arg(default_value = DEFAULT_PG_VERSION_NUM)] #[clap(long, help = "Postgres version")] pg_version: PgMajorVersion, } @@ -322,7 +324,7 @@ struct TimelineImportCmdArgs { #[clap(long, help = "Lsn the basebackup ends at")] end_lsn: Option, - #[arg(default_value_t = DEFAULT_PG_VERSION)] + #[arg(default_value = DEFAULT_PG_VERSION_NUM)] #[clap(long, help = "Postgres version of the backup being imported")] pg_version: PgMajorVersion, } @@ -601,7 +603,7 @@ struct EndpointCreateCmdArgs { )] config_only: bool, - #[arg(default_value_t = DEFAULT_PG_VERSION)] + #[arg(default_value = DEFAULT_PG_VERSION_NUM)] #[clap(long, help = "Postgres version")] pg_version: PgMajorVersion, From 1bc1eae5e8bb2abf1124cfa80ebf412024c897ec Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 3 Jul 2025 10:51:35 +0100 Subject: [PATCH 21/22] fix redis credentials check (#12455) ## Problem `keep_connection` does not exit, so it was never setting `credentials_refreshed`. ## Summary of changes Set `credentials_refreshed` to true when we first establish a connection, and after we re-authenticate the connection. 
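A minimal sketch of the resulting pattern, assuming tokio (simplified from the
diff below; `refresh_token` here is a stand-in for the real re-authentication
call):

```rust
use std::convert::Infallible;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::time::Duration;

// Because this loop never returns, the health flag has to be updated inside
// the loop on every success/failure instead of once when the task "finishes".
async fn keep_connection(credentials_refreshed: Arc<AtomicBool>) -> Infallible {
    loop {
        // The connection lives for 12h; refresh hourly as a sanity check.
        tokio::time::sleep(Duration::from_secs(60 * 60)).await;
        match refresh_token().await {
            Ok(()) => credentials_refreshed.store(true, Ordering::Relaxed),
            Err(()) => credentials_refreshed.store(false, Ordering::Relaxed),
        }
    }
}

async fn refresh_token() -> Result<(), ()> {
    // Stand-in for re-authenticating the Redis connection.
    Ok(())
}
```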
--- .../connection_with_credentials_provider.rs | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs index 510701cb27..0465493799 100644 --- a/proxy/src/redis/connection_with_credentials_provider.rs +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -1,3 +1,4 @@ +use std::convert::Infallible; use std::sync::{Arc, atomic::AtomicBool, atomic::Ordering}; use std::time::Duration; @@ -5,7 +6,7 @@ use futures::FutureExt; use redis::aio::{ConnectionLike, MultiplexedConnection}; use redis::{ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisResult}; use tokio::task::JoinHandle; -use tracing::{debug, error, info, warn}; +use tracing::{error, info, warn}; use super::elasticache::CredentialsProvider; @@ -31,7 +32,7 @@ pub struct ConnectionWithCredentialsProvider { credentials: Credentials, // TODO: with more load on the connection, we should consider using a connection pool con: Option, - refresh_token_task: Option>, + refresh_token_task: Option>, mutex: tokio::sync::Mutex<()>, credentials_refreshed: Arc, } @@ -121,15 +122,11 @@ impl ConnectionWithCredentialsProvider { let credentials_provider = credentials_provider.clone(); let con2 = con.clone(); let credentials_refreshed = self.credentials_refreshed.clone(); - let f = tokio::spawn(async move { - let result = Self::keep_connection(con2, credentials_provider).await; - if let Err(e) = result { - credentials_refreshed.store(false, Ordering::Release); - debug!("keep_connection failed: {e}"); - } else { - credentials_refreshed.store(true, Ordering::Release); - } - }); + let f = tokio::spawn(Self::keep_connection( + con2, + credentials_provider, + credentials_refreshed, + )); self.refresh_token_task = Some(f); } match Self::ping(&mut con).await { @@ -165,6 +162,7 @@ impl ConnectionWithCredentialsProvider { async fn get_client(&self) -> anyhow::Result { let client = redis::Client::open(self.get_connection_info().await?)?; + self.credentials_refreshed.store(true, Ordering::Relaxed); Ok(client) } @@ -180,16 +178,19 @@ impl ConnectionWithCredentialsProvider { async fn keep_connection( mut con: MultiplexedConnection, credentials_provider: Arc, - ) -> anyhow::Result<()> { + credentials_refreshed: Arc, + ) -> Infallible { loop { // The connection lives for 12h, for the sanity check we refresh it every hour. tokio::time::sleep(Duration::from_secs(60 * 60)).await; match Self::refresh_token(&mut con, credentials_provider.clone()).await { Ok(()) => { info!("Token refreshed"); + credentials_refreshed.store(true, Ordering::Relaxed); } Err(e) => { error!("Error during token refresh: {e:?}"); + credentials_refreshed.store(false, Ordering::Relaxed); } } } From 95e1011cd6c6809e626a43eaabbf852f08cbebbd Mon Sep 17 00:00:00 2001 From: Ruslan Talpa Date: Thu, 3 Jul 2025 14:04:08 +0300 Subject: [PATCH 22/22] subzero pre-integration refactor (#12416) ## Problem integrating subzero requires a bit of refactoring. To make the integration PR a bit more manageable, the refactoring is done in this separate PR. 
## Summary of changes * move common types/functions used in sql_over_http to errors.rs and http_util.rs * add the "Local" auth backend to proxy (similar to local_proxy), useful in local testing * change the Connect and Send type for the http client to allow for custom body when making post requests to local_proxy from the proxy --------- Co-authored-by: Ruslan Talpa --- proxy/README.md | 59 +++++ proxy/src/auth/backend/mod.rs | 1 - proxy/src/binary/local_proxy.rs | 154 ++----------- proxy/src/binary/proxy.rs | 59 ++++- proxy/src/compute/mod.rs | 2 +- proxy/src/config.rs | 142 ++++++++++++ proxy/src/serverless/backend.rs | 22 +- proxy/src/serverless/error.rs | 88 ++++++++ proxy/src/serverless/http_conn_pool.rs | 7 +- proxy/src/serverless/http_util.rs | 167 +++++++++++++- proxy/src/serverless/mod.rs | 2 +- proxy/src/serverless/sql_over_http.rs | 290 +++---------------------- 12 files changed, 581 insertions(+), 412 deletions(-) diff --git a/proxy/README.md b/proxy/README.md index 583db36f28..e10ff3d710 100644 --- a/proxy/README.md +++ b/proxy/README.md @@ -138,3 +138,62 @@ Now from client you can start a new session: ```sh PGSSLROOTCERT=./server.crt psql "postgresql://proxy:password@endpoint.local.neon.build:4432/postgres?sslmode=verify-full" ``` + +## auth broker setup: + +Create a postgres instance: +```sh +docker run \ + --detach \ + --name proxy-postgres \ + --env POSTGRES_HOST_AUTH_METHOD=trust \ + --env POSTGRES_USER=authenticated \ + --env POSTGRES_DB=database \ + --publish 5432:5432 \ + postgres:17-bookworm +``` + +Create a configuration file called `local_proxy.json` in the root of the repo (used also by the auth broker to validate JWTs) +```sh +{ + "jwks": [ + { + "id": "1", + "role_names": ["authenticator", "authenticated", "anon"], + "jwks_url": "https://climbing-minnow-11.clerk.accounts.dev/.well-known/jwks.json", + "provider_name": "foo", + "jwt_audience": null + } + ] +} +``` + +Start the local proxy: +```sh +cargo run --bin local_proxy -- \ + --disable_pg_session_jwt true \ + --http 0.0.0.0:7432 +``` + +Start the auth broker: +```sh +LOGFMT=text OTEL_SDK_DISABLED=true cargo run --bin proxy --features testing -- \ + -c server.crt -k server.key \ + --is-auth-broker true \ + --wss 0.0.0.0:8080 \ + --http 0.0.0.0:7002 \ + --auth-backend local +``` + +Create a JWT in your auth provider (e.g. Clerk) and set it in the `NEON_JWT` environment variable. +```sh +export NEON_JWT="..." 
+``` + +Run a query against the auth broker: +```sh +curl -k "https://foo.local.neon.build:8080/sql" \ + -H "Authorization: Bearer $NEON_JWT" \ + -H "neon-connection-string: postgresql://authenticator@foo.local.neon.build/database" \ + -d '{"query":"select 1","params":[]}' +``` diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 2e3013ead0..8fc3ea1978 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -171,7 +171,6 @@ impl ComputeUserInfo { pub(crate) enum ComputeCredentialKeys { AuthKeys(AuthKeys), JwtPayload(Vec), - None, } impl TryFrom for ComputeUserInfo { diff --git a/proxy/src/binary/local_proxy.rs b/proxy/src/binary/local_proxy.rs index 423ecf821e..04cc7b3907 100644 --- a/proxy/src/binary/local_proxy.rs +++ b/proxy/src/binary/local_proxy.rs @@ -1,43 +1,39 @@ use std::net::SocketAddr; use std::pin::pin; -use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use anyhow::{Context, bail, ensure}; +use anyhow::bail; use arc_swap::ArcSwapOption; -use camino::{Utf8Path, Utf8PathBuf}; +use camino::Utf8PathBuf; use clap::Parser; -use compute_api::spec::LocalProxySpec; + use futures::future::Either; -use thiserror::Error; + use tokio::net::TcpListener; use tokio::sync::Notify; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, warn}; +use tracing::{debug, error, info}; use utils::sentry_init::init_sentry; use utils::{pid_file, project_build_tag, project_git_version}; use crate::auth::backend::jwt::JwkCache; -use crate::auth::backend::local::{JWKS_ROLE_MAP, LocalBackend}; +use crate::auth::backend::local::LocalBackend; use crate::auth::{self}; use crate::cancellation::CancellationHandler; +use crate::config::refresh_config_loop; use crate::config::{ self, AuthenticationConfig, ComputeConfig, HttpConfig, ProxyConfig, RetryConfig, }; use crate::control_plane::locks::ApiLocks; -use crate::control_plane::messages::{EndpointJwksResponse, JwksSettings}; -use crate::ext::TaskExt; use crate::http::health_server::AppMetrics; -use crate::intern::RoleNameInt; use crate::metrics::{Metrics, ThreadPoolMetrics}; use crate::rate_limiter::{EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo}; use crate::scram::threadpool::ThreadPool; use crate::serverless::cancel_set::CancelSet; use crate::serverless::{self, GlobalConnPoolOptions}; use crate::tls::client_config::compute_client_config_with_root_certs; -use crate::types::RoleName; use crate::url::ApiUrl; project_git_version!(GIT_VERSION); @@ -82,6 +78,11 @@ struct LocalProxyCliArgs { /// Path of the local proxy PID file #[clap(long, default_value = "./local_proxy.pid")] pid_path: Utf8PathBuf, + /// Disable pg_session_jwt extension installation + /// This is useful for testing the local proxy with vanilla postgres. 
+ #[clap(long, default_value = "false")] + #[cfg(feature = "testing")] + disable_pg_session_jwt: bool, } #[derive(clap::Args, Clone, Copy, Debug)] @@ -282,6 +283,8 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig wake_compute_retry_config: RetryConfig::parse(RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)?, connect_compute_locks, connect_to_compute: compute_config, + #[cfg(feature = "testing")] + disable_pg_session_jwt: args.disable_pg_session_jwt, }))) } @@ -293,132 +296,3 @@ fn build_auth_backend(args: &LocalProxyCliArgs) -> &'static auth::Backend<'stati Box::leak(Box::new(auth_backend)) } - -#[derive(Error, Debug)] -enum RefreshConfigError { - #[error(transparent)] - Read(#[from] std::io::Error), - #[error(transparent)] - Parse(#[from] serde_json::Error), - #[error(transparent)] - Validate(anyhow::Error), - #[error(transparent)] - Tls(anyhow::Error), -} - -async fn refresh_config_loop(config: &ProxyConfig, path: Utf8PathBuf, rx: Arc) { - let mut init = true; - loop { - rx.notified().await; - - match refresh_config_inner(config, &path).await { - Ok(()) => {} - // don't log for file not found errors if this is the first time we are checking - // for computes that don't use local_proxy, this is not an error. - Err(RefreshConfigError::Read(e)) - if init && e.kind() == std::io::ErrorKind::NotFound => - { - debug!(error=?e, ?path, "could not read config file"); - } - Err(RefreshConfigError::Tls(e)) => { - error!(error=?e, ?path, "could not read TLS certificates"); - } - Err(e) => { - error!(error=?e, ?path, "could not read config file"); - } - } - - init = false; - } -} - -async fn refresh_config_inner( - config: &ProxyConfig, - path: &Utf8Path, -) -> Result<(), RefreshConfigError> { - let bytes = tokio::fs::read(&path).await?; - let data: LocalProxySpec = serde_json::from_slice(&bytes)?; - - let mut jwks_set = vec![]; - - fn parse_jwks_settings(jwks: compute_api::spec::JwksSettings) -> anyhow::Result { - let mut jwks_url = url::Url::from_str(&jwks.jwks_url).context("parsing JWKS url")?; - - ensure!( - jwks_url.has_authority() - && (jwks_url.scheme() == "http" || jwks_url.scheme() == "https"), - "Invalid JWKS url. Must be HTTP", - ); - - ensure!( - jwks_url.host().is_some_and(|h| h != url::Host::Domain("")), - "Invalid JWKS url. No domain listed", - ); - - // clear username, password and ports - jwks_url - .set_username("") - .expect("url can be a base and has a valid host and is not a file. should not error"); - jwks_url - .set_password(None) - .expect("url can be a base and has a valid host and is not a file. should not error"); - // local testing is hard if we need to have a specific restricted port - if cfg!(not(feature = "testing")) { - jwks_url.set_port(None).expect( - "url can be a base and has a valid host and is not a file. should not error", - ); - } - - // clear query params - jwks_url.set_fragment(None); - jwks_url.query_pairs_mut().clear().finish(); - - if jwks_url.scheme() != "https" { - // local testing is hard if we need to set up https support. 
- if cfg!(not(feature = "testing")) { - jwks_url - .set_scheme("https") - .expect("should not error to set the scheme to https if it was http"); - } else { - warn!(scheme = jwks_url.scheme(), "JWKS url is not HTTPS"); - } - } - - Ok(JwksSettings { - id: jwks.id, - jwks_url, - _provider_name: jwks.provider_name, - jwt_audience: jwks.jwt_audience, - role_names: jwks - .role_names - .into_iter() - .map(RoleName::from) - .map(|s| RoleNameInt::from(&s)) - .collect(), - }) - } - - for jwks in data.jwks.into_iter().flatten() { - jwks_set.push(parse_jwks_settings(jwks).map_err(RefreshConfigError::Validate)?); - } - - info!("successfully loaded new config"); - JWKS_ROLE_MAP.store(Some(Arc::new(EndpointJwksResponse { jwks: jwks_set }))); - - if let Some(tls_config) = data.tls { - let tls_config = tokio::task::spawn_blocking(move || { - crate::tls::server_config::configure_tls( - tls_config.key_path.as_ref(), - tls_config.cert_path.as_ref(), - None, - false, - ) - }) - .await - .propagate_task_panic() - .map_err(RefreshConfigError::Tls)?; - config.tls_config.store(Some(Arc::new(tls_config))); - } - - Ok(()) -} diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index 2133f33a4d..7522dd5162 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -22,9 +22,13 @@ use utils::sentry_init::init_sentry; use utils::{project_build_tag, project_git_version}; use crate::auth::backend::jwt::JwkCache; +#[cfg(any(test, feature = "testing"))] +use crate::auth::backend::local::LocalBackend; use crate::auth::backend::{ConsoleRedirectBackend, MaybeOwned}; use crate::batch::BatchQueue; use crate::cancellation::{CancellationHandler, CancellationProcessor}; +#[cfg(any(test, feature = "testing"))] +use crate::config::refresh_config_loop; use crate::config::{ self, AuthenticationConfig, CacheOptions, ComputeConfig, HttpConfig, ProjectInfoCacheOptions, ProxyConfig, ProxyProtocolV2, remote_storage_from_toml, @@ -43,6 +47,10 @@ use crate::tls::client_config::compute_client_config_with_root_certs; #[cfg(any(test, feature = "testing"))] use crate::url::ApiUrl; use crate::{auth, control_plane, http, serverless, usage_metrics}; +#[cfg(any(test, feature = "testing"))] +use camino::Utf8PathBuf; +#[cfg(any(test, feature = "testing"))] +use tokio::sync::Notify; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); @@ -60,6 +68,9 @@ enum AuthBackendType { #[cfg(any(test, feature = "testing"))] Postgres, + + #[cfg(any(test, feature = "testing"))] + Local, } /// Neon proxy/router @@ -74,6 +85,10 @@ struct ProxyCliArgs { proxy: SocketAddr, #[clap(value_enum, long, default_value_t = AuthBackendType::ConsoleRedirect)] auth_backend: AuthBackendType, + /// Path of the local proxy config file (used for local-file auth backend) + #[clap(long, default_value = "./local_proxy.json")] + #[cfg(any(test, feature = "testing"))] + config_path: Utf8PathBuf, /// listen for management callback connection on ip:port #[clap(short, long, default_value = "127.0.0.1:7000")] mgmt: SocketAddr, @@ -226,6 +241,14 @@ struct ProxyCliArgs { #[clap(flatten)] pg_sni_router: PgSniRouterArgs, + + /// if this is not local proxy, this toggles whether we accept Postgres REST requests + #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + is_rest_broker: bool, + + /// cache for `db_schema_cache` introspection (use `size=0` to disable) + #[clap(long, default_value = "size=1000,ttl=1h")] + db_schema_cache: String, } #[derive(clap::Args, Clone, Copy, Debug)] 
@@ -386,6 +409,8 @@ pub async fn run() -> anyhow::Result<()> { 64, )); + #[cfg(any(test, feature = "testing"))] + let refresh_config_notify = Arc::new(Notify::new()); // client facing tasks. these will exit on error or on cancellation // cancellation returns Ok(()) let mut client_tasks = JoinSet::new(); @@ -412,6 +437,17 @@ pub async fn run() -> anyhow::Result<()> { endpoint_rate_limiter.clone(), )); } + + // if auth backend is local, we need to load the config file + #[cfg(any(test, feature = "testing"))] + if let auth::Backend::Local(_) = &auth_backend { + refresh_config_notify.notify_one(); + tokio::spawn(refresh_config_loop( + config, + args.config_path, + refresh_config_notify.clone(), + )); + } } Either::Right(auth_backend) => { if let Some(proxy_listener) = proxy_listener { @@ -462,7 +498,13 @@ pub async fn run() -> anyhow::Result<()> { // maintenance tasks. these never return unless there's an error let mut maintenance_tasks = JoinSet::new(); - maintenance_tasks.spawn(crate::signals::handle(cancellation_token.clone(), || {})); + + maintenance_tasks.spawn(crate::signals::handle(cancellation_token.clone(), { + move || { + #[cfg(any(test, feature = "testing"))] + refresh_config_notify.notify_one(); + } + })); maintenance_tasks.spawn(http::health_server::task_main( http_listener, AppMetrics { @@ -653,6 +695,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?, connect_compute_locks, connect_to_compute: compute_config, + #[cfg(feature = "testing")] + disable_pg_session_jwt: false, }; let config = Box::leak(Box::new(config)); @@ -806,6 +850,19 @@ fn build_auth_backend( Ok(Either::Right(config)) } + + #[cfg(any(test, feature = "testing"))] + AuthBackendType::Local => { + let postgres: SocketAddr = "127.0.0.1:7432".parse()?; + let compute_ctl: ApiUrl = "http://127.0.0.1:3081/".parse()?; + let auth_backend = crate::auth::Backend::Local( + crate::auth::backend::MaybeOwned::Owned(LocalBackend::new(postgres, compute_ctl)), + ); + + let config = Box::leak(Box::new(auth_backend)); + + Ok(Either::Left(config)) + } } } diff --git a/proxy/src/compute/mod.rs b/proxy/src/compute/mod.rs index 0a19090ce0..7b9183b05e 100644 --- a/proxy/src/compute/mod.rs +++ b/proxy/src/compute/mod.rs @@ -165,7 +165,7 @@ impl AuthInfo { ComputeCredentialKeys::AuthKeys(AuthKeys::ScramSha256(auth_keys)) => { Some(Auth::Scram(Box::new(auth_keys))) } - ComputeCredentialKeys::JwtPayload(_) | ComputeCredentialKeys::None => None, + ComputeCredentialKeys::JwtPayload(_) => None, }, server_params: StartupMessageParams::default(), skip_db_user: false, diff --git a/proxy/src/config.rs b/proxy/src/config.rs index cee15ac7fa..d5e6e1e4cb 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -16,6 +16,17 @@ use crate::serverless::cancel_set::CancelSet; pub use crate::tls::server_config::{TlsConfig, configure_tls}; use crate::types::Host; +use crate::auth::backend::local::JWKS_ROLE_MAP; +use crate::control_plane::messages::{EndpointJwksResponse, JwksSettings}; +use crate::ext::TaskExt; +use crate::intern::RoleNameInt; +use crate::types::RoleName; +use camino::{Utf8Path, Utf8PathBuf}; +use compute_api::spec::LocalProxySpec; +use thiserror::Error; +use tokio::sync::Notify; +use tracing::{debug, error, info, warn}; + pub struct ProxyConfig { pub tls_config: ArcSwapOption, pub metric_collection: Option, @@ -26,6 +37,8 @@ pub struct ProxyConfig { pub wake_compute_retry_config: RetryConfig, pub connect_compute_locks: 
ApiLocks, pub connect_to_compute: ComputeConfig, + #[cfg(feature = "testing")] + pub disable_pg_session_jwt: bool, } pub struct ComputeConfig { @@ -409,6 +422,135 @@ impl FromStr for ConcurrencyLockOptions { } } +#[derive(Error, Debug)] +pub(crate) enum RefreshConfigError { + #[error(transparent)] + Read(#[from] std::io::Error), + #[error(transparent)] + Parse(#[from] serde_json::Error), + #[error(transparent)] + Validate(anyhow::Error), + #[error(transparent)] + Tls(anyhow::Error), +} + +pub(crate) async fn refresh_config_loop(config: &ProxyConfig, path: Utf8PathBuf, rx: Arc) { + let mut init = true; + loop { + rx.notified().await; + + match refresh_config_inner(config, &path).await { + std::result::Result::Ok(()) => {} + // don't log for file not found errors if this is the first time we are checking + // for computes that don't use local_proxy, this is not an error. + Err(RefreshConfigError::Read(e)) + if init && e.kind() == std::io::ErrorKind::NotFound => + { + debug!(error=?e, ?path, "could not read config file"); + } + Err(RefreshConfigError::Tls(e)) => { + error!(error=?e, ?path, "could not read TLS certificates"); + } + Err(e) => { + error!(error=?e, ?path, "could not read config file"); + } + } + + init = false; + } +} + +pub(crate) async fn refresh_config_inner( + config: &ProxyConfig, + path: &Utf8Path, +) -> Result<(), RefreshConfigError> { + let bytes = tokio::fs::read(&path).await?; + let data: LocalProxySpec = serde_json::from_slice(&bytes)?; + + let mut jwks_set = vec![]; + + fn parse_jwks_settings(jwks: compute_api::spec::JwksSettings) -> anyhow::Result { + let mut jwks_url = url::Url::from_str(&jwks.jwks_url).context("parsing JWKS url")?; + + ensure!( + jwks_url.has_authority() + && (jwks_url.scheme() == "http" || jwks_url.scheme() == "https"), + "Invalid JWKS url. Must be HTTP", + ); + + ensure!( + jwks_url.host().is_some_and(|h| h != url::Host::Domain("")), + "Invalid JWKS url. No domain listed", + ); + + // clear username, password and ports + jwks_url + .set_username("") + .expect("url can be a base and has a valid host and is not a file. should not error"); + jwks_url + .set_password(None) + .expect("url can be a base and has a valid host and is not a file. should not error"); + // local testing is hard if we need to have a specific restricted port + if cfg!(not(feature = "testing")) { + jwks_url.set_port(None).expect( + "url can be a base and has a valid host and is not a file. should not error", + ); + } + + // clear query params + jwks_url.set_fragment(None); + jwks_url.query_pairs_mut().clear().finish(); + + if jwks_url.scheme() != "https" { + // local testing is hard if we need to set up https support. 
+ if cfg!(not(feature = "testing")) { + jwks_url + .set_scheme("https") + .expect("should not error to set the scheme to https if it was http"); + } else { + warn!(scheme = jwks_url.scheme(), "JWKS url is not HTTPS"); + } + } + + Ok(JwksSettings { + id: jwks.id, + jwks_url, + _provider_name: jwks.provider_name, + jwt_audience: jwks.jwt_audience, + role_names: jwks + .role_names + .into_iter() + .map(RoleName::from) + .map(|s| RoleNameInt::from(&s)) + .collect(), + }) + } + + for jwks in data.jwks.into_iter().flatten() { + jwks_set.push(parse_jwks_settings(jwks).map_err(RefreshConfigError::Validate)?); + } + + info!("successfully loaded new config"); + JWKS_ROLE_MAP.store(Some(Arc::new(EndpointJwksResponse { jwks: jwks_set }))); + + if let Some(tls_config) = data.tls { + let tls_config = tokio::task::spawn_blocking(move || { + crate::tls::server_config::configure_tls( + tls_config.key_path.as_ref(), + tls_config.cert_path.as_ref(), + None, + false, + ) + }) + .await + .propagate_task_panic() + .map_err(RefreshConfigError::Tls)?; + config.tls_config.store(Some(Arc::new(tls_config))); + } + + std::result::Result::Ok(()) +} + #[cfg(test)] mod tests { use super::*; diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 7708342ae3..4b3f379e76 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -115,7 +115,8 @@ impl PoolingBackend { match &self.auth_backend { crate::auth::Backend::ControlPlane(console, ()) => { - self.config + let keys = self + .config .authentication_config .jwks_cache .check_jwt( @@ -129,7 +130,7 @@ impl PoolingBackend { Ok(ComputeCredentials { info: user_info.clone(), - keys: crate::auth::backend::ComputeCredentialKeys::None, + keys, }) } crate::auth::Backend::Local(_) => { @@ -256,6 +257,7 @@ impl PoolingBackend { &self, ctx: &RequestContext, conn_info: ConnInfo, + disable_pg_session_jwt: bool, ) -> Result, HttpConnError> { if let Some(client) = self.local_pool.get(ctx, &conn_info)? 
{ return Ok(client); @@ -277,7 +279,7 @@ impl PoolingBackend { .expect("semaphore should never be closed"); // check again for race - if !self.local_pool.initialized(&conn_info) { + if !self.local_pool.initialized(&conn_info) && !disable_pg_session_jwt { local_backend .compute_ctl .install_extension(&ExtensionInstallRequest { @@ -313,14 +315,16 @@ impl PoolingBackend { .to_postgres_client_config(); config .user(&conn_info.user_info.user) - .dbname(&conn_info.dbname) - .set_param( + .dbname(&conn_info.dbname); + if !disable_pg_session_jwt { + config.set_param( "options", &format!( "-c pg_session_jwt.jwk={}", serde_json::to_string(&jwk).expect("serializing jwk to json should not fail") ), ); + } let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let (client, connection) = config.connect(&postgres_client::NoTls).await?; @@ -345,9 +349,11 @@ impl PoolingBackend { debug!("setting up backend session state"); // initiates the auth session - if let Err(e) = client.batch_execute("select auth.init();").await { - discard.discard(); - return Err(e.into()); + if !disable_pg_session_jwt { + if let Err(e) = client.batch_execute("select auth.init();").await { + discard.discard(); + return Err(e.into()); + } } info!("backend session state initialized"); diff --git a/proxy/src/serverless/error.rs b/proxy/src/serverless/error.rs index 323c91baa5..786964e764 100644 --- a/proxy/src/serverless/error.rs +++ b/proxy/src/serverless/error.rs @@ -1,5 +1,93 @@ use http::StatusCode; +use http::header::HeaderName; + +use crate::auth::ComputeUserInfoParseError; +use crate::error::{ErrorKind, ReportableError, UserFacingError}; +use crate::http::ReadBodyError; pub trait HttpCodeError { fn get_http_status_code(&self) -> StatusCode; } + +#[derive(Debug, thiserror::Error)] +pub(crate) enum ConnInfoError { + #[error("invalid header: {0}")] + InvalidHeader(&'static HeaderName), + #[error("invalid connection string: {0}")] + UrlParseError(#[from] url::ParseError), + #[error("incorrect scheme")] + IncorrectScheme, + #[error("missing database name")] + MissingDbName, + #[error("invalid database name")] + InvalidDbName, + #[error("missing username")] + MissingUsername, + #[error("invalid username: {0}")] + InvalidUsername(#[from] std::string::FromUtf8Error), + #[error("missing authentication credentials: {0}")] + MissingCredentials(Credentials), + #[error("missing hostname")] + MissingHostname, + #[error("invalid hostname: {0}")] + InvalidEndpoint(#[from] ComputeUserInfoParseError), +} + +#[derive(Debug, thiserror::Error)] +pub(crate) enum Credentials { + #[error("required password")] + Password, + #[error("required authorization bearer token in JWT format")] + BearerJwt, +} + +impl ReportableError for ConnInfoError { + fn get_error_kind(&self) -> ErrorKind { + ErrorKind::User + } +} + +impl UserFacingError for ConnInfoError { + fn to_string_client(&self) -> String { + self.to_string() + } +} + +#[derive(Debug, thiserror::Error)] +pub(crate) enum ReadPayloadError { + #[error("could not read the HTTP request body: {0}")] + Read(#[from] hyper::Error), + #[error("request is too large (max is {limit} bytes)")] + BodyTooLarge { limit: usize }, + #[error("could not parse the HTTP request body: {0}")] + Parse(#[from] serde_json::Error), +} + +impl From> for ReadPayloadError { + fn from(value: ReadBodyError) -> Self { + match value { + ReadBodyError::BodyTooLarge { limit } => Self::BodyTooLarge { limit }, + ReadBodyError::Read(e) => Self::Read(e), + } + } +} + +impl ReportableError for ReadPayloadError { + fn 
get_error_kind(&self) -> ErrorKind { + match self { + ReadPayloadError::Read(_) => ErrorKind::ClientDisconnect, + ReadPayloadError::BodyTooLarge { .. } => ErrorKind::User, + ReadPayloadError::Parse(_) => ErrorKind::User, + } + } +} + +impl HttpCodeError for ReadPayloadError { + fn get_http_status_code(&self) -> StatusCode { + match self { + ReadPayloadError::Read(_) => StatusCode::BAD_REQUEST, + ReadPayloadError::BodyTooLarge { .. } => StatusCode::PAYLOAD_TOO_LARGE, + ReadPayloadError::Parse(_) => StatusCode::BAD_REQUEST, + } + } +} diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs index 1c6574e57e..18f7ecc0b1 100644 --- a/proxy/src/serverless/http_conn_pool.rs +++ b/proxy/src/serverless/http_conn_pool.rs @@ -20,9 +20,12 @@ use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::protocol2::ConnectionInfoExtra; use crate::types::EndpointCacheKey; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; +use bytes::Bytes; +use http_body_util::combinators::BoxBody; -pub(crate) type Send = http2::SendRequest; -pub(crate) type Connect = http2::Connection, hyper::body::Incoming, TokioExecutor>; +pub(crate) type Send = http2::SendRequest>; +pub(crate) type Connect = + http2::Connection, BoxBody, TokioExecutor>; #[derive(Clone)] pub(crate) struct ClientDataHttp(); diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs index 95a28663a5..c876d8f096 100644 --- a/proxy/src/serverless/http_util.rs +++ b/proxy/src/serverless/http_util.rs @@ -3,11 +3,43 @@ use anyhow::Context; use bytes::Bytes; -use http::{Response, StatusCode}; +use http::header::AUTHORIZATION; +use http::{HeaderMap, HeaderName, HeaderValue, Response, StatusCode}; use http_body_util::combinators::BoxBody; use http_body_util::{BodyExt, Full}; use http_utils::error::ApiError; use serde::Serialize; +use url::Url; +use uuid::Uuid; + +use super::conn_pool::AuthData; +use super::conn_pool::ConnInfoWithAuth; +use super::conn_pool_lib::ConnInfo; +use super::error::{ConnInfoError, Credentials}; +use crate::auth::backend::ComputeUserInfo; +use crate::config::AuthenticationConfig; +use crate::context::RequestContext; +use crate::metrics::{Metrics, SniGroup, SniKind}; +use crate::pqproto::StartupMessageParams; +use crate::proxy::NeonOptions; +use crate::types::{DbName, EndpointId, RoleName}; + +// Common header names used across serverless modules +pub(super) static NEON_REQUEST_ID: HeaderName = HeaderName::from_static("neon-request-id"); +pub(super) static CONN_STRING: HeaderName = HeaderName::from_static("neon-connection-string"); +pub(super) static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output"); +pub(super) static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode"); +pub(super) static ALLOW_POOL: HeaderName = HeaderName::from_static("neon-pool-opt-in"); +pub(super) static TXN_ISOLATION_LEVEL: HeaderName = + HeaderName::from_static("neon-batch-isolation-level"); +pub(super) static TXN_READ_ONLY: HeaderName = HeaderName::from_static("neon-batch-read-only"); +pub(super) static TXN_DEFERRABLE: HeaderName = HeaderName::from_static("neon-batch-deferrable"); + +pub(crate) fn uuid_to_header_value(id: Uuid) -> HeaderValue { + let mut uuid = [0; uuid::fmt::Hyphenated::LENGTH]; + HeaderValue::from_str(id.as_hyphenated().encode_lower(&mut uuid[..])) + .expect("uuid hyphenated format should be all valid header characters") +} /// Like [`ApiError::into_response`] pub(crate) fn api_error_into_response(this: ApiError) -> 
Response> { @@ -107,3 +139,136 @@ pub(crate) fn json_response( .map_err(|e| ApiError::InternalServerError(e.into()))?; Ok(response) } + +pub(crate) fn get_conn_info( + config: &'static AuthenticationConfig, + ctx: &RequestContext, + connection_string: Option<&str>, + headers: &HeaderMap, +) -> Result { + let connection_url = match connection_string { + Some(connection_string) => Url::parse(connection_string)?, + None => { + let connection_string = headers + .get(&CONN_STRING) + .ok_or(ConnInfoError::InvalidHeader(&CONN_STRING))? + .to_str() + .map_err(|_| ConnInfoError::InvalidHeader(&CONN_STRING))?; + Url::parse(connection_string)? + } + }; + + let protocol = connection_url.scheme(); + if protocol != "postgres" && protocol != "postgresql" { + return Err(ConnInfoError::IncorrectScheme); + } + + let mut url_path = connection_url + .path_segments() + .ok_or(ConnInfoError::MissingDbName)?; + + let dbname: DbName = + urlencoding::decode(url_path.next().ok_or(ConnInfoError::InvalidDbName)?)?.into(); + ctx.set_dbname(dbname.clone()); + + let username = RoleName::from(urlencoding::decode(connection_url.username())?); + if username.is_empty() { + return Err(ConnInfoError::MissingUsername); + } + ctx.set_user(username.clone()); + // TODO: make sure this is right in the context of rest broker + let auth = if let Some(auth) = headers.get(&AUTHORIZATION) { + if !config.accept_jwts { + return Err(ConnInfoError::MissingCredentials(Credentials::Password)); + } + + let auth = auth + .to_str() + .map_err(|_| ConnInfoError::InvalidHeader(&AUTHORIZATION))?; + AuthData::Jwt( + auth.strip_prefix("Bearer ") + .ok_or(ConnInfoError::MissingCredentials(Credentials::BearerJwt))? + .into(), + ) + } else if let Some(pass) = connection_url.password() { + // wrong credentials provided + if config.accept_jwts { + return Err(ConnInfoError::MissingCredentials(Credentials::BearerJwt)); + } + + AuthData::Password(match urlencoding::decode_binary(pass.as_bytes()) { + std::borrow::Cow::Borrowed(b) => b.into(), + std::borrow::Cow::Owned(b) => b.into(), + }) + } else if config.accept_jwts { + return Err(ConnInfoError::MissingCredentials(Credentials::BearerJwt)); + } else { + return Err(ConnInfoError::MissingCredentials(Credentials::Password)); + }; + let endpoint: EndpointId = match connection_url.host() { + Some(url::Host::Domain(hostname)) => hostname + .split_once('.') + .map_or(hostname, |(prefix, _)| prefix) + .into(), + Some(url::Host::Ipv4(_) | url::Host::Ipv6(_)) | None => { + return Err(ConnInfoError::MissingHostname); + } + }; + ctx.set_endpoint_id(endpoint.clone()); + + let pairs = connection_url.query_pairs(); + + let mut options = Option::None; + + let mut params = StartupMessageParams::default(); + params.insert("user", &username); + params.insert("database", &dbname); + for (key, value) in pairs { + params.insert(&key, &value); + if key == "options" { + options = Some(NeonOptions::parse_options_raw(&value)); + } + } + + // check the URL that was used, for metrics + { + let host_endpoint = headers + // get the host header + .get("host") + // extract the domain + .and_then(|h| { + let (host, _port) = h.to_str().ok()?.split_once(':')?; + Some(host) + }) + // get the endpoint prefix + .map(|h| h.split_once('.').map_or(h, |(prefix, _)| prefix)); + + let kind = if host_endpoint == Some(&*endpoint) { + SniKind::Sni + } else { + SniKind::NoSni + }; + + let protocol = ctx.protocol(); + Metrics::get() + .proxy + .accepted_connections_by_sni + .inc(SniGroup { protocol, kind }); + } + + ctx.set_user_agent( + headers + 
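+            // read the User-Agent header, if it is present and valid UTF-8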
.get(hyper::header::USER_AGENT) + .and_then(|h| h.to_str().ok()) + .map(Into::into), + ); + + let user_info = ComputeUserInfo { + endpoint, + user: username, + options: options.unwrap_or_default(), + }; + + let conn_info = ConnInfo { user_info, dbname }; + Ok(ConnInfoWithAuth { conn_info, auth }) +} diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index d8942bb814..5b7289c53d 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -29,13 +29,13 @@ use futures::future::{Either, select}; use http::{Method, Response, StatusCode}; use http_body_util::combinators::BoxBody; use http_body_util::{BodyExt, Empty}; +use http_util::{NEON_REQUEST_ID, uuid_to_header_value}; use http_utils::error::ApiError; use hyper::body::Incoming; use hyper_util::rt::TokioExecutor; use hyper_util::server::conn::auto::Builder; use rand::SeedableRng; use rand::rngs::StdRng; -use sql_over_http::{NEON_REQUEST_ID, uuid_to_header_value}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::{TcpListener, TcpStream}; use tokio::time::timeout; diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 5b348d59af..a901a47746 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -1,49 +1,45 @@ -use std::pin::pin; -use std::sync::Arc; - use bytes::Bytes; use futures::future::{Either, select, try_join}; use futures::{StreamExt, TryFutureExt}; -use http::Method; -use http::header::AUTHORIZATION; -use http_body_util::combinators::BoxBody; -use http_body_util::{BodyExt, Full}; +use http::{Method, header::AUTHORIZATION}; +use http_body_util::{BodyExt, Full, combinators::BoxBody}; use http_utils::error::ApiError; use hyper::body::Incoming; -use hyper::http::{HeaderName, HeaderValue}; -use hyper::{HeaderMap, Request, Response, StatusCode, header}; +use hyper::{ + Request, Response, StatusCode, header, + http::{HeaderName, HeaderValue}, +}; use indexmap::IndexMap; use postgres_client::error::{DbError, ErrorPosition, SqlState}; use postgres_client::{ GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, RowStream, Transaction, }; use serde::Serialize; -use serde_json::Value; -use serde_json::value::RawValue; +use serde_json::{Value, value::RawValue}; +use std::pin::pin; +use std::sync::Arc; use tokio::time::{self, Instant}; use tokio_util::sync::CancellationToken; use tracing::{Level, debug, error, info}; use typed_json::json; -use url::Url; -use uuid::Uuid; use super::backend::{LocalProxyConnError, PoolingBackend}; -use super::conn_pool::{AuthData, ConnInfoWithAuth}; +use super::conn_pool::AuthData; use super::conn_pool_lib::{self, ConnInfo}; -use super::error::HttpCodeError; -use super::http_util::json_response; +use super::error::{ConnInfoError, HttpCodeError, ReadPayloadError}; +use super::http_util::{ + ALLOW_POOL, ARRAY_MODE, CONN_STRING, NEON_REQUEST_ID, RAW_TEXT_OUTPUT, TXN_DEFERRABLE, + TXN_ISOLATION_LEVEL, TXN_READ_ONLY, get_conn_info, json_response, uuid_to_header_value, +}; use super::json::{JsonConversionError, json_to_pg_text, pg_text_row_to_json}; -use crate::auth::ComputeUserInfoParseError; -use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; -use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig}; +use crate::auth::backend::ComputeCredentialKeys; + +use crate::config::{HttpConfig, ProxyConfig}; use crate::context::RequestContext; use crate::error::{ErrorKind, ReportableError, UserFacingError}; -use crate::http::{ReadBodyError, read_body_with_limit}; -use 
crate::metrics::{HttpDirection, Metrics, SniGroup, SniKind}; -use crate::pqproto::StartupMessageParams; -use crate::proxy::NeonOptions; +use crate::http::read_body_with_limit; +use crate::metrics::{HttpDirection, Metrics}; use crate::serverless::backend::HttpConnError; -use crate::types::{DbName, EndpointId, RoleName}; use crate::usage_metrics::{MetricCounter, MetricCounterRecorder}; use crate::util::run_until_cancelled; @@ -70,16 +66,6 @@ enum Payload { Batch(BatchQueryData), } -pub(super) static NEON_REQUEST_ID: HeaderName = HeaderName::from_static("neon-request-id"); - -static CONN_STRING: HeaderName = HeaderName::from_static("neon-connection-string"); -static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output"); -static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode"); -static ALLOW_POOL: HeaderName = HeaderName::from_static("neon-pool-opt-in"); -static TXN_ISOLATION_LEVEL: HeaderName = HeaderName::from_static("neon-batch-isolation-level"); -static TXN_READ_ONLY: HeaderName = HeaderName::from_static("neon-batch-read-only"); -static TXN_DEFERRABLE: HeaderName = HeaderName::from_static("neon-batch-deferrable"); - static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true"); fn bytes_to_pg_text<'de, D>(deserializer: D) -> Result>, D::Error> @@ -91,179 +77,6 @@ where Ok(json_to_pg_text(json)) } -#[derive(Debug, thiserror::Error)] -pub(crate) enum ConnInfoError { - #[error("invalid header: {0}")] - InvalidHeader(&'static HeaderName), - #[error("invalid connection string: {0}")] - UrlParseError(#[from] url::ParseError), - #[error("incorrect scheme")] - IncorrectScheme, - #[error("missing database name")] - MissingDbName, - #[error("invalid database name")] - InvalidDbName, - #[error("missing username")] - MissingUsername, - #[error("invalid username: {0}")] - InvalidUsername(#[from] std::string::FromUtf8Error), - #[error("missing authentication credentials: {0}")] - MissingCredentials(Credentials), - #[error("missing hostname")] - MissingHostname, - #[error("invalid hostname: {0}")] - InvalidEndpoint(#[from] ComputeUserInfoParseError), -} - -#[derive(Debug, thiserror::Error)] -pub(crate) enum Credentials { - #[error("required password")] - Password, - #[error("required authorization bearer token in JWT format")] - BearerJwt, -} - -impl ReportableError for ConnInfoError { - fn get_error_kind(&self) -> ErrorKind { - ErrorKind::User - } -} - -impl UserFacingError for ConnInfoError { - fn to_string_client(&self) -> String { - self.to_string() - } -} - -fn get_conn_info( - config: &'static AuthenticationConfig, - ctx: &RequestContext, - headers: &HeaderMap, -) -> Result { - let connection_string = headers - .get(&CONN_STRING) - .ok_or(ConnInfoError::InvalidHeader(&CONN_STRING))? 
- .to_str() - .map_err(|_| ConnInfoError::InvalidHeader(&CONN_STRING))?; - - let connection_url = Url::parse(connection_string)?; - - let protocol = connection_url.scheme(); - if protocol != "postgres" && protocol != "postgresql" { - return Err(ConnInfoError::IncorrectScheme); - } - - let mut url_path = connection_url - .path_segments() - .ok_or(ConnInfoError::MissingDbName)?; - - let dbname: DbName = - urlencoding::decode(url_path.next().ok_or(ConnInfoError::InvalidDbName)?)?.into(); - ctx.set_dbname(dbname.clone()); - - let username = RoleName::from(urlencoding::decode(connection_url.username())?); - if username.is_empty() { - return Err(ConnInfoError::MissingUsername); - } - ctx.set_user(username.clone()); - - let auth = if let Some(auth) = headers.get(&AUTHORIZATION) { - if !config.accept_jwts { - return Err(ConnInfoError::MissingCredentials(Credentials::Password)); - } - - let auth = auth - .to_str() - .map_err(|_| ConnInfoError::InvalidHeader(&AUTHORIZATION))?; - AuthData::Jwt( - auth.strip_prefix("Bearer ") - .ok_or(ConnInfoError::MissingCredentials(Credentials::BearerJwt))? - .into(), - ) - } else if let Some(pass) = connection_url.password() { - // wrong credentials provided - if config.accept_jwts { - return Err(ConnInfoError::MissingCredentials(Credentials::BearerJwt)); - } - - AuthData::Password(match urlencoding::decode_binary(pass.as_bytes()) { - std::borrow::Cow::Borrowed(b) => b.into(), - std::borrow::Cow::Owned(b) => b.into(), - }) - } else if config.accept_jwts { - return Err(ConnInfoError::MissingCredentials(Credentials::BearerJwt)); - } else { - return Err(ConnInfoError::MissingCredentials(Credentials::Password)); - }; - - let endpoint: EndpointId = match connection_url.host() { - Some(url::Host::Domain(hostname)) => hostname - .split_once('.') - .map_or(hostname, |(prefix, _)| prefix) - .into(), - Some(url::Host::Ipv4(_) | url::Host::Ipv6(_)) | None => { - return Err(ConnInfoError::MissingHostname); - } - }; - ctx.set_endpoint_id(endpoint.clone()); - - let pairs = connection_url.query_pairs(); - - let mut options = Option::None; - - let mut params = StartupMessageParams::default(); - params.insert("user", &username); - params.insert("database", &dbname); - for (key, value) in pairs { - params.insert(&key, &value); - if key == "options" { - options = Some(NeonOptions::parse_options_raw(&value)); - } - } - - // check the URL that was used, for metrics - { - let host_endpoint = headers - // get the host header - .get("host") - // extract the domain - .and_then(|h| { - let (host, _port) = h.to_str().ok()?.split_once(':')?; - Some(host) - }) - // get the endpoint prefix - .map(|h| h.split_once('.').map_or(h, |(prefix, _)| prefix)); - - let kind = if host_endpoint == Some(&*endpoint) { - SniKind::Sni - } else { - SniKind::NoSni - }; - - let protocol = ctx.protocol(); - Metrics::get() - .proxy - .accepted_connections_by_sni - .inc(SniGroup { protocol, kind }); - } - - ctx.set_user_agent( - headers - .get(hyper::header::USER_AGENT) - .and_then(|h| h.to_str().ok()) - .map(Into::into), - ); - - let user_info = ComputeUserInfo { - endpoint, - user: username, - options: options.unwrap_or_default(), - }; - - let conn_info = ConnInfo { user_info, dbname }; - Ok(ConnInfoWithAuth { conn_info, auth }) -} - pub(crate) async fn handle( config: &'static ProxyConfig, ctx: RequestContext, @@ -532,45 +345,6 @@ impl HttpCodeError for SqlOverHttpError { } } -#[derive(Debug, thiserror::Error)] -pub(crate) enum ReadPayloadError { - #[error("could not read the HTTP request body: {0}")] - 
Read(#[from] hyper::Error), - #[error("request is too large (max is {limit} bytes)")] - BodyTooLarge { limit: usize }, - #[error("could not parse the HTTP request body: {0}")] - Parse(#[from] serde_json::Error), -} - -impl From> for ReadPayloadError { - fn from(value: ReadBodyError) -> Self { - match value { - ReadBodyError::BodyTooLarge { limit } => Self::BodyTooLarge { limit }, - ReadBodyError::Read(e) => Self::Read(e), - } - } -} - -impl ReportableError for ReadPayloadError { - fn get_error_kind(&self) -> ErrorKind { - match self { - ReadPayloadError::Read(_) => ErrorKind::ClientDisconnect, - ReadPayloadError::BodyTooLarge { .. } => ErrorKind::User, - ReadPayloadError::Parse(_) => ErrorKind::User, - } - } -} - -impl HttpCodeError for ReadPayloadError { - fn get_http_status_code(&self) -> StatusCode { - match self { - ReadPayloadError::Read(_) => StatusCode::BAD_REQUEST, - ReadPayloadError::BodyTooLarge { .. } => StatusCode::PAYLOAD_TOO_LARGE, - ReadPayloadError::Parse(_) => StatusCode::BAD_REQUEST, - } - } -} - #[derive(Debug, thiserror::Error)] pub(crate) enum SqlOverHttpCancel { #[error("query was cancelled")] @@ -661,7 +435,7 @@ async fn handle_inner( "handling interactive connection from client" ); - let conn_info = get_conn_info(&config.authentication_config, ctx, request.headers())?; + let conn_info = get_conn_info(&config.authentication_config, ctx, None, request.headers())?; info!( user = conn_info.conn_info.user_info.user.as_str(), "credentials" @@ -747,9 +521,17 @@ async fn handle_db_inner( ComputeCredentialKeys::JwtPayload(payload) if backend.auth_backend.is_local_proxy() => { - let mut client = backend.connect_to_local_postgres(ctx, conn_info).await?; - let (cli_inner, _dsc) = client.client_inner(); - cli_inner.set_jwt_session(&payload).await?; + #[cfg(feature = "testing")] + let disable_pg_session_jwt = config.disable_pg_session_jwt; + #[cfg(not(feature = "testing"))] + let disable_pg_session_jwt = false; + let mut client = backend + .connect_to_local_postgres(ctx, conn_info, disable_pg_session_jwt) + .await?; + if !disable_pg_session_jwt { + let (cli_inner, _dsc) = client.client_inner(); + cli_inner.set_jwt_session(&payload).await?; + } Client::Local(client) } _ => { @@ -848,12 +630,6 @@ static HEADERS_TO_FORWARD: &[&HeaderName] = &[ &TXN_DEFERRABLE, ]; -pub(crate) fn uuid_to_header_value(id: Uuid) -> HeaderValue { - let mut uuid = [0; uuid::fmt::Hyphenated::LENGTH]; - HeaderValue::from_str(id.as_hyphenated().encode_lower(&mut uuid[..])) - .expect("uuid hyphenated format should be all valid header characters") -} - async fn handle_auth_broker_inner( ctx: &RequestContext, request: Request, @@ -883,7 +659,7 @@ async fn handle_auth_broker_inner( req = req.header(&NEON_REQUEST_ID, uuid_to_header_value(ctx.session_id())); let req = req - .body(body) + .body(body.map_err(|e| e).boxed()) //TODO: is there a potential for a regression here? .expect("all headers and params received via hyper should be valid for request"); // todo: map body to count egress
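    // Note: the client's body is forwarded unchanged; it is boxed above only so it matches the
    // BoxBody request type now used by the pooled HTTP/2 SendRequest (see the Send/Connect
    // aliases in http_conn_pool.rs).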