From a963aab14b80062acdfac54af29f876b0f044552 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 3 Jun 2025 15:57:36 +0100 Subject: [PATCH 1/4] pagserver: set default wal receiver proto to interpreted (#12100) ## Problem This is already the default in production and in our test suite. ## Summary of changes Set the default proto to interpreted to reduce friction when spinning up new regions or cells. --- libs/pageserver_api/src/config.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 7ca623f8e5..237833b9de 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -608,7 +608,10 @@ pub mod defaults { pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512; pub const DEFAULT_WAL_RECEIVER_PROTOCOL: utils::postgres_client::PostgresClientProtocol = - utils::postgres_client::PostgresClientProtocol::Vanilla; + utils::postgres_client::PostgresClientProtocol::Interpreted { + format: utils::postgres_client::InterpretedFormat::Protobuf, + compression: Some(utils::postgres_client::Compression::Zstd { level: 1 }), + }; pub const DEFAULT_SSL_KEY_FILE: &str = "server.key"; pub const DEFAULT_SSL_CERT_FILE: &str = "server.crt"; From 4a3f32bf4a2865db28aa8d99e80e4766169e0edf Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 3 Jun 2025 11:00:56 -0500 Subject: [PATCH 2/4] Clean up compute_tools::http::JsonResponse::invalid_status() (#12110) JsonResponse::error() properly logs an error message which can be read in the compute logs. invalid_status() was not going through that helper function, thus not logging anything. Signed-off-by: Tristan Partin --- compute_tools/src/http/mod.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/compute_tools/src/http/mod.rs b/compute_tools/src/http/mod.rs index 9ecc1b0093..9b01def966 100644 --- a/compute_tools/src/http/mod.rs +++ b/compute_tools/src/http/mod.rs @@ -48,11 +48,9 @@ impl JsonResponse { /// Create an error response related to the compute being in an invalid state pub(self) fn invalid_status(status: ComputeStatus) -> Response { - Self::create_response( + Self::error( StatusCode::PRECONDITION_FAILED, - &GenericAPIError { - error: format!("invalid compute status: {status}"), - }, + format!("invalid compute status: {status}"), ) } } From c698cee19ac8378fe536d65140dd24009e65ab35 Mon Sep 17 00:00:00 2001 From: Mikhail Date: Wed, 4 Jun 2025 06:38:03 +0100 Subject: [PATCH 3/4] ComputeSpec: prewarm_lfc_on_startup -> autoprewarm (#12120) https://github.com/neondatabase/cloud/pull/29472 https://github.com/neondatabase/cloud/issues/26346 --- compute_tools/src/compute.rs | 10 +++++----- compute_tools/tests/pg_helpers_tests.rs | 2 +- control_plane/src/endpoint.rs | 2 +- libs/compute_api/src/spec.rs | 4 ++-- libs/compute_api/tests/cluster_spec.json | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index d678b7d670..1ed71180d0 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -396,7 +396,7 @@ impl ComputeNode { // because QEMU will already have its memory allocated from the host, and // the necessary binaries will already be cached. if cli_spec.is_none() { - this.prewarm_postgres()?; + this.prewarm_postgres_vm_memory()?; } // Set the up metric with Empty status before starting the HTTP server. @@ -779,7 +779,7 @@ impl ComputeNode { // Spawn the extension stats background task self.spawn_extension_stats_task(); - if pspec.spec.prewarm_lfc_on_startup { + if pspec.spec.autoprewarm { self.prewarm_lfc(); } Ok(()) @@ -1307,8 +1307,8 @@ impl ComputeNode { } /// Start and stop a postgres process to warm up the VM for startup. - pub fn prewarm_postgres(&self) -> Result<()> { - info!("prewarming"); + pub fn prewarm_postgres_vm_memory(&self) -> Result<()> { + info!("prewarming VM memory"); // Create pgdata let pgdata = &format!("{}.warmup", self.params.pgdata); @@ -1350,7 +1350,7 @@ impl ComputeNode { kill(pm_pid, Signal::SIGQUIT)?; info!("sent SIGQUIT signal"); pg.wait()?; - info!("done prewarming"); + info!("done prewarming vm memory"); // clean up let _ok = fs::remove_dir_all(pgdata); diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index 04b6ed2256..2b865c75d0 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -30,7 +30,7 @@ mod pg_helpers_tests { r#"fsync = off wal_level = logical hot_standby = on -prewarm_lfc_on_startup = off +autoprewarm = off neon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501' wal_log_hints = on log_connections = on diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 708745446d..774a0053f8 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -747,7 +747,7 @@ impl Endpoint { logs_export_host: None::, endpoint_storage_addr: Some(endpoint_storage_addr), endpoint_storage_token: Some(endpoint_storage_token), - prewarm_lfc_on_startup: false, + autoprewarm: false, }; // this strange code is needed to support respec() in tests diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 09b550b96c..9b7cf43bb9 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -178,9 +178,9 @@ pub struct ComputeSpec { /// JWT for authorizing requests to endpoint storage service pub endpoint_storage_token: Option, - /// If true, download LFC state from endpoint_storage and pass it to Postgres on startup + /// Download LFC state from endpoint_storage and pass it to Postgres on startup #[serde(default)] - pub prewarm_lfc_on_startup: bool, + pub autoprewarm: bool, } /// Feature flag to signal `compute_ctl` to enable certain experimental functionality. diff --git a/libs/compute_api/tests/cluster_spec.json b/libs/compute_api/tests/cluster_spec.json index 30e788a601..2dd2aae015 100644 --- a/libs/compute_api/tests/cluster_spec.json +++ b/libs/compute_api/tests/cluster_spec.json @@ -85,7 +85,7 @@ "vartype": "bool" }, { - "name": "prewarm_lfc_on_startup", + "name": "autoprewarm", "value": "off", "vartype": "bool" }, From c567ed0de0837bc3923d39ca8706ac353a2ba50b Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 4 Jun 2025 14:41:42 +0800 Subject: [PATCH 4/4] feat(pageserver): feature flag counter metrics (#12112) ## Problem Part of https://github.com/neondatabase/neon/issues/11813 ## Summary of changes Add a counter on the feature evaluation outcome and we will set up alerts for too many failed evaluations in the future. Signed-off-by: Alex Chi Z --- libs/posthog_client_lite/src/lib.rs | 10 ++++++++ pageserver/src/feature_resolver.rs | 36 +++++++++++++++++++++++++---- pageserver/src/metrics.rs | 9 ++++++++ 3 files changed, 50 insertions(+), 5 deletions(-) diff --git a/libs/posthog_client_lite/src/lib.rs b/libs/posthog_client_lite/src/lib.rs index ff12051196..281a8f8011 100644 --- a/libs/posthog_client_lite/src/lib.rs +++ b/libs/posthog_client_lite/src/lib.rs @@ -22,6 +22,16 @@ pub enum PostHogEvaluationError { Internal(String), } +impl PostHogEvaluationError { + pub fn as_variant_str(&self) -> &'static str { + match self { + PostHogEvaluationError::NotAvailable(_) => "not_available", + PostHogEvaluationError::NoConditionGroupMatched => "no_condition_group_matched", + PostHogEvaluationError::Internal(_) => "internal", + } + } +} + #[derive(Deserialize)] pub struct LocalEvaluationResponse { pub flags: Vec, diff --git a/pageserver/src/feature_resolver.rs b/pageserver/src/feature_resolver.rs index 7e31b930d0..7463186c06 100644 --- a/pageserver/src/feature_resolver.rs +++ b/pageserver/src/feature_resolver.rs @@ -6,7 +6,7 @@ use posthog_client_lite::{ use tokio_util::sync::CancellationToken; use utils::id::TenantId; -use crate::config::PageServerConf; +use crate::{config::PageServerConf, metrics::FEATURE_FLAG_EVALUATION}; #[derive(Clone)] pub struct FeatureResolver { @@ -55,11 +55,24 @@ impl FeatureResolver { tenant_id: TenantId, ) -> Result { if let Some(inner) = &self.inner { - inner.feature_store().evaluate_multivariate( + let res = inner.feature_store().evaluate_multivariate( flag_key, &tenant_id.to_string(), &HashMap::new(), - ) + ); + match &res { + Ok(value) => { + FEATURE_FLAG_EVALUATION + .with_label_values(&[flag_key, "ok", value]) + .inc(); + } + Err(e) => { + FEATURE_FLAG_EVALUATION + .with_label_values(&[flag_key, "error", e.as_variant_str()]) + .inc(); + } + } + res } else { Err(PostHogEvaluationError::NotAvailable( "PostHog integration is not enabled".to_string(), @@ -80,11 +93,24 @@ impl FeatureResolver { tenant_id: TenantId, ) -> Result<(), PostHogEvaluationError> { if let Some(inner) = &self.inner { - inner.feature_store().evaluate_boolean( + let res = inner.feature_store().evaluate_boolean( flag_key, &tenant_id.to_string(), &HashMap::new(), - ) + ); + match &res { + Ok(()) => { + FEATURE_FLAG_EVALUATION + .with_label_values(&[flag_key, "ok", "true"]) + .inc(); + } + Err(e) => { + FEATURE_FLAG_EVALUATION + .with_label_values(&[flag_key, "error", e.as_variant_str()]) + .inc(); + } + } + res } else { Err(PostHogEvaluationError::NotAvailable( "PostHog integration is not enabled".to_string(), diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 4bef9b44a7..3173ab221f 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -446,6 +446,15 @@ static PAGE_CACHE_ERRORS: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +pub(crate) static FEATURE_FLAG_EVALUATION: Lazy = Lazy::new(|| { + register_counter_vec!( + "pageserver_feature_flag_evaluation", + "Number of times a feature flag is evaluated", + &["flag_key", "status", "value"], + ) + .unwrap() +}); + #[derive(IntoStaticStr)] #[strum(serialize_all = "kebab_case")] pub(crate) enum PageCacheErrorKind {