diff --git a/Cargo.lock b/Cargo.lock index aea8924f4f..5d2cdcea27 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2837,6 +2837,7 @@ dependencies = [ "utils", "uuid", "workspace_hack", + "x509-cert", ] [[package]] diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index ad8925e7ab..06d5bbb9c5 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -523,11 +523,14 @@ impl ComputeNode { let pspec = compute_state.pspec.as_ref().expect("spec must be set"); info!( - "starting compute for project {}, operation {}, tenant {}, timeline {}, features {:?}, spec.remote_extensions {:?}", + "starting compute for project {}, operation {}, tenant {}, timeline {}, project {}, branch {}, endpoint {}, features {:?}, spec.remote_extensions {:?}", pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"), pspec.spec.operation_uuid.as_deref().unwrap_or("None"), pspec.tenant_id, pspec.timeline_id, + pspec.spec.project_id.as_deref().unwrap_or("None"), + pspec.spec.branch_id.as_deref().unwrap_or("None"), + pspec.spec.endpoint_id.as_deref().unwrap_or("None"), pspec.spec.features, pspec.spec.remote_extensions, ); @@ -631,19 +634,23 @@ impl ComputeNode { }); } - // Configure and start rsyslog for HIPAA if necessary - if let ComputeAudit::Hipaa = pspec.spec.audit_log_level { - let remote_endpoint = std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string()); - if remote_endpoint.is_empty() { - anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty"); + // Configure and start rsyslog for compliance audit logging + match pspec.spec.audit_log_level { + ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => { + let remote_endpoint = + std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string()); + if remote_endpoint.is_empty() { + anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty"); + } + + let log_directory_path = Path::new(&self.params.pgdata).join("log"); + let log_directory_path = log_directory_path.to_string_lossy().to_string(); + configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?; + + // Launch a background task to clean up the audit logs + launch_pgaudit_gc(log_directory_path); } - - let log_directory_path = Path::new(&self.params.pgdata).join("log"); - let log_directory_path = log_directory_path.to_string_lossy().to_string(); - configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?; - - // Launch a background task to clean up the audit logs - launch_pgaudit_gc(log_directory_path); + _ => {} } // Configure and start rsyslog for Postgres logs export diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 92939f816c..71c6123c3b 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -89,6 +89,15 @@ pub fn write_postgres_conf( escape_conf_value(&s.to_string()) )?; } + if let Some(s) = &spec.project_id { + writeln!(file, "neon.project_id={}", escape_conf_value(s))?; + } + if let Some(s) = &spec.branch_id { + writeln!(file, "neon.branch_id={}", escape_conf_value(s))?; + } + if let Some(s) = &spec.endpoint_id { + writeln!(file, "neon.endpoint_id={}", escape_conf_value(s))?; + } // tls if let Some(tls_config) = tls_config { @@ -169,7 +178,7 @@ pub fn write_postgres_conf( // and don't allow the user or the control plane admin to change them. match spec.audit_log_level { ComputeAudit::Disabled => {} - ComputeAudit::Log => { + ComputeAudit::Log | ComputeAudit::Base => { writeln!(file, "# Managed by compute_ctl base audit settings: start")?; writeln!(file, "pgaudit.log='ddl,role'")?; // Disable logging of catalog queries to reduce the noise @@ -193,16 +202,20 @@ pub fn write_postgres_conf( } writeln!(file, "# Managed by compute_ctl base audit settings: end")?; } - ComputeAudit::Hipaa => { + ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => { writeln!( file, "# Managed by compute_ctl compliance audit settings: begin" )?; - // This log level is very verbose - // but this is necessary for HIPAA compliance. - // Exclude 'misc' category, because it doesn't contain anythig relevant. - writeln!(file, "pgaudit.log='all, -misc'")?; - writeln!(file, "pgaudit.log_parameter=on")?; + // Enable logging of parameters. + // This is very verbose and may contain sensitive data. + if spec.audit_log_level == ComputeAudit::Full { + writeln!(file, "pgaudit.log_parameter=on")?; + writeln!(file, "pgaudit.log='all'")?; + } else { + writeln!(file, "pgaudit.log_parameter=off")?; + writeln!(file, "pgaudit.log='all, -misc'")?; + } // Disable logging of catalog queries // The catalog doesn't contain sensitive data, so we don't need to audit it. writeln!(file, "pgaudit.log_catalog=off")?; diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs index e7d67f6ac5..0d1389dbad 100644 --- a/compute_tools/src/spec_apply.rs +++ b/compute_tools/src/spec_apply.rs @@ -278,12 +278,12 @@ impl ComputeNode { // so that all config operations are audit logged. match spec.audit_log_level { - ComputeAudit::Hipaa => { + ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => { phases.push(CreatePgauditExtension); phases.push(CreatePgauditlogtofileExtension); phases.push(DisablePostgresDBPgAudit); } - ComputeAudit::Log => { + ComputeAudit::Log | ComputeAudit::Base => { phases.push(CreatePgauditExtension); phases.push(DisablePostgresDBPgAudit); } diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 3137bde161..663c024953 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -658,6 +658,9 @@ impl Endpoint { delta_operations: None, tenant_id: Some(self.tenant_id), timeline_id: Some(self.timeline_id), + project_id: None, + branch_id: None, + endpoint_id: Some(self.endpoint_id.clone()), mode: self.mode, pageserver_connstring: Some(pageserver_connstring), safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()), diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 8000576e87..a4b56ae5c0 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -13,7 +13,9 @@ use pageserver_api::controller_api::{ NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest, TenantCreateResponse, TenantLocateResponse, }; -use pageserver_api::models::{TenantConfigRequest, TimelineCreateRequest, TimelineInfo}; +use pageserver_api::models::{ + TenantConfig, TenantConfigRequest, TimelineCreateRequest, TimelineInfo, +}; use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api::ResponseErrorMessageExt; use postgres_backend::AuthType; @@ -82,7 +84,8 @@ impl NeonStorageControllerStopArgs { pub struct AttachHookRequest { pub tenant_shard_id: TenantShardId, pub node_id: Option, - pub generation_override: Option, + pub generation_override: Option, // only new tenants + pub config: Option, // only new tenants } #[derive(Serialize, Deserialize)] @@ -805,6 +808,7 @@ impl StorageController { tenant_shard_id, node_id: Some(pageserver_id), generation_override: None, + config: None, }; let response = self diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 994a665a88..82950bcbaa 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -104,6 +104,12 @@ pub struct ComputeSpec { pub timeline_id: Option, pub pageserver_connstring: Option, + // More neon ids that we expose to the compute_ctl + // and to postgres as neon extension GUCs. + pub project_id: Option, + pub branch_id: Option, + pub endpoint_id: Option, + /// Safekeeper membership config generation. It is put in /// neon.safekeepers GUC and serves two purposes: /// 1) Non zero value forces walproposer to use membership configurations. @@ -159,13 +165,7 @@ pub struct ComputeSpec { #[serde(default)] // Default false pub drop_subscriptions_before_start: bool, - /// Log level for audit logging: - /// - /// Disabled - no audit logging. This is the default. - /// log - log masked statements to the postgres log using pgaudit extension - /// hipaa - log unmasked statements to the file using pgaudit and pgauditlogtofile extension - /// - /// Extensions should be present in shared_preload_libraries + /// Log level for compute audit logging #[serde(default)] pub audit_log_level: ComputeAudit, @@ -289,14 +289,25 @@ impl ComputeMode { } /// Log level for audit logging -/// Disabled, log, hipaa -/// Default is Disabled #[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)] pub enum ComputeAudit { #[default] Disabled, + // Deprecated, use Base instead Log, + // (pgaudit.log = 'ddl', pgaudit.log_parameter='off') + // logged to the standard postgresql log stream + Base, + // Deprecated, use Full or Extended instead Hipaa, + // (pgaudit.log = 'all, -misc', pgaudit.log_parameter='off') + // logged to separate files collected by rsyslog + // into dedicated log storage with strict access + Extended, + // (pgaudit.log='all', pgaudit.log_parameter='on'), + // logged to separate files collected by rsyslog + // into dedicated log storage with strict access. + Full, } #[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)] diff --git a/libs/http-utils/Cargo.toml b/libs/http-utils/Cargo.toml index 6d24ee352a..5f6578f76e 100644 --- a/libs/http-utils/Cargo.toml +++ b/libs/http-utils/Cargo.toml @@ -30,6 +30,7 @@ tokio.workspace = true tracing.workspace = true url.workspace = true uuid.workspace = true +x509-cert.workspace = true # to use tokio channels as streams, this is faster to compile than async_stream # why is it only here? no other crate should use it, streams are rarely needed. diff --git a/libs/http-utils/src/server.rs b/libs/http-utils/src/server.rs index 07fd56ac01..f93f71c962 100644 --- a/libs/http-utils/src/server.rs +++ b/libs/http-utils/src/server.rs @@ -4,6 +4,8 @@ use futures::StreamExt; use futures::stream::FuturesUnordered; use hyper0::Body; use hyper0::server::conn::Http; +use metrics::{IntCounterVec, register_int_counter_vec}; +use once_cell::sync::Lazy; use routerify::{RequestService, RequestServiceBuilder}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_rustls::TlsAcceptor; @@ -26,6 +28,24 @@ pub struct Server { tls_acceptor: Option, } +static CONNECTION_STARTED_COUNT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "http_server_connection_started_total", + "Number of established http/https connections", + &["scheme"] + ) + .expect("failed to define a metric") +}); + +static CONNECTION_ERROR_COUNT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "http_server_connection_errors_total", + "Number of occured connection errors by type", + &["type"] + ) + .expect("failed to define a metric") +}); + impl Server { pub fn new( request_service: Arc>, @@ -60,6 +80,15 @@ impl Server { false } + let tcp_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tcp"]); + let tls_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tls"]); + let http_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["http"]); + let https_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["https"]); + let panic_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["panic"]); + + let http_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["http"]); + let https_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["https"]); + let mut connections = FuturesUnordered::new(); loop { tokio::select! { @@ -67,6 +96,7 @@ impl Server { let (tcp_stream, remote_addr) = match stream { Ok(stream) => stream, Err(err) => { + tcp_error_cnt.inc(); if !suppress_io_error(&err) { info!("Failed to accept TCP connection: {err:#}"); } @@ -78,11 +108,18 @@ impl Server { let tls_acceptor = self.tls_acceptor.clone(); let cancel = cancel.clone(); + let tls_error_cnt = tls_error_cnt.clone(); + let http_error_cnt = http_error_cnt.clone(); + let https_error_cnt = https_error_cnt.clone(); + let http_connection_cnt = http_connection_cnt.clone(); + let https_connection_cnt = https_connection_cnt.clone(); + connections.push(tokio::spawn( async move { match tls_acceptor { Some(tls_acceptor) => { // Handle HTTPS connection. + https_connection_cnt.inc(); let tls_stream = tokio::select! { tls_stream = tls_acceptor.accept(tcp_stream) => tls_stream, _ = cancel.cancelled() => return, @@ -90,6 +127,7 @@ impl Server { let tls_stream = match tls_stream { Ok(tls_stream) => tls_stream, Err(err) => { + tls_error_cnt.inc(); if !suppress_io_error(&err) { info!(%remote_addr, "Failed to accept TLS connection: {err:#}"); } @@ -97,6 +135,7 @@ impl Server { } }; if let Err(err) = Self::serve_connection(tls_stream, service, cancel).await { + https_error_cnt.inc(); if !suppress_hyper_error(&err) { info!(%remote_addr, "Failed to serve HTTPS connection: {err:#}"); } @@ -104,7 +143,9 @@ impl Server { } None => { // Handle HTTP connection. + http_connection_cnt.inc(); if let Err(err) = Self::serve_connection(tcp_stream, service, cancel).await { + http_error_cnt.inc(); if !suppress_hyper_error(&err) { info!(%remote_addr, "Failed to serve HTTP connection: {err:#}"); } @@ -115,6 +156,7 @@ impl Server { } Some(conn) = connections.next() => { if let Err(err) = conn { + panic_error_cnt.inc(); error!("Connection panicked: {err:#}"); } } @@ -122,6 +164,7 @@ impl Server { // Wait for graceful shutdown of all connections. while let Some(conn) = connections.next().await { if let Err(err) = conn { + panic_error_cnt.inc(); error!("Connection panicked: {err:#}"); } } diff --git a/libs/http-utils/src/tls_certs.rs b/libs/http-utils/src/tls_certs.rs index 0c18d84d98..2799db78a6 100644 --- a/libs/http-utils/src/tls_certs.rs +++ b/libs/http-utils/src/tls_certs.rs @@ -3,11 +3,14 @@ use std::{sync::Arc, time::Duration}; use anyhow::Context; use arc_swap::ArcSwap; use camino::Utf8Path; +use metrics::{IntCounterVec, UIntGaugeVec, register_int_counter_vec, register_uint_gauge_vec}; +use once_cell::sync::Lazy; use rustls::{ - pki_types::{CertificateDer, PrivateKeyDer}, + pki_types::{CertificateDer, PrivateKeyDer, UnixTime}, server::{ClientHello, ResolvesServerCert}, sign::CertifiedKey, }; +use x509_cert::der::Reader; pub async fn load_cert_chain(filename: &Utf8Path) -> anyhow::Result>> { let cert_data = tokio::fs::read(filename) @@ -53,6 +56,76 @@ pub async fn load_certified_key( Ok(certified_key) } +/// rustls's CertifiedKey with extra parsed fields used for metrics. +struct ParsedCertifiedKey { + certified_key: CertifiedKey, + expiration_time: UnixTime, +} + +/// Parse expiration time from an X509 certificate. +fn parse_expiration_time(cert: &CertificateDer<'_>) -> anyhow::Result { + let parsed_cert = x509_cert::der::SliceReader::new(cert) + .context("Failed to parse cerficiate")? + .decode::() + .context("Failed to parse cerficiate")?; + + Ok(UnixTime::since_unix_epoch( + parsed_cert + .tbs_certificate + .validity + .not_after + .to_unix_duration(), + )) +} + +async fn load_and_parse_certified_key( + key_filename: &Utf8Path, + cert_filename: &Utf8Path, +) -> anyhow::Result { + let certified_key = load_certified_key(key_filename, cert_filename).await?; + let expiration_time = parse_expiration_time(certified_key.end_entity_cert()?)?; + Ok(ParsedCertifiedKey { + certified_key, + expiration_time, + }) +} + +static CERT_EXPIRATION_TIME: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "tls_certs_expiration_time_seconds", + "Expiration time of the loaded certificate since unix epoch in seconds", + &["resolver_name"] + ) + .expect("failed to define a metric") +}); + +static CERT_RELOAD_STARTED_COUNTER: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "tls_certs_reload_started_total", + "Number of certificate reload loop iterations started", + &["resolver_name"] + ) + .expect("failed to define a metric") +}); + +static CERT_RELOAD_UPDATED_COUNTER: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "tls_certs_reload_updated_total", + "Number of times the certificate was updated to the new one", + &["resolver_name"] + ) + .expect("failed to define a metric") +}); + +static CERT_RELOAD_FAILED_COUNTER: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "tls_certs_reload_failed_total", + "Number of times the certificate reload failed", + &["resolver_name"] + ) + .expect("failed to define a metric") +}); + /// Implementation of [`rustls::server::ResolvesServerCert`] which reloads certificates from /// the disk periodically. #[derive(Debug)] @@ -63,16 +136,28 @@ pub struct ReloadingCertificateResolver { impl ReloadingCertificateResolver { /// Creates a new Resolver by loading certificate and private key from FS and /// creating tokio::task to reload them with provided reload_period. + /// resolver_name is used as metric's label. pub async fn new( + resolver_name: &str, key_filename: &Utf8Path, cert_filename: &Utf8Path, reload_period: Duration, ) -> anyhow::Result> { + // Create metrics for current resolver. + let cert_expiration_time = CERT_EXPIRATION_TIME.with_label_values(&[resolver_name]); + let cert_reload_started_counter = + CERT_RELOAD_STARTED_COUNTER.with_label_values(&[resolver_name]); + let cert_reload_updated_counter = + CERT_RELOAD_UPDATED_COUNTER.with_label_values(&[resolver_name]); + let cert_reload_failed_counter = + CERT_RELOAD_FAILED_COUNTER.with_label_values(&[resolver_name]); + + let parsed_key = load_and_parse_certified_key(key_filename, cert_filename).await?; + let this = Arc::new(Self { - certified_key: ArcSwap::from_pointee( - load_certified_key(key_filename, cert_filename).await?, - ), + certified_key: ArcSwap::from_pointee(parsed_key.certified_key), }); + cert_expiration_time.set(parsed_key.expiration_time.as_secs()); tokio::spawn({ let weak_this = Arc::downgrade(&this); @@ -88,17 +173,22 @@ impl ReloadingCertificateResolver { Some(this) => this, None => break, // Resolver has been destroyed, exit. }; - match load_certified_key(&key_filename, &cert_filename).await { - Ok(new_certified_key) => { - if new_certified_key.cert == this.certified_key.load().cert { + cert_reload_started_counter.inc(); + + match load_and_parse_certified_key(&key_filename, &cert_filename).await { + Ok(parsed_key) => { + if parsed_key.certified_key.cert == this.certified_key.load().cert { tracing::debug!("Certificate has not changed since last reloading"); } else { tracing::info!("Certificate has been reloaded"); - this.certified_key.store(Arc::new(new_certified_key)); + this.certified_key.store(Arc::new(parsed_key.certified_key)); + cert_expiration_time.set(parsed_key.expiration_time.as_secs()); + cert_reload_updated_counter.inc(); } last_reload_failed = false; } Err(err) => { + cert_reload_failed_counter.inc(); // Note: Reloading certs may fail if it conflicts with the script updating // the files at the same time. Warn only if the error is persistent. if last_reload_failed { diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 8f56d60a4a..bd9f7efb7f 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -180,6 +180,7 @@ pub struct ConfigToml { #[serde(skip_serializing_if = "Option::is_none")] pub generate_unarchival_heatmap: Option, pub tracing: Option, + pub enable_tls_page_service_api: bool, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -631,6 +632,7 @@ impl Default for ConfigToml { load_previous_heatmap: None, generate_unarchival_heatmap: None, tracing: None, + enable_tls_page_service_api: false, } } } diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 3cb62f9d18..91f9c03ba4 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -7,7 +7,8 @@ use std::time::{Duration, Instant}; /// API (`/control/v1` prefix). Implemented by the server /// in [`storage_controller::http`] use serde::{Deserialize, Serialize}; -use utils::id::{NodeId, TenantId}; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::lsn::Lsn; use crate::models::{PageserverUtilization, ShardParameters, TenantConfig}; use crate::shard::{ShardStripeSize, TenantShardId}; @@ -499,6 +500,15 @@ pub struct SafekeeperSchedulingPolicyRequest { pub scheduling_policy: SkSchedulingPolicy, } +/// Import request for safekeeper timelines. +#[derive(Serialize, Deserialize, Clone)] +pub struct TimelineImportRequest { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub start_lsn: Lsn, + pub sk_set: Vec, +} + #[cfg(test)] mod test { use serde_json; diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 8836e7ec87..0c4d7fd4cb 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -927,7 +927,7 @@ impl Key { /// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`. #[inline(always)] - pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> { + pub fn to_rel_block(self) -> Result<(RelTag, BlockNumber), ToRelBlockError> { Ok(match self.field1 { 0x00 => ( RelTag { @@ -938,7 +938,7 @@ impl Key { }, self.field6, ), - _ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1), + _ => return Err(ToRelBlockError(self.field1)), }) } } @@ -951,6 +951,17 @@ impl std::str::FromStr for Key { } } +#[derive(Debug)] +pub struct ToRelBlockError(u8); + +impl fmt::Display for ToRelBlockError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "unexpected value kind 0x{:02x}", self.0) + } +} + +impl std::error::Error for ToRelBlockError {} + #[cfg(test)] mod tests { use std::str::FromStr; diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 54fecee588..250d4180f5 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -452,6 +452,24 @@ fn start_pageserver( info!("Using auth for http API: {:#?}", conf.http_auth_type); info!("Using auth for pg connections: {:#?}", conf.pg_auth_type); + let tls_server_config = if conf.listen_https_addr.is_some() || conf.enable_tls_page_service_api + { + let resolver = BACKGROUND_RUNTIME.block_on(ReloadingCertificateResolver::new( + "main", + &conf.ssl_key_file, + &conf.ssl_cert_file, + conf.ssl_cert_reload_period, + ))?; + + let server_config = rustls::ServerConfig::builder() + .with_no_client_auth() + .with_cert_resolver(resolver); + + Some(Arc::new(server_config)) + } else { + None + }; + match var("NEON_AUTH_TOKEN") { Ok(v) => { info!("Loaded JWT token for authentication with Safekeeper"); @@ -670,17 +688,11 @@ fn start_pageserver( let https_task = match https_listener { Some(https_listener) => { - let resolver = MGMT_REQUEST_RUNTIME.block_on(ReloadingCertificateResolver::new( - &conf.ssl_key_file, - &conf.ssl_cert_file, - conf.ssl_cert_reload_period, - ))?; + let tls_server_config = tls_server_config + .clone() + .expect("tls_server_config is set earlier if https is enabled"); - let server_config = rustls::ServerConfig::builder() - .with_no_client_auth() - .with_cert_resolver(resolver); - - let tls_acceptor = tokio_rustls::TlsAcceptor::from(Arc::new(server_config)); + let tls_acceptor = tokio_rustls::TlsAcceptor::from(tls_server_config); let server = http_utils::server::Server::new(service, https_listener, Some(tls_acceptor))?; @@ -736,6 +748,11 @@ fn start_pageserver( tokio::net::TcpListener::from_std(pageserver_listener) .context("create tokio listener")? }, + if conf.enable_tls_page_service_api { + tls_server_config + } else { + None + }, ); // All started up! Now just sit and wait for shutdown signal. diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index ccc29e59d4..26ae6af70e 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -219,6 +219,11 @@ pub struct PageServerConf { pub generate_unarchival_heatmap: bool, pub tracing: Option, + + /// Enable TLS in page service API. + /// Does not force TLS: the client negotiates TLS usage during the handshake. + /// Uses key and certificate from ssl_key_file/ssl_cert_file. + pub enable_tls_page_service_api: bool, } /// Token for authentication to safekeepers @@ -391,6 +396,7 @@ impl PageServerConf { load_previous_heatmap, generate_unarchival_heatmap, tracing, + enable_tls_page_service_api, } = config_toml; let mut conf = PageServerConf { @@ -441,6 +447,7 @@ impl PageServerConf { page_service_pipelining, get_vectored_concurrent_io, tracing, + enable_tls_page_service_api, // ------------------------------------------------------------ // fields that require additional validation or custom handling diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 9bb761dc48..bbc4bfae1b 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -3253,7 +3253,7 @@ async fn ingest_aux_files( modification .put_file(&fname, content.as_bytes(), &ctx) .await - .map_err(ApiError::InternalServerError)?; + .map_err(|e| ApiError::InternalServerError(e.into()))?; } modification .commit(&ctx) diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 6dd005de50..911449c7c5 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -27,7 +27,7 @@ use crate::context::RequestContext; use crate::metrics::WAL_INGEST; use crate::pgdatadir_mapping::*; use crate::tenant::Timeline; -use crate::walingest::WalIngest; +use crate::walingest::{WalIngest, WalIngestErrorKind}; // Returns checkpoint LSN from controlfile pub fn get_lsn_from_controlfile(path: &Utf8Path) -> Result { @@ -157,9 +157,9 @@ async fn import_rel( .put_rel_creation(rel, nblocks as u32, ctx) .await { - match e { - RelationError::AlreadyExists => { - debug!("Relation {} already exist. We must be extending it.", rel) + match e.kind { + WalIngestErrorKind::RelationAlreadyExists(rel) => { + debug!("Relation {rel} already exists. We must be extending it.") } _ => return Err(e.into()), } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 7e3991dbdc..61f524fc29 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -105,6 +105,7 @@ pub fn spawn( pg_auth: Option>, perf_trace_dispatch: Option, tcp_listener: tokio::net::TcpListener, + tls_config: Option>, ) -> Listener { let cancel = CancellationToken::new(); let libpq_ctx = RequestContext::todo_child( @@ -124,6 +125,7 @@ pub fn spawn( perf_trace_dispatch, tcp_listener, conf.pg_auth_type, + tls_config, conf.page_service_pipelining.clone(), libpq_ctx, cancel.clone(), @@ -181,6 +183,7 @@ pub async fn libpq_listener_main( perf_trace_dispatch: Option, listener: tokio::net::TcpListener, auth_type: AuthType, + tls_config: Option>, pipelining_config: PageServicePipeliningConfig, listener_ctx: RequestContext, listener_cancel: CancellationToken, @@ -223,6 +226,7 @@ pub async fn libpq_listener_main( local_auth, socket, auth_type, + tls_config.clone(), pipelining_config.clone(), connection_ctx, connections_cancel.child_token(), @@ -264,6 +268,7 @@ async fn page_service_conn_main( auth: Option>, socket: tokio::net::TcpStream, auth_type: AuthType, + tls_config: Option>, pipelining_config: PageServicePipeliningConfig, connection_ctx: RequestContext, cancel: CancellationToken, @@ -334,7 +339,8 @@ async fn page_service_conn_main( cancel.clone(), gate_guard, ); - let pgbackend = PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, None)?; + let pgbackend = + PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, tls_config)?; match pgbackend.run(&mut conn_handler, &cancel).await { Ok(()) => { diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 4c5a07ba57..f33a8baec1 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -9,8 +9,9 @@ use std::collections::{BTreeMap, HashMap, HashSet, hash_map}; use std::ops::{ControlFlow, Range}; -use crate::PERF_TRACE_TARGET; -use anyhow::{Context, ensure}; +use crate::walingest::{WalIngestError, WalIngestErrorKind}; +use crate::{PERF_TRACE_TARGET, ensure_walingest}; +use anyhow::Context; use bytes::{Buf, Bytes, BytesMut}; use enum_map::Enum; use itertools::Itertools; @@ -136,12 +137,8 @@ impl From for CalculateLogicalSizeError { #[derive(Debug, thiserror::Error)] pub enum RelationError { - #[error("Relation Already Exists")] - AlreadyExists, #[error("invalid relnode")] InvalidRelnode, - #[error(transparent)] - Other(#[from] anyhow::Error), } /// @@ -1478,8 +1475,8 @@ impl DatadirModification<'_> { } /// Set the current lsn - pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> { - ensure!( + pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> Result<(), WalIngestError> { + ensure_walingest!( lsn >= self.lsn, "setting an older lsn {} than {} is not allowed", lsn, @@ -1578,7 +1575,7 @@ impl DatadirModification<'_> { &mut self, rel: RelTag, ctx: &RequestContext, - ) -> Result { + ) -> Result { // Get current size and put rel creation if rel doesn't exist // // NOTE: we check the cache first even though get_rel_exists and get_rel_size would @@ -1593,14 +1590,13 @@ impl DatadirModification<'_> { .await? { // create it with 0 size initially, the logic below will extend it - self.put_rel_creation(rel, 0, ctx) - .await - .context("Relation Error")?; + self.put_rel_creation(rel, 0, ctx).await?; Ok(0) } else { - self.tline + Ok(self + .tline .get_rel_size(rel, Version::Modified(self), ctx) - .await + .await?) } } @@ -1637,11 +1633,14 @@ impl DatadirModification<'_> { // TODO(vlad): remove this argument and replace the shard check with is_key_local shard: &ShardIdentity, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let mut gaps_at_lsns = Vec::default(); for meta in batch.metadata.iter() { - let (rel, blkno) = Key::from_compact(meta.key()).to_rel_block()?; + let key = Key::from_compact(meta.key()); + let (rel, blkno) = key + .to_rel_block() + .map_err(|_| WalIngestErrorKind::InvalidKey(key, meta.lsn()))?; let new_nblocks = blkno + 1; let old_nblocks = self.create_relation_if_required(rel, ctx).await?; @@ -1683,8 +1682,8 @@ impl DatadirModification<'_> { rel: RelTag, blknum: BlockNumber, rec: NeonWalRecord, - ) -> anyhow::Result<()> { - anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); + ) -> Result<(), WalIngestError> { + ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode); self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec)); Ok(()) } @@ -1696,7 +1695,7 @@ impl DatadirModification<'_> { segno: u32, blknum: BlockNumber, rec: NeonWalRecord, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { if !self.tline.tenant_shard_id.is_shard_zero() { return Ok(()); } @@ -1714,14 +1713,11 @@ impl DatadirModification<'_> { rel: RelTag, blknum: BlockNumber, img: Bytes, - ) -> anyhow::Result<()> { - anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); + ) -> Result<(), WalIngestError> { + ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode); let key = rel_block_to_key(rel, blknum); if !key.is_valid_key_on_write_path() { - anyhow::bail!( - "the request contains data not supported by pageserver at {}", - key - ); + Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?; } self.put(rel_block_to_key(rel, blknum), Value::Image(img)); Ok(()) @@ -1733,15 +1729,12 @@ impl DatadirModification<'_> { segno: u32, blknum: BlockNumber, img: Bytes, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { assert!(self.tline.tenant_shard_id.is_shard_zero()); let key = slru_block_to_key(kind, segno, blknum); if !key.is_valid_key_on_write_path() { - anyhow::bail!( - "the request contains data not supported by pageserver at {}", - key - ); + Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?; } self.put(key, Value::Image(img)); Ok(()) @@ -1751,15 +1744,11 @@ impl DatadirModification<'_> { &mut self, rel: RelTag, blknum: BlockNumber, - ) -> anyhow::Result<()> { - anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); + ) -> Result<(), WalIngestError> { + ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode); let key = rel_block_to_key(rel, blknum); if !key.is_valid_key_on_write_path() { - anyhow::bail!( - "the request contains data not supported by pageserver: {} @ {}", - key, - self.lsn - ); + Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?; } let batch = self @@ -1776,15 +1765,11 @@ impl DatadirModification<'_> { kind: SlruKind, segno: u32, blknum: BlockNumber, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { assert!(self.tline.tenant_shard_id.is_shard_zero()); let key = slru_block_to_key(kind, segno, blknum); if !key.is_valid_key_on_write_path() { - anyhow::bail!( - "the request contains data not supported by pageserver: {} @ {}", - key, - self.lsn - ); + Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?; } let batch = self @@ -1832,8 +1817,10 @@ impl DatadirModification<'_> { dbnode: Oid, img: Bytes, ctx: &RequestContext, - ) -> anyhow::Result<()> { - let v2_enabled = self.maybe_enable_rel_size_v2()?; + ) -> Result<(), WalIngestError> { + let v2_enabled = self + .maybe_enable_rel_size_v2() + .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?; // Add it to the directory (if it doesn't exist already) let buf = self.get(DBDIR_KEY, ctx).await?; @@ -1874,13 +1861,13 @@ impl DatadirModification<'_> { xid: u64, img: Bytes, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { // Add it to the directory entry let dirbuf = self.get(TWOPHASEDIR_KEY, ctx).await?; let newdirbuf = if self.tline.pg_version >= 17 { let mut dir = TwoPhaseDirectoryV17::des(&dirbuf)?; if !dir.xids.insert(xid) { - anyhow::bail!("twophase file for xid {} already exists", xid); + Err(WalIngestErrorKind::FileAlreadyExists(xid))?; } self.pending_directory_entries.push(( DirectoryKind::TwoPhase, @@ -1891,7 +1878,7 @@ impl DatadirModification<'_> { let xid = xid as u32; let mut dir = TwoPhaseDirectory::des(&dirbuf)?; if !dir.xids.insert(xid) { - anyhow::bail!("twophase file for xid {} already exists", xid); + Err(WalIngestErrorKind::FileAlreadyExists(xid.into()))?; } self.pending_directory_entries.push(( DirectoryKind::TwoPhase, @@ -1909,22 +1896,22 @@ impl DatadirModification<'_> { &mut self, origin_id: RepOriginId, origin_lsn: Lsn, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let key = repl_origin_key(origin_id); self.put(key, Value::Image(origin_lsn.ser().unwrap().into())); Ok(()) } - pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> anyhow::Result<()> { + pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> Result<(), WalIngestError> { self.set_replorigin(origin_id, Lsn::INVALID).await } - pub fn put_control_file(&mut self, img: Bytes) -> anyhow::Result<()> { + pub fn put_control_file(&mut self, img: Bytes) -> Result<(), WalIngestError> { self.put(CONTROLFILE_KEY, Value::Image(img)); Ok(()) } - pub fn put_checkpoint(&mut self, img: Bytes) -> anyhow::Result<()> { + pub fn put_checkpoint(&mut self, img: Bytes) -> Result<(), WalIngestError> { self.put(CHECKPOINT_KEY, Value::Image(img)); Ok(()) } @@ -1934,7 +1921,7 @@ impl DatadirModification<'_> { spcnode: Oid, dbnode: Oid, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let total_blocks = self .tline .get_db_size(spcnode, dbnode, Version::Modified(self), ctx) @@ -1973,20 +1960,21 @@ impl DatadirModification<'_> { rel: RelTag, nblocks: BlockNumber, ctx: &RequestContext, - ) -> Result<(), RelationError> { + ) -> Result<(), WalIngestError> { if rel.relnode == 0 { - return Err(RelationError::InvalidRelnode); + Err(WalIngestErrorKind::LogicalError(anyhow::anyhow!( + "invalid relnode" + )))?; } // It's possible that this is the first rel for this db in this // tablespace. Create the reldir entry for it if so. - let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?) - .context("deserialize db")?; + let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?; let dbdir_exists = if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) { // Didn't exist. Update dbdir e.insert(false); - let buf = DbDirectory::ser(&dbdir).context("serialize db")?; + let buf = DbDirectory::ser(&dbdir)?; self.pending_directory_entries.push(( DirectoryKind::Db, MetricsUpdate::Set(dbdir.dbdirs.len() as u64), @@ -2003,27 +1991,25 @@ impl DatadirModification<'_> { RelDirectory::default() } else { // reldir already exists, fetch it - RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?) - .context("deserialize db")? + RelDirectory::des(&self.get(rel_dir_key, ctx).await?)? }; - let v2_enabled = self.maybe_enable_rel_size_v2()?; + let v2_enabled = self + .maybe_enable_rel_size_v2() + .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?; if v2_enabled { if rel_dir.rels.contains(&(rel.relnode, rel.forknum)) { - return Err(RelationError::AlreadyExists); + Err(WalIngestErrorKind::RelationAlreadyExists(rel))?; } let sparse_rel_dir_key = rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum); // check if the rel_dir_key exists in v2 - let val = self - .sparse_get(sparse_rel_dir_key, ctx) - .await - .map_err(|e| RelationError::Other(e.into()))?; + let val = self.sparse_get(sparse_rel_dir_key, ctx).await?; let val = RelDirExists::decode_option(val) - .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?; + .map_err(|_| WalIngestErrorKind::InvalidRelDirKey(sparse_rel_dir_key))?; if val == RelDirExists::Exists { - return Err(RelationError::AlreadyExists); + Err(WalIngestErrorKind::RelationAlreadyExists(rel))?; } self.put( sparse_rel_dir_key, @@ -2039,9 +2025,7 @@ impl DatadirModification<'_> { // will be key not found errors if we don't create an empty one for rel_size_v2. self.put( rel_dir_key, - Value::Image(Bytes::from( - RelDirectory::ser(&RelDirectory::default()).context("serialize")?, - )), + Value::Image(Bytes::from(RelDirectory::ser(&RelDirectory::default())?)), ); } self.pending_directory_entries @@ -2049,7 +2033,7 @@ impl DatadirModification<'_> { } else { // Add the new relation to the rel directory entry, and write it back if !rel_dir.rels.insert((rel.relnode, rel.forknum)) { - return Err(RelationError::AlreadyExists); + Err(WalIngestErrorKind::RelationAlreadyExists(rel))?; } if !dbdir_exists { self.pending_directory_entries @@ -2059,9 +2043,7 @@ impl DatadirModification<'_> { .push((DirectoryKind::Rel, MetricsUpdate::Add(1))); self.put( rel_dir_key, - Value::Image(Bytes::from( - RelDirectory::ser(&rel_dir).context("serialize")?, - )), + Value::Image(Bytes::from(RelDirectory::ser(&rel_dir)?)), ); } @@ -2086,8 +2068,8 @@ impl DatadirModification<'_> { rel: RelTag, nblocks: BlockNumber, ctx: &RequestContext, - ) -> anyhow::Result<()> { - anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); + ) -> Result<(), WalIngestError> { + ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode); if self .tline .get_rel_exists(rel, Version::Modified(self), ctx) @@ -2117,8 +2099,8 @@ impl DatadirModification<'_> { rel: RelTag, nblocks: BlockNumber, ctx: &RequestContext, - ) -> anyhow::Result<()> { - anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); + ) -> Result<(), WalIngestError> { + ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode); // Put size let size_key = rel_size_to_key(rel); @@ -2142,8 +2124,10 @@ impl DatadirModification<'_> { &mut self, drop_relations: HashMap<(u32, u32), Vec>, ctx: &RequestContext, - ) -> anyhow::Result<()> { - let v2_enabled = self.maybe_enable_rel_size_v2()?; + ) -> Result<(), WalIngestError> { + let v2_enabled = self + .maybe_enable_rel_size_v2() + .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?; for ((spc_node, db_node), rel_tags) in drop_relations { let dir_key = rel_dir_to_key(spc_node, db_node); let buf = self.get(dir_key, ctx).await?; @@ -2163,7 +2147,7 @@ impl DatadirModification<'_> { let key = rel_tag_sparse_key(spc_node, db_node, rel_tag.relnode, rel_tag.forknum); let val = RelDirExists::decode_option(self.sparse_get(key, ctx).await?) - .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?; + .map_err(|_| WalIngestErrorKind::InvalidKey(key, self.lsn))?; if val == RelDirExists::Exists { self.pending_directory_entries .push((DirectoryKind::RelV2, MetricsUpdate::Sub(1))); @@ -2206,7 +2190,7 @@ impl DatadirModification<'_> { segno: u32, nblocks: BlockNumber, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { assert!(self.tline.tenant_shard_id.is_shard_zero()); // Add it to the directory entry @@ -2215,7 +2199,7 @@ impl DatadirModification<'_> { let mut dir = SlruSegmentDirectory::des(&buf)?; if !dir.segments.insert(segno) { - anyhow::bail!("slru segment {kind:?}/{segno} already exists"); + Err(WalIngestErrorKind::SlruAlreadyExists(kind, segno))?; } self.pending_directory_entries.push(( DirectoryKind::SlruSegment(kind), @@ -2242,7 +2226,7 @@ impl DatadirModification<'_> { kind: SlruKind, segno: u32, nblocks: BlockNumber, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { assert!(self.tline.tenant_shard_id.is_shard_zero()); // Put size @@ -2258,7 +2242,7 @@ impl DatadirModification<'_> { kind: SlruKind, segno: u32, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { // Remove it from the directory entry let dir_key = slru_dir_to_key(kind); let buf = self.get(dir_key, ctx).await?; @@ -2283,7 +2267,7 @@ impl DatadirModification<'_> { } /// Drop a relmapper file (pg_filenode.map) - pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> anyhow::Result<()> { + pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> Result<(), WalIngestError> { // TODO Ok(()) } @@ -2293,7 +2277,7 @@ impl DatadirModification<'_> { &mut self, xid: u64, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { // Remove it from the directory entry let buf = self.get(TWOPHASEDIR_KEY, ctx).await?; let newdirbuf = if self.tline.pg_version >= 17 { @@ -2308,7 +2292,8 @@ impl DatadirModification<'_> { )); Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?) } else { - let xid: u32 = u32::try_from(xid)?; + let xid: u32 = u32::try_from(xid) + .map_err(|e| WalIngestErrorKind::LogicalError(anyhow::Error::from(e)))?; let mut dir = TwoPhaseDirectory::des(&buf)?; if !dir.xids.remove(&xid) { @@ -2333,7 +2318,7 @@ impl DatadirModification<'_> { path: &str, content: &[u8], ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let key = aux_file::encode_aux_file_key(path); // retrieve the key from the engine let old_val = match self.get(key, ctx).await { @@ -2342,7 +2327,7 @@ impl DatadirModification<'_> { Err(e) => return Err(e.into()), }; let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val { - aux_file::decode_file_value(old_val)? + aux_file::decode_file_value(old_val).map_err(WalIngestErrorKind::EncodeAuxFileError)? } else { Vec::new() }; @@ -2387,7 +2372,8 @@ impl DatadirModification<'_> { } (None, true) => warn!("removing non-existing aux file: {}", path), } - let new_val = aux_file::encode_file_value(&new_files)?; + let new_val = aux_file::encode_file_value(&new_files) + .map_err(WalIngestErrorKind::EncodeAuxFileError)?; self.put(key, Value::Image(new_val.into())); Ok(()) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d3623fc3b9..ad4a0d804d 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -11571,6 +11571,99 @@ mod tests { Ok(()) } + #[cfg(feature = "testing")] + #[tokio::test] + async fn test_bottom_most_compation_redo_failure() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_bottom_most_compation_redo_failure").await?; + let (tenant, ctx) = harness.load().await; + + fn get_key(id: u32) -> Key { + // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. + let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + let img_layer = (0..10) + .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) + .collect_vec(); + + let delta1 = vec![ + ( + get_key(1), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ( + get_key(1), + Lsn(0x24), + Value::WalRecord(NeonWalRecord::wal_append("@0x24")), + ), + ( + get_key(1), + Lsn(0x28), + // This record will fail to redo + Value::WalRecord(NeonWalRecord::wal_append_conditional("@0x28", "???")), + ), + ]; + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![], // in-memory layers + vec![DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + delta1, + )], // delta layers + vec![(Lsn(0x10), img_layer)], // image layers + Lsn(0x50), + ) + .await?; + { + tline + .applied_gc_cutoff_lsn + .lock_for_write() + .store_and_unlock(Lsn(0x30)) + .wait() + .await; + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + *guard = GcInfo { + retain_lsns: vec![], + cutoffs: GcCutoffs { + time: Lsn(0x30), + space: Lsn(0x30), + }, + leases: Default::default(), + within_ancestor_pitr: false, + }; + } + + let cancel = CancellationToken::new(); + + // Compaction will fail, but should not fire any critical error. + // Gc-compaction currently cannot figure out what keys are not in the keyspace during the compaction + // process. It will always try to redo the logs it reads and if it doesn't work, fail the entire + // compaction job. Tracked in . + let res = tline + .compact_with_gc( + &cancel, + CompactOptions { + compact_key_range: None, + compact_lsn_range: None, + ..Default::default() + }, + &ctx, + ) + .await; + assert!(res.is_err()); + + Ok(()) + } + #[cfg(feature = "testing")] #[tokio::test] async fn test_synthetic_size_calculation_with_invisible_branches() -> anyhow::Result<()> { diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 228048ac9c..490b1ce664 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -1269,7 +1269,7 @@ mod test { // This key range contains several 0x8000 page stripes, only one of which belongs to shard zero let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); - let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0000002000").unwrap(); let range = input_start..input_end; // Build an image layer to filter @@ -1314,7 +1314,7 @@ mod test { let shard_identity = ShardIdentity::new( ShardNumber(shard_number), shard_count, - ShardStripeSize(0x8000), + ShardStripeSize(0x800), ) .unwrap(); let harness = TenantHarness::create_custom( @@ -1368,12 +1368,12 @@ mod test { // This exact size and those below will need updating as/when the layer encoding changes, but // should be deterministic for a given version of the format, as we used no randomness generating the input. - assert_eq!(original_size, 1597440); + assert_eq!(original_size, 122880); match shard_number { 0 => { // We should have written out just one stripe for our shard identity - assert_eq!(wrote_keys, 0x8000); + assert_eq!(wrote_keys, 0x800); let replacement = replacement.unwrap(); // We should have dropped some of the data @@ -1381,7 +1381,7 @@ mod test { assert!(replacement.metadata().file_size > 0); // Assert that we dropped ~3/4 of the data. - assert_eq!(replacement.metadata().file_size, 417792); + assert_eq!(replacement.metadata().file_size, 49152); } 1 => { // Shard 1 has no keys in our input range @@ -1390,19 +1390,19 @@ mod test { } 2 => { // Shard 2 has one stripes in the input range - assert_eq!(wrote_keys, 0x8000); + assert_eq!(wrote_keys, 0x800); let replacement = replacement.unwrap(); assert!(replacement.metadata().file_size < original_size); assert!(replacement.metadata().file_size > 0); - assert_eq!(replacement.metadata().file_size, 417792); + assert_eq!(replacement.metadata().file_size, 49152); } 3 => { // Shard 3 has two stripes in the input range - assert_eq!(wrote_keys, 0x10000); + assert_eq!(wrote_keys, 0x1000); let replacement = replacement.unwrap(); assert!(replacement.metadata().file_size < original_size); assert!(replacement.metadata().file_size > 0); - assert_eq!(replacement.metadata().file_size, 811008); + assert_eq!(replacement.metadata().file_size, 73728); } _ => unreachable!(), } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 8e778e8674..3b38712b0b 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1141,6 +1141,7 @@ impl Timeline { ) -> Result<(), CompactionError> { let mut drop_layers = Vec::new(); let mut layers_to_rewrite: Vec = Vec::new(); + let mut rewrite_max_exceeded: bool = false; // We will use the Lsn cutoff of the last GC as a threshold for rewriting layers: if a // layer is behind this Lsn, it indicates that the layer is being retained beyond the @@ -1149,12 +1150,7 @@ impl Timeline { // Holding this read guard also blocks [`Self::gc_timeline`] from entering while we // are rewriting layers. let latest_gc_cutoff = self.get_applied_gc_cutoff_lsn(); - - tracing::info!( - "starting shard ancestor compaction, latest_gc_cutoff: {}, pitr cutoff {}", - *latest_gc_cutoff, - self.gc_info.read().unwrap().cutoffs.time - ); + let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.time; let layers = self.layers.read().await; for layer_desc in layers.layer_map()?.iter_historic_layers() { @@ -1172,8 +1168,8 @@ impl Timeline { // This ancestral layer only covers keys that belong to other shards. // We include the full metadata in the log: if we had some critical bug that caused // us to incorrectly drop layers, this would simplify manually debugging + reinstating those layers. - info!(%layer, old_metadata=?layer.metadata(), - "dropping layer after shard split, contains no keys for this shard.", + debug!(%layer, old_metadata=?layer.metadata(), + "dropping layer after shard split, contains no keys for this shard", ); if cfg!(debug_assertions) { @@ -1235,9 +1231,10 @@ impl Timeline { } if layers_to_rewrite.len() >= rewrite_max { - tracing::info!(%layer, "Will rewrite layer on a future compaction, already rewrote {}", + debug!(%layer, "Will rewrite layer on a future compaction, already rewrote {}", layers_to_rewrite.len() ); + rewrite_max_exceeded = true; continue; } @@ -1245,9 +1242,24 @@ impl Timeline { layers_to_rewrite.push(layer); } - // Drop read lock on layer map before we start doing time-consuming I/O + // Drop read lock on layer map before we start doing time-consuming I/O. drop(layers); + // Drop out early if there's nothing to do. + if layers_to_rewrite.is_empty() && drop_layers.is_empty() { + return Ok(()); + } + + info!( + "starting shard ancestor compaction, rewriting {} layers and dropping {} layers \ + (latest_gc_cutoff={} pitr_cutoff={})", + layers_to_rewrite.len(), + drop_layers.len(), + *latest_gc_cutoff, + pitr_cutoff, + ); + let started = Instant::now(); + let mut replace_image_layers = Vec::new(); for layer in layers_to_rewrite { @@ -1255,7 +1267,7 @@ impl Timeline { return Err(CompactionError::ShuttingDown); } - tracing::info!(layer=%layer, "Rewriting layer after shard split..."); + info!(layer=%layer, "rewriting layer after shard split"); let mut image_layer_writer = ImageLayerWriter::new( self.conf, self.timeline_id, @@ -1295,7 +1307,7 @@ impl Timeline { .map_err(CompactionError::Other)?; let new_layer = Layer::finish_creating(self.conf, self, desc, &path) .map_err(CompactionError::Other)?; - tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes", + info!(layer=%new_layer, "rewrote layer, {} -> {} bytes", layer.metadata().file_size, new_layer.metadata().file_size); @@ -1307,6 +1319,12 @@ impl Timeline { } } + for layer in &drop_layers { + info!(%layer, old_metadata=?layer.metadata(), + "dropping layer after shard split (no keys for this shard)", + ); + } + // At this point, we have replaced local layer files with their rewritten form, but not yet uploaded // metadata to reflect that. If we restart here, the replaced layer files will look invalid (size mismatch // to remote index) and be removed. This is inefficient but safe. @@ -1322,6 +1340,7 @@ impl Timeline { // necessary for correctness, but it simplifies testing, and avoids proceeding with another // Timeline's compaction while this timeline's uploads may be generating lots of disk I/O // load. + info!("shard ancestor compaction waiting for uploads"); match self.remote_client.wait_completion().await { Ok(()) => (), Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)), @@ -1330,6 +1349,15 @@ impl Timeline { } } + info!( + "shard ancestor compaction done in {:.3}s{}", + started.elapsed().as_secs_f64(), + match rewrite_max_exceeded { + true => format!(", more work pending due to rewrite_max={rewrite_max}"), + false => String::new(), + } + ); + fail::fail_point!("compact-shard-ancestors-persistent"); Ok(()) diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index df2663f6bb..3c3608d1bd 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -580,6 +580,7 @@ impl ConnectionManagerState { ); Ok(()) } + WalReceiverError::Cancelled => Ok(()), WalReceiverError::Other(e) => { // give out an error to have task_mgr give it a really verbose logging if cancellation.is_cancelled() { diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 6bf05a0f86..52259f205b 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -73,6 +73,7 @@ pub(super) enum WalReceiverError { /// Generic error Other(anyhow::Error), ClosedGate, + Cancelled, } impl From for WalReceiverError { @@ -200,6 +201,9 @@ pub(super) async fn handle_walreceiver_connection( // with a similar error. }, WalReceiverError::SuccessfulCompletion(_) => {} + WalReceiverError::Cancelled => { + debug!("Connection cancelled") + } WalReceiverError::ClosedGate => { // doesn't happen at runtime } @@ -273,7 +277,12 @@ pub(super) async fn handle_walreceiver_connection( let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version); - let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx).await?; + let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx) + .await + .map_err(|e| match e.kind { + crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled, + _ => WalReceiverError::Other(e.into()), + })?; let shard = vec![*timeline.get_shard_identity()]; diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 18df065f76..e60c590f87 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -21,13 +21,13 @@ //! redo Postgres process, but some records it can handle directly with //! bespoken Rust code. +use std::backtrace::Backtrace; use std::collections::HashMap; use std::sync::{Arc, OnceLock}; use std::time::{Duration, Instant, SystemTime}; -use anyhow::{Result, bail}; use bytes::{Buf, Bytes}; -use pageserver_api::key::rel_block_to_key; +use pageserver_api::key::{Key, rel_block_to_key}; use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use pageserver_api::shard::ShardIdentity; @@ -38,7 +38,7 @@ use postgres_ffi::{ fsm_logical_to_physical, pg_constants, }; use tracing::*; -use utils::bin_ser::SerializeError; +use utils::bin_ser::{DeserializeError, SerializeError}; use utils::lsn::Lsn; use utils::rate_limit::RateLimit; use utils::{critical, failpoint_support}; @@ -104,12 +104,101 @@ struct WarnIngestLag { timestamp_invalid_msg_ratelimit: RateLimit, } +pub struct WalIngestError { + pub backtrace: std::backtrace::Backtrace, + pub kind: WalIngestErrorKind, +} + +#[derive(thiserror::Error, Debug)] +pub enum WalIngestErrorKind { + #[error(transparent)] + #[allow(private_interfaces)] + PageReconstructError(#[from] PageReconstructError), + #[error(transparent)] + DeserializationFailure(#[from] DeserializeError), + #[error(transparent)] + SerializationFailure(#[from] SerializeError), + #[error("the request contains data not supported by pageserver: {0} @ {1}")] + InvalidKey(Key, Lsn), + #[error("twophase file for xid {0} already exists")] + FileAlreadyExists(u64), + #[error("slru segment {0:?}/{1} already exists")] + SlruAlreadyExists(SlruKind, u32), + #[error("relation already exists")] + RelationAlreadyExists(RelTag), + #[error("invalid reldir key {0}")] + InvalidRelDirKey(Key), + + #[error(transparent)] + LogicalError(anyhow::Error), + #[error(transparent)] + EncodeAuxFileError(anyhow::Error), + #[error(transparent)] + MaybeRelSizeV2Error(anyhow::Error), + + #[error("timeline shutting down")] + Cancelled, +} + +impl From for WalIngestError +where + WalIngestErrorKind: From, +{ + fn from(value: T) -> Self { + WalIngestError { + backtrace: Backtrace::capture(), + kind: WalIngestErrorKind::from(value), + } + } +} + +impl std::error::Error for WalIngestError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + self.kind.source() + } +} + +impl core::fmt::Display for WalIngestError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + self.kind.fmt(f) + } +} + +impl core::fmt::Debug for WalIngestError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + if f.alternate() { + f.debug_map() + .key(&"backtrace") + .value(&self.backtrace) + .key(&"kind") + .value(&self.kind) + .finish() + } else { + writeln!(f, "Error: {:?}", self.kind)?; + if self.backtrace.status() == std::backtrace::BacktraceStatus::Captured { + writeln!(f, "Stack backtrace: {:?}", self.backtrace)?; + } + Ok(()) + } + } +} + +#[macro_export] +macro_rules! ensure_walingest { + ($($t:tt)*) => { + _ = || -> Result<(), anyhow::Error> { + anyhow::ensure!($($t)*); + Ok(()) + }().map_err(WalIngestErrorKind::LogicalError)?; + }; +} + impl WalIngest { pub async fn new( timeline: &Timeline, startpoint: Lsn, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { // Fetch the latest checkpoint into memory, so that we can compare with it // quickly in `ingest_record` and update it when it changes. let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?; @@ -145,7 +234,7 @@ impl WalIngest { interpreted: InterpretedWalRecord, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { WAL_INGEST.records_received.inc(); let prev_len = modification.len(); @@ -288,7 +377,7 @@ impl WalIngest { } /// This is the same as AdjustToFullTransactionId(xid) in PostgreSQL - fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result { + fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result { let next_full_xid = enum_pgversion_dispatch!(&self.checkpoint, CheckPoint, cp, { cp.nextXid.value }); @@ -298,9 +387,9 @@ impl WalIngest { if xid > next_xid { // Wraparound occurred, must be from a prev epoch. if epoch == 0 { - bail!( + Err(WalIngestErrorKind::LogicalError(anyhow::anyhow!( "apparent XID wraparound with prepared transaction XID {xid}, nextXid is {next_full_xid}" - ); + )))?; } epoch -= 1; } @@ -313,7 +402,7 @@ impl WalIngest { clear_vm_bits: ClearVmBits, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let ClearVmBits { new_heap_blkno, old_heap_blkno, @@ -402,7 +491,7 @@ impl WalIngest { create: DbaseCreate, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let DbaseCreate { db_id, tablespace_id, @@ -505,7 +594,7 @@ impl WalIngest { dbase_drop: DbaseDrop, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let DbaseDrop { db_id, tablespace_ids, @@ -523,7 +612,7 @@ impl WalIngest { create: SmgrCreate, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let SmgrCreate { rel } = create; self.put_rel_creation(modification, rel, ctx).await?; Ok(()) @@ -537,7 +626,7 @@ impl WalIngest { truncate: XlSmgrTruncate, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let XlSmgrTruncate { blkno, rnode, @@ -689,7 +778,7 @@ impl WalIngest { record: XactRecord, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let (xact_common, is_commit, is_prepared) = match record { XactRecord::Prepare(XactPrepare { xl_xid, data }) => { let xid: u64 = if modification.tline.pg_version >= 17 { @@ -813,7 +902,7 @@ impl WalIngest { truncate: ClogTruncate, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let ClogTruncate { pageno, oldest_xid, @@ -889,7 +978,7 @@ impl WalIngest { zero_page: ClogZeroPage, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let ClogZeroPage { segno, rpageno } = zero_page; self.put_slru_page_image( @@ -907,7 +996,7 @@ impl WalIngest { &mut self, modification: &mut DatadirModification, xlrec: &XlMultiXactCreate, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { // Create WAL record for updating the multixact-offsets page let pageno = xlrec.mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32; let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; @@ -1010,7 +1099,7 @@ impl WalIngest { modification: &mut DatadirModification<'_>, xlrec: &XlMultiXactTruncate, ctx: &RequestContext, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { let (maxsegment, startsegment, endsegment) = enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, { cp.oldestMulti = xlrec.end_trunc_off; @@ -1058,7 +1147,7 @@ impl WalIngest { zero_page: MultiXactZeroPage, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { let MultiXactZeroPage { slru_kind, segno, @@ -1080,7 +1169,7 @@ impl WalIngest { update: RelmapUpdate, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { let RelmapUpdate { update, buf } = update; modification @@ -1093,7 +1182,7 @@ impl WalIngest { raw_record: RawXlogRecord, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { let RawXlogRecord { info, lsn, mut buf } = raw_record; let pg_version = modification.tline.pg_version; @@ -1235,12 +1324,12 @@ impl WalIngest { put: PutLogicalMessage, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { let PutLogicalMessage { path, buf } = put; modification.put_file(path.as_str(), &buf, ctx).await } - fn ingest_standby_record(&mut self, record: StandbyRecord) -> Result<()> { + fn ingest_standby_record(&mut self, record: StandbyRecord) -> Result<(), WalIngestError> { match record { StandbyRecord::RunningXacts(running_xacts) => { enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, { @@ -1258,7 +1347,7 @@ impl WalIngest { &mut self, record: ReploriginRecord, modification: &mut DatadirModification<'_>, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { match record { ReploriginRecord::Set(set) => { modification @@ -1278,7 +1367,7 @@ impl WalIngest { modification: &mut DatadirModification<'_>, rel: RelTag, ctx: &RequestContext, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { modification.put_rel_creation(rel, 0, ctx).await?; Ok(()) } @@ -1291,7 +1380,7 @@ impl WalIngest { blknum: BlockNumber, img: Bytes, ctx: &RequestContext, - ) -> Result<(), PageReconstructError> { + ) -> Result<(), WalIngestError> { self.handle_rel_extend(modification, rel, blknum, ctx) .await?; modification.put_rel_page_image(rel, blknum, img)?; @@ -1305,7 +1394,7 @@ impl WalIngest { blknum: BlockNumber, rec: NeonWalRecord, ctx: &RequestContext, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { self.handle_rel_extend(modification, rel, blknum, ctx) .await?; modification.put_rel_wal_record(rel, blknum, rec)?; @@ -1318,7 +1407,7 @@ impl WalIngest { rel: RelTag, nblocks: BlockNumber, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { modification.put_rel_truncation(rel, nblocks, ctx).await?; Ok(()) } @@ -1329,7 +1418,7 @@ impl WalIngest { rel: RelTag, blknum: BlockNumber, ctx: &RequestContext, - ) -> Result<(), PageReconstructError> { + ) -> Result<(), WalIngestError> { let new_nblocks = blknum + 1; // Check if the relation exists. We implicitly create relations on first // record. @@ -1423,7 +1512,7 @@ impl WalIngest { blknum: BlockNumber, img: Bytes, ctx: &RequestContext, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { if !self.shard.is_shard_zero() { return Ok(()); } @@ -1441,7 +1530,7 @@ impl WalIngest { segno: u32, blknum: BlockNumber, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { // we don't use a cache for this like we do for relations. SLRUS are explcitly // extended with ZEROPAGE records, not with commit records, so it happens // a lot less frequently. @@ -1509,6 +1598,7 @@ async fn get_relsize( #[allow(clippy::bool_assert_comparison)] #[cfg(test)] mod tests { + use anyhow::Result; use postgres_ffi::RELSEG_SIZE; use super::*; @@ -1530,7 +1620,7 @@ mod tests { } #[tokio::test] - async fn test_zeroed_checkpoint_decodes_correctly() -> Result<()> { + async fn test_zeroed_checkpoint_decodes_correctly() -> Result<(), anyhow::Error> { for i in 14..=16 { dispatch_pgversion!(i, { pgv::CheckPoint::decode(&pgv::ZERO_CHECKPOINT)?; diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 9ea708f29a..dfabb6919e 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -65,6 +65,9 @@ static const struct config_enum_entry neon_compute_modes[] = { /* GUCs */ char *neon_timeline; char *neon_tenant; +char *neon_project_id; +char *neon_branch_id; +char *neon_endpoint_id; int32 max_cluster_size; char *page_server_connstring; char *neon_auth_token; @@ -1352,6 +1355,31 @@ pg_init_libpagestore(void) 0, /* no flags required */ check_neon_id, NULL, NULL); + DefineCustomStringVariable("neon.project_id", + "Neon project_id the server is running on", + NULL, + &neon_project_id, + "", + PGC_POSTMASTER, + 0, /* no flags required */ + check_neon_id, NULL, NULL); + DefineCustomStringVariable("neon.branch_id", + "Neon branch_id the server is running on", + NULL, + &neon_branch_id, + "", + PGC_POSTMASTER, + 0, /* no flags required */ + check_neon_id, NULL, NULL); + DefineCustomStringVariable("neon.endpoint_id", + "Neon endpoint_id the server is running on", + NULL, + &neon_endpoint_id, + "", + PGC_POSTMASTER, + 0, /* no flags required */ + check_neon_id, NULL, NULL); + DefineCustomIntVariable("neon.stripe_size", "sharding stripe size", NULL, diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 0336d63e8d..6b133e4dc4 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -99,6 +99,9 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) wp->config = config; wp->api = api; wp->state = WPS_COLLECTING_TERMS; + wp->mconf.generation = INVALID_GENERATION; + wp->mconf.members.len = 0; + wp->mconf.new_members.len = 0; wp_log(LOG, "neon.safekeepers=%s", wp->config->safekeepers_list); @@ -170,6 +173,8 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) if (wp->config->proto_version != 2 && wp->config->proto_version != 3) wp_log(FATAL, "unsupported safekeeper protocol version %d", wp->config->proto_version); + if (wp->safekeepers_generation > INVALID_GENERATION && wp->config->proto_version < 3) + wp_log(FATAL, "enabling generations requires protocol version 3"); wp_log(LOG, "using safekeeper protocol version %d", wp->config->proto_version); /* Fill the greeting package */ @@ -214,7 +219,7 @@ WalProposerFree(WalProposer *wp) static bool WalProposerGenerationsEnabled(WalProposer *wp) { - return wp->safekeepers_generation != 0; + return wp->safekeepers_generation != INVALID_GENERATION; } /* @@ -723,13 +728,176 @@ SendProposerGreeting(Safekeeper *sk) BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_HANDSHAKE_RECV); } +/* + * Assuming `sk` sent its node id, find such member(s) in wp->mconf and set ptr in + * members_safekeepers & new_members_safekeepers to sk. + */ +static void +UpdateMemberSafekeeperPtr(WalProposer *wp, Safekeeper *sk) +{ + /* members_safekeepers etc are fixed size, sanity check mconf size */ + if (wp->mconf.members.len > MAX_SAFEKEEPERS) + wp_log(FATAL, "too many members %d in mconf", wp->mconf.members.len); + if (wp->mconf.new_members.len > MAX_SAFEKEEPERS) + wp_log(FATAL, "too many new_members %d in mconf", wp->mconf.new_members.len); + + /* node id is not known until greeting is received */ + if (sk->state < SS_WAIT_VOTING) + return; + + /* 0 is assumed to be invalid node id, should never happen */ + if (sk->greetResponse.nodeId == 0) + { + wp_log(WARNING, "safekeeper %s:%s sent zero node id", sk->host, sk->port); + return; + } + + for (uint32 i = 0; i < wp->mconf.members.len; i++) + { + SafekeeperId *sk_id = &wp->mconf.members.m[i]; + + if (wp->mconf.members.m[i].node_id == sk->greetResponse.nodeId) + { + /* + * If mconf or list of safekeepers to connect to changed (the + * latter always currently goes through restart though), + * ResetMemberSafekeeperPtrs is expected to be called before + * UpdateMemberSafekeeperPtr. So, other value suggests that we are + * connected to the same sk under different host name, complain + * about that. + */ + if (wp->members_safekeepers[i] != NULL && wp->members_safekeepers[i] != sk) + { + wp_log(WARNING, "safekeeper {id = %lu, ep = %s:%u } in members[%u] is already mapped to connection slot %lu", + sk_id->node_id, sk_id->host, sk_id->port, i, wp->members_safekeepers[i] - wp->safekeeper); + } + wp_log(LOG, "safekeeper {id = %lu, ep = %s:%u } in members[%u] mapped to connection slot %lu", + sk_id->node_id, sk_id->host, sk_id->port, i, sk - wp->safekeeper); + wp->members_safekeepers[i] = sk; + } + } + /* repeat for new_members */ + for (uint32 i = 0; i < wp->mconf.new_members.len; i++) + { + SafekeeperId *sk_id = &wp->mconf.new_members.m[i]; + + if (wp->mconf.new_members.m[i].node_id == sk->greetResponse.nodeId) + { + if (wp->new_members_safekeepers[i] != NULL && wp->new_members_safekeepers[i] != sk) + { + wp_log(WARNING, "safekeeper {id = %lu, ep = %s:%u } in new_members[%u] is already mapped to connection slot %lu", + sk_id->node_id, sk_id->host, sk_id->port, i, wp->new_members_safekeepers[i] - wp->safekeeper); + } + wp_log(LOG, "safekeeper {id = %lu, ep = %s:%u } in new_members[%u] mapped to connection slot %lu", + sk_id->node_id, sk_id->host, sk_id->port, i, sk - wp->safekeeper); + wp->new_members_safekeepers[i] = sk; + } + } +} + +/* + * Reset wp->members_safekeepers & new_members_safekeepers and refill them. + * Called after wp changes mconf. + */ +static void +ResetMemberSafekeeperPtrs(WalProposer *wp) +{ + memset(&wp->members_safekeepers, 0, sizeof(Safekeeper *) * MAX_SAFEKEEPERS); + memset(&wp->new_members_safekeepers, 0, sizeof(Safekeeper *) * MAX_SAFEKEEPERS); + for (int i = 0; i < wp->n_safekeepers; i++) + { + if (wp->safekeeper[i].state >= SS_WAIT_VOTING) + UpdateMemberSafekeeperPtr(wp, &wp->safekeeper[i]); + } +} + +static uint32 +MsetQuorum(MemberSet *mset) +{ + Assert(mset->len > 0); + return mset->len / 2 + 1; +} + +/* Does n forms quorum in mset? */ +static bool +MsetHasQuorum(MemberSet *mset, uint32 n) +{ + return n >= MsetQuorum(mset); +} + +/* + * TermsCollected helper for a single member set `mset`. + * + * `msk` is the member -> safekeeper mapping for mset, i.e. members_safekeepers + * or new_members_safekeepers. + */ +static bool +TermsCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInfo s) +{ + uint32 n_greeted = 0; + + for (uint32 i = 0; i < wp->mconf.members.len; i++) + { + Safekeeper *sk = msk[i]; + + if (sk != NULL && sk->state == SS_WAIT_VOTING) + { + if (n_greeted > 0) + appendStringInfoString(s, ", "); + appendStringInfo(s, "{id = %lu, ep = %s:%s}", sk->greetResponse.nodeId, sk->host, sk->port); + n_greeted++; + } + } + appendStringInfo(s, ", %u/%u total", n_greeted, mset->len); + return MsetHasQuorum(mset, n_greeted); +} + /* * Have we received greeting from enough (quorum) safekeepers to start voting? */ static bool TermsCollected(WalProposer *wp) { - return wp->n_connected >= wp->quorum; + StringInfoData s; /* str for logging */ + bool collected = false; + + /* legacy: generations disabled */ + if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION) + { + collected = wp->n_connected >= wp->quorum; + if (collected) + { + wp->propTerm++; + wp_log(LOG, "walproposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT ", starting voting", wp->quorum, wp->propTerm); + } + return collected; + } + + /* + * With generations enabled, we start campaign only when 1) some mconf is + * actually received 2) we have greetings from majority of members as well + * as from majority of new_members if it exists. + */ + if (wp->mconf.generation == INVALID_GENERATION) + return false; + + initStringInfo(&s); + appendStringInfoString(&s, "mset greeters: "); + if (!TermsCollectedMset(wp, &wp->mconf.members, wp->members_safekeepers, &s)) + goto res; + if (wp->mconf.new_members.len > 0) + { + appendStringInfoString(&s, ", new_mset greeters: "); + if (!TermsCollectedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers, &s)) + goto res; + } + wp->propTerm++; + wp_log(LOG, "walproposer connected to quorum of safekeepers: %s, propTerm=" INT64_FORMAT ", starting voting", s.data, wp->propTerm); + collected = true; + +res: + pfree(s.data); + return collected; } static void @@ -753,13 +921,41 @@ RecvAcceptorGreeting(Safekeeper *sk) pfree(mconf_toml); /* - * Adopt mconf of safekeepers if it is higher. TODO: mconf change should - * restart wp if it started voting. + * Adopt mconf of safekeepers if it is higher. */ if (sk->greetResponse.mconf.generation > wp->mconf.generation) { + /* sanity check before adopting, should never happen */ + if (sk->greetResponse.mconf.members.len == 0) + { + wp_log(FATAL, "mconf %u has zero members", sk->greetResponse.mconf.generation); + } + + /* + * If we at least started campaign, restart wp to get elected in the + * new mconf. Note: in principle once wp is already elected + * re-election is not required, but being conservative here is not + * bad. + * + * TODO: put mconf to shmem to immediately pick it up on start, + * otherwise if some safekeeper(s) misses latest mconf and gets + * connected the first, it may cause redundant restarts here. + * + * More generally, it would be nice to restart walproposer (wiping + * election state) without restarting the process. In particular, that + * would allow sync-safekeepers not to die here if it intersected with + * sk migration (as well as remove 1s delay). + * + * Note that assign_neon_safekeepers also currently restarts the + * process, so during normal migration walproposer may restart twice. + */ + if (wp->state >= WPS_CAMPAIGN) + { + wp_log(FATAL, "restarting to adopt mconf generation %d", sk->greetResponse.mconf.generation); + } MembershipConfigurationFree(&wp->mconf); MembershipConfigurationCopy(&sk->greetResponse.mconf, &wp->mconf); + ResetMemberSafekeeperPtrs(wp); /* full conf was just logged above */ wp_log(LOG, "changed mconf to generation %u", wp->mconf.generation); } @@ -767,6 +963,9 @@ RecvAcceptorGreeting(Safekeeper *sk) /* Protocol is all good, move to voting. */ sk->state = SS_WAIT_VOTING; + /* In greeting safekeeper sent its id; update mappings accordingly. */ + UpdateMemberSafekeeperPtr(wp, sk); + /* * Note: it would be better to track the counter on per safekeeper basis, * but at worst walproposer would restart with 'term rejected', so leave @@ -778,12 +977,9 @@ RecvAcceptorGreeting(Safekeeper *sk) /* We're still collecting terms from the majority. */ wp->propTerm = Max(sk->greetResponse.term, wp->propTerm); - /* Quorum is acquried, prepare the vote request. */ + /* Quorum is acquired, prepare the vote request. */ if (TermsCollected(wp)) { - wp->propTerm++; - wp_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm); - wp->state = WPS_CAMPAIGN; wp->voteRequest.pam.tag = 'v'; wp->voteRequest.generation = wp->mconf.generation; @@ -832,8 +1028,8 @@ SendVoteRequest(Safekeeper *sk) &sk->outbuf, wp->config->proto_version); /* We have quorum for voting, send our vote request */ - wp_log(LOG, "requesting vote from %s:%s for generation %u term " UINT64_FORMAT, sk->host, sk->port, - wp->voteRequest.generation, wp->voteRequest.term); + wp_log(LOG, "requesting vote from sk {id = %lu, ep = %s:%s} for generation %u term " UINT64_FORMAT, + sk->greetResponse.nodeId, sk->host, sk->port, wp->voteRequest.generation, wp->voteRequest.term); /* On failure, logging & resetting is handled */ BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_WAIT_VERDICT); /* If successful, wait for read-ready with SS_WAIT_VERDICT */ @@ -851,8 +1047,8 @@ RecvVoteResponse(Safekeeper *sk) return; wp_log(LOG, - "got VoteResponse from acceptor %s:%s, generation=%u, term=%lu, voteGiven=%u, last_log_term=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X", - sk->host, sk->port, sk->voteResponse.generation, sk->voteResponse.term, + "got VoteResponse from sk {id = %lu, ep = %s:%s}, generation=%u, term=%lu, voteGiven=%u, last_log_term=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X", + sk->greetResponse.nodeId, sk->host, sk->port, sk->voteResponse.generation, sk->voteResponse.term, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory), LSN_FORMAT_ARGS(sk->voteResponse.flushLsn), @@ -899,6 +1095,53 @@ RecvVoteResponse(Safekeeper *sk) } } +/* + * VotesCollected helper for a single member set `mset`. + * + * `msk` is the member -> safekeeper mapping for mset, i.e. members_safekeepers + * or new_members_safekeepers. + */ +static bool +VotesCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInfo s) +{ + uint32 n_votes = 0; + + for (uint32 i = 0; i < wp->mconf.members.len; i++) + { + Safekeeper *sk = msk[i]; + + if (sk != NULL && sk->state == SS_WAIT_ELECTED) + { + Assert(sk->voteResponse.voteGiven); + + /* + * Find the highest vote. NULL check is for the legacy case where + * safekeeper might be not initialized with LSN at all and return + * 0 LSN in the vote response; we still want to set donor to + * something in this case. + */ + if (GetLastLogTerm(sk) > wp->donorLastLogTerm || + (GetLastLogTerm(sk) == wp->donorLastLogTerm && + sk->voteResponse.flushLsn > wp->propTermStartLsn) || + wp->donor == NULL) + { + wp->donorLastLogTerm = GetLastLogTerm(sk); + wp->propTermStartLsn = sk->voteResponse.flushLsn; + wp->donor = sk; + } + wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn); + + if (n_votes > 0) + appendStringInfoString(s, ", "); + appendStringInfo(s, "{id = %lu, ep = %s:%s}", sk->greetResponse.nodeId, sk->host, sk->port); + n_votes++; + } + } + appendStringInfo(s, ", %u/%u total", n_votes, mset->len); + return MsetHasQuorum(mset, n_votes); +} + + /* * Checks if enough votes has been collected to get elected and if that's the * case finds the highest vote, setting donor, donorLastLogTerm, @@ -907,7 +1150,8 @@ RecvVoteResponse(Safekeeper *sk) static bool VotesCollected(WalProposer *wp) { - int n_ready = 0; + StringInfoData s; /* str for logging */ + bool collected = false; /* assumed to be called only when not elected yet */ Assert(wp->state == WPS_CAMPAIGN); @@ -916,25 +1160,62 @@ VotesCollected(WalProposer *wp) wp->donorLastLogTerm = 0; wp->truncateLsn = InvalidXLogRecPtr; - for (int i = 0; i < wp->n_safekeepers; i++) + /* legacy: generations disabled */ + if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION) { - if (wp->safekeeper[i].state == SS_WAIT_ELECTED) - { - n_ready++; + int n_ready = 0; - if (GetLastLogTerm(&wp->safekeeper[i]) > wp->donorLastLogTerm || - (GetLastLogTerm(&wp->safekeeper[i]) == wp->donorLastLogTerm && - wp->safekeeper[i].voteResponse.flushLsn > wp->propTermStartLsn)) + for (int i = 0; i < wp->n_safekeepers; i++) + { + if (wp->safekeeper[i].state == SS_WAIT_ELECTED) { - wp->donorLastLogTerm = GetLastLogTerm(&wp->safekeeper[i]); - wp->propTermStartLsn = wp->safekeeper[i].voteResponse.flushLsn; - wp->donor = i; + n_ready++; + + if (GetLastLogTerm(&wp->safekeeper[i]) > wp->donorLastLogTerm || + (GetLastLogTerm(&wp->safekeeper[i]) == wp->donorLastLogTerm && + wp->safekeeper[i].voteResponse.flushLsn > wp->propTermStartLsn) || + wp->donor == NULL) + { + wp->donorLastLogTerm = GetLastLogTerm(&wp->safekeeper[i]); + wp->propTermStartLsn = wp->safekeeper[i].voteResponse.flushLsn; + wp->donor = &wp->safekeeper[i]; + } + wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn); } - wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn); } + collected = n_ready >= wp->quorum; + if (collected) + { + wp_log(LOG, "walproposer elected with %d/%d votes", n_ready, wp->n_safekeepers); + } + return collected; } - return n_ready >= wp->quorum; + /* + * if generations are enabled we're expected to get to voting only when + * mconf is established. + */ + Assert(wp->mconf.generation != INVALID_GENERATION); + + /* + * We must get votes from both msets if both are present. + */ + initStringInfo(&s); + appendStringInfoString(&s, "mset voters: "); + if (!VotesCollectedMset(wp, &wp->mconf.members, wp->members_safekeepers, &s)) + goto res; + if (wp->mconf.new_members.len > 0) + { + appendStringInfoString(&s, ", new_mset voters: "); + if (!VotesCollectedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers, &s)) + goto res; + } + wp_log(LOG, "walproposer elected, %s", s.data); + collected = true; + +res: + pfree(s.data); + return collected; } /* @@ -955,7 +1236,7 @@ HandleElectedProposer(WalProposer *wp) * that only for logical replication (and switching logical walsenders to * neon_walreader is a todo.) */ - if (!wp->api.recovery_download(wp, &wp->safekeeper[wp->donor])) + if (!wp->api.recovery_download(wp, wp->donor)) { wp_log(FATAL, "failed to download WAL for logical replicaiton"); } @@ -1078,7 +1359,7 @@ ProcessPropStartPos(WalProposer *wp) /* * Proposer's term history is the donor's + its own entry. */ - dth = &wp->safekeeper[wp->donor].voteResponse.termHistory; + dth = &wp->donor->voteResponse.termHistory; wp->propTermHistory.n_entries = dth->n_entries + 1; wp->propTermHistory.entries = palloc(sizeof(TermSwitchEntry) * wp->propTermHistory.n_entries); if (dth->n_entries > 0) @@ -1086,11 +1367,10 @@ ProcessPropStartPos(WalProposer *wp) wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].term = wp->propTerm; wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propTermStartLsn; - wp_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X", - wp->quorum, + wp_log(LOG, "walproposer elected in term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X", wp->propTerm, LSN_FORMAT_ARGS(wp->propTermStartLsn), - wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port, + wp->donor->host, wp->donor->port, LSN_FORMAT_ARGS(wp->truncateLsn)); /* @@ -1508,6 +1788,14 @@ RecvAppendResponses(Safekeeper *sk) readAnything = true; + /* should never happen: sk is expected to send ERROR instead */ + if (sk->appendResponse.generation != wp->mconf.generation) + { + wp_log(FATAL, "safekeeper {id = %lu, ep = %s:%s} sent response with generation %u, expected %u", + sk->greetResponse.nodeId, sk->host, sk->port, + sk->appendResponse.generation, wp->mconf.generation); + } + if (sk->appendResponse.term > wp->propTerm) { /* @@ -1624,30 +1912,101 @@ CalculateMinFlushLsn(WalProposer *wp) } /* - * Calculate WAL position acknowledged by quorum + * GetAcknowledgedByQuorumWALPosition for a single member set `mset`. + * + * `msk` is the member -> safekeeper mapping for mset, i.e. members_safekeepers + * or new_members_safekeepers. */ static XLogRecPtr -GetAcknowledgedByQuorumWALPosition(WalProposer *wp) +GetCommittedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk) { XLogRecPtr responses[MAX_SAFEKEEPERS]; /* - * Sort acknowledged LSNs + * Ascending sort acknowledged LSNs. */ - for (int i = 0; i < wp->n_safekeepers; i++) + Assert(mset->len <= MAX_SAFEKEEPERS); + for (uint32 i = 0; i < mset->len; i++) { + Safekeeper *sk = msk[i]; + /* * Like in Raft, we aren't allowed to commit entries from previous - * terms, so ignore reported LSN until it gets to epochStartLsn. + * terms, so ignore reported LSN until it gets to propTermStartLsn. + * + * Note: we ignore sk state, which is ok: before first ack flushLsn is + * 0, and later we just preserve value across reconnections. It would + * be ok to check for SS_ACTIVE as well. */ - responses[i] = wp->safekeeper[i].appendResponse.flushLsn >= wp->propTermStartLsn ? wp->safekeeper[i].appendResponse.flushLsn : 0; + if (sk != NULL && sk->appendResponse.flushLsn >= wp->propTermStartLsn) + { + responses[i] = sk->appendResponse.flushLsn; + } + else + { + responses[i] = 0; + } } - qsort(responses, wp->n_safekeepers, sizeof(XLogRecPtr), CompareLsn); + qsort(responses, mset->len, sizeof(XLogRecPtr), CompareLsn); /* - * Get the smallest LSN committed by quorum + * And get value committed by the quorum. A way to view this: to get the + * highest value committed on the quorum, in the ordered array we skip n - + * n_quorum elements to get to the first (lowest) value present on all sks + * of the highest quorum. */ - return responses[wp->n_safekeepers - wp->quorum]; + return responses[mset->len - MsetQuorum(mset)]; +} + +/* + * Calculate WAL position acknowledged by quorum, i.e. which may be regarded + * committed. + * + * Zero may be returned when there is no quorum of nodes recovered to term start + * lsn which sent feedback yet. + */ +static XLogRecPtr +GetAcknowledgedByQuorumWALPosition(WalProposer *wp) +{ + XLogRecPtr committed; + + /* legacy: generations disabled */ + if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION) + { + XLogRecPtr responses[MAX_SAFEKEEPERS]; + + /* + * Sort acknowledged LSNs + */ + for (int i = 0; i < wp->n_safekeepers; i++) + { + /* + * Like in Raft, we aren't allowed to commit entries from previous + * terms, so ignore reported LSN until it gets to + * propTermStartLsn. + * + * Note: we ignore sk state, which is ok: before first ack + * flushLsn is 0, and later we just preserve value across + * reconnections. It would be ok to check for SS_ACTIVE as well. + */ + responses[i] = wp->safekeeper[i].appendResponse.flushLsn >= wp->propTermStartLsn ? wp->safekeeper[i].appendResponse.flushLsn : 0; + } + qsort(responses, wp->n_safekeepers, sizeof(XLogRecPtr), CompareLsn); + + /* + * Get the smallest LSN committed by quorum + */ + return responses[wp->n_safekeepers - wp->quorum]; + } + + committed = GetCommittedMset(wp, &wp->mconf.members, wp->members_safekeepers); + if (wp->mconf.new_members.len > 0) + { + XLogRecPtr new_mset_committed = GetCommittedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers); + + committed = Min(committed, new_mset_committed); + } + return committed; } /* @@ -1662,7 +2021,7 @@ UpdateDonorShmem(WalProposer *wp) int i; XLogRecPtr donor_lsn = InvalidXLogRecPtr; - if (wp->n_votes < wp->quorum) + if (wp->state < WPS_ELECTED) { wp_log(WARNING, "UpdateDonorShmem called before elections are won"); return; @@ -1673,9 +2032,9 @@ UpdateDonorShmem(WalProposer *wp) * about its position immediately after election before any feedbacks are * sent. */ - if (wp->safekeeper[wp->donor].state >= SS_WAIT_ELECTED) + if (wp->donor->state >= SS_WAIT_ELECTED) { - donor = &wp->safekeeper[wp->donor]; + donor = wp->donor; donor_lsn = wp->propTermStartLsn; } @@ -1746,13 +2105,13 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk) } /* - * Generally sync is done when majority switched the epoch so we committed - * epochStartLsn and made the majority aware of it, ensuring they are - * ready to give all WAL to pageserver. It would mean whichever majority - * is alive, there will be at least one safekeeper who is able to stream - * WAL to pageserver to make basebackup possible. However, since at the - * moment we don't have any good mechanism of defining the healthy and - * most advanced safekeeper who should push the wal into pageserver and + * Generally sync is done when majority reached propTermStartLsn so we + * committed it and made the majority aware of it, ensuring they are ready + * to give all WAL to pageserver. It would mean whichever majority is + * alive, there will be at least one safekeeper who is able to stream WAL + * to pageserver to make basebackup possible. However, since at the moment + * we don't have any good mechanism of defining the healthy and most + * advanced safekeeper who should push the wal into pageserver and * basically the random one gets connected, to prevent hanging basebackup * (due to pageserver connecting to not-synced-safekeeper) we currently * wait for all seemingly alive safekeepers to get synced. @@ -1774,7 +2133,7 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk) n_synced++; } - if (n_synced >= wp->quorum) + if (newCommitLsn >= wp->propTermStartLsn) { /* A quorum of safekeepers has been synced! */ diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index d116bce806..648b0015ad 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -145,6 +145,7 @@ typedef uint64 NNodeId; * This and following structs pair ones in membership.rs. */ typedef uint32 Generation; +#define INVALID_GENERATION 0 typedef struct SafekeeperId { @@ -771,7 +772,17 @@ typedef struct WalProposer /* Current walproposer membership configuration */ MembershipConfiguration mconf; - /* (n_safekeepers / 2) + 1 */ + /* + * Parallels mconf.members with pointers to the member's slot in + * safekeepers array of connections, or NULL if such member is not + * connected. Helps to avoid looking slot per id through all + * .safekeepers[] when doing quorum checks. + */ + Safekeeper *members_safekeepers[MAX_SAFEKEEPERS]; + /* As above, but for new_members. */ + Safekeeper *new_members_safekeepers[MAX_SAFEKEEPERS]; + + /* (n_safekeepers / 2) + 1. Used for static pre-generations quorum checks. */ int quorum; /* @@ -829,7 +840,7 @@ typedef struct WalProposer term_t donorLastLogTerm; /* Most advanced acceptor */ - int donor; + Safekeeper *donor; /* timeline globally starts at this LSN */ XLogRecPtr timelineStartLsn; diff --git a/safekeeper/src/http/mod.rs b/safekeeper/src/http/mod.rs index 003a75faa6..6e7c5d971d 100644 --- a/safekeeper/src/http/mod.rs +++ b/safekeeper/src/http/mod.rs @@ -31,6 +31,7 @@ pub async fn task_main_https( global_timelines: Arc, ) -> anyhow::Result<()> { let cert_resolver = ReloadingCertificateResolver::new( + "main", &conf.ssl_key_file, &conf.ssl_cert_file, conf.ssl_cert_reload_period, diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 0d1dc8f8ee..fb4530d0d2 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -22,6 +22,7 @@ use pageserver_api::controller_api::{ MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse, NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, SafekeeperSchedulingPolicyRequest, ShardsPreferredAzsRequest, TenantCreateRequest, TenantPolicyRequest, TenantShardMigrateRequest, + TimelineImportRequest, }; use pageserver_api::models::{ DetachBehavior, LsnLeaseRequest, TenantConfigPatchRequest, TenantConfigRequest, @@ -1235,8 +1236,18 @@ async fn handle_step_down(req: Request) -> Result, ApiError ForwardOutcome::NotForwarded(req) => req, }; - let state = get_state(&req); - json_response(StatusCode::OK, state.service.step_down().await) + // Spawn a background task: once we start stepping down, we must finish: if the client drops + // their request we should avoid stopping in some part-stepped-down state. + let handle = tokio::spawn(async move { + let state = get_state(&req); + state.service.step_down().await + }); + + let result = handle + .await + .map_err(|e| ApiError::InternalServerError(e.into()))?; + + json_response(StatusCode::OK, result) } async fn handle_tenant_drop(req: Request) -> Result, ApiError> { @@ -1276,6 +1287,37 @@ async fn handle_tenant_import(req: Request) -> Result, ApiE ) } +async fn handle_timeline_import(req: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; + + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let import_req = json_request::(&mut req).await?; + + let state = get_state(&req); + + if import_req.tenant_id != tenant_id || import_req.timeline_id != timeline_id { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "tenant id or timeline id mismatch: url={tenant_id}/{timeline_id}, body={}/{}", + import_req.tenant_id, + import_req.timeline_id + ))); + } + + json_response( + StatusCode::OK, + state.service.timeline_import(import_req).await?, + ) +} + async fn handle_tenants_dump(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -1949,6 +1991,16 @@ pub fn make_router( RequestName("debug_v1_tenant_locate"), ) }) + .post( + "/debug/v1/tenant/:tenant_id/timeline/:timeline_id/import", + |r| { + named_request_span( + r, + handle_timeline_import, + RequestName("debug_v1_timeline_import"), + ) + }, + ) .get("/debug/v1/scheduler", |r| { named_request_span(r, handle_scheduler_dump, RequestName("debug_v1_scheduler")) }) diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 1aa9ae10ae..9358c9da4d 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -472,6 +472,7 @@ async fn async_main() -> anyhow::Result<()> { let https_listener = tcp_listener::bind(https_addr)?; let resolver = ReloadingCertificateResolver::new( + "main", &args.ssl_key_file, &args.ssl_cert_file, *args.ssl_cert_reload_period, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 2ef09cd2e3..0982e56155 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -61,7 +61,7 @@ use utils::completion::Barrier; use utils::generation::Generation; use utils::id::{NodeId, TenantId, TimelineId}; use utils::lsn::Lsn; -use utils::sync::gate::Gate; +use utils::sync::gate::{Gate, GateGuard}; use utils::{failpoint_support, pausable_failpoint}; use crate::background_node_operations::{ @@ -594,6 +594,8 @@ struct TenantShardSplitAbort { new_stripe_size: Option, /// Until this abort op is complete, no other operations may be done on the tenant _tenant_lock: TracingExclusiveGuard, + /// The reconciler gate for the duration of the split operation, and any included abort. + _gate: GateGuard, } #[derive(thiserror::Error, Debug)] @@ -1460,7 +1462,7 @@ impl Service { // Retry until shutdown: we must keep this request object alive until it is properly // processed, as it holds a lock guard that prevents other operations trying to do things // to the tenant while it is in a weird part-split state. - while !self.cancel.is_cancelled() { + while !self.reconcilers_cancel.is_cancelled() { match self.abort_tenant_shard_split(&op).await { Ok(_) => break, Err(e) => { @@ -1473,9 +1475,12 @@ impl Service { // when we retry, so that the abort op will succeed. If the abort op is failing // for some other reason, we will keep retrying forever, or until a human notices // and does something about it (either fixing a pageserver or restarting the controller). - tokio::time::timeout(Duration::from_secs(5), self.cancel.cancelled()) - .await - .ok(); + tokio::time::timeout( + Duration::from_secs(5), + self.reconcilers_cancel.cancelled(), + ) + .await + .ok(); } } } @@ -1847,6 +1852,7 @@ impl Service { }; if insert { + let config = attach_req.config.clone().unwrap_or_default(); let tsp = TenantShardPersistence { tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(), shard_number: attach_req.tenant_shard_id.shard_number.0 as i32, @@ -1855,7 +1861,7 @@ impl Service { generation: attach_req.generation_override.or(Some(0)), generation_pageserver: None, placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(), - config: serde_json::to_string(&TenantConfig::default()).unwrap(), + config: serde_json::to_string(&config).unwrap(), splitting: SplitState::default(), scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) .unwrap(), @@ -1878,16 +1884,16 @@ impl Service { Ok(()) => { tracing::info!("Inserted shard {} in database", attach_req.tenant_shard_id); - let mut locked = self.inner.write().unwrap(); - locked.tenants.insert( + let mut shard = TenantShard::new( attach_req.tenant_shard_id, - TenantShard::new( - attach_req.tenant_shard_id, - ShardIdentity::unsharded(), - PlacementPolicy::Attached(0), - None, - ), + ShardIdentity::unsharded(), + PlacementPolicy::Attached(0), + None, ); + shard.config = config; + + let mut locked = self.inner.write().unwrap(); + locked.tenants.insert(attach_req.tenant_shard_id, shard); tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id); } } @@ -1972,11 +1978,12 @@ impl Service { .set_attached(scheduler, attach_req.node_id); tracing::info!( - "attach_hook: tenant {} set generation {:?}, pageserver {}", + "attach_hook: tenant {} set generation {:?}, pageserver {}, config {:?}", attach_req.tenant_shard_id, tenant_shard.generation, // TODO: this is an odd number of 0xf's - attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff)) + attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff)), + attach_req.config, ); // Trick the reconciler into not doing anything for this tenant: this helps @@ -4910,7 +4917,7 @@ impl Service { 1, 10, Duration::from_secs(5), - &self.cancel, + &self.reconcilers_cancel, ) .await { @@ -5161,6 +5168,11 @@ impl Service { ) .await; + let _gate = self + .reconcilers_gate + .enter() + .map_err(|_| ApiError::ShuttingDown)?; + let new_shard_count = ShardCount::new(split_req.new_shard_count); let new_stripe_size = split_req.new_stripe_size; @@ -5188,6 +5200,7 @@ impl Service { new_shard_count, new_stripe_size, _tenant_lock, + _gate, }) // Ignore error sending: that just means we're shutting down: aborts are ephemeral so it's fine to drop it. .ok(); @@ -5527,7 +5540,10 @@ impl Service { "failpoint".to_string() ))); - failpoint_support::sleep_millis_async!("shard-split-post-remote-sleep", &self.cancel); + failpoint_support::sleep_millis_async!( + "shard-split-post-remote-sleep", + &self.reconcilers_cancel + ); tracing::info!( "Split {} into {}", @@ -5585,7 +5601,7 @@ impl Service { stripe_size, preferred_az: preferred_az_id.as_ref().map(Cow::Borrowed), }, - &self.cancel, + &self.reconcilers_cancel, ) .await { @@ -8670,9 +8686,24 @@ impl Service { failpoint_support::sleep_millis_async!("sleep-on-step-down-handling"); self.inner.write().unwrap().step_down(); - // TODO: would it make sense to have a time-out for this? - self.stop_reconciliations(StopReconciliationsReason::SteppingDown) - .await; + + // Wait for reconciliations to stop, or terminate this process if they + // fail to stop in time (this indicates a bug in shutdown) + tokio::select! { + _ = self.stop_reconciliations(StopReconciliationsReason::SteppingDown) => { + tracing::info!("Reconciliations stopped, proceeding with step down"); + } + _ = async { + failpoint_support::sleep_millis_async!("step-down-delay-timeout"); + tokio::time::sleep(Duration::from_secs(10)).await + } => { + tracing::warn!("Step down timed out while waiting for reconciliation gate, terminating process"); + + // The caller may proceed to act as leader when it sees this request fail: reduce the chance + // of a split-brain situation by terminating this controller instead of leaving it up in a partially-shut-down state. + std::process::exit(1); + } + } let mut global_observed = GlobalObservedState::default(); let locked = self.inner.read().unwrap(); diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index 099d0305ba..a23b9a4a02 100644 --- a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -12,13 +12,16 @@ use crate::persistence::{ use crate::safekeeper::Safekeeper; use anyhow::Context; use http_utils::error::ApiError; -use pageserver_api::controller_api::{SafekeeperDescribeResponse, SkSchedulingPolicy}; +use pageserver_api::controller_api::{ + SafekeeperDescribeResponse, SkSchedulingPolicy, TimelineImportRequest, +}; use pageserver_api::models::{self, SafekeeperInfo, SafekeepersInfo, TimelineInfo}; use safekeeper_api::membership::{MemberSet, SafekeeperId}; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use utils::id::{NodeId, TenantId, TimelineId}; use utils::logging::SecretString; +use utils::lsn::Lsn; use super::Service; @@ -298,6 +301,31 @@ impl Service { timeline_id, }) } + + /// Directly insert the timeline into the database without reconciling it with safekeepers. + /// + /// Useful if the timeline already exists on the specified safekeepers, + /// but we want to make it storage controller managed. + pub(crate) async fn timeline_import(&self, req: TimelineImportRequest) -> Result<(), ApiError> { + let persistence = TimelinePersistence { + tenant_id: req.tenant_id.to_string(), + timeline_id: req.timeline_id.to_string(), + start_lsn: Lsn::INVALID.into(), + generation: 1, + sk_set: req.sk_set.iter().map(|sk_id| sk_id.0 as i64).collect(), + new_sk_set: None, + cplane_notified_generation: 1, + deleted_at: None, + }; + let inserted = self.persistence.insert_timeline(persistence).await?; + if inserted { + tracing::info!("imported timeline into db"); + } else { + tracing::info!("didn't import timeline into db, as it is already present in db"); + } + Ok(()) + } + /// Perform timeline deletion on safekeepers. Will return success: we persist the deletion into the reconciler. pub(super) async fn tenant_timeline_delete_safekeepers( self: &Arc, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index d000dcb69f..9d4068b583 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -14,6 +14,7 @@ import threading import time import uuid from collections import defaultdict +from collections.abc import Mapping from contextlib import closing, contextmanager from dataclasses import dataclass from datetime import datetime @@ -79,7 +80,12 @@ from fixtures.remote_storage import ( default_remote_storage, remote_storage_to_toml_dict, ) -from fixtures.safekeeper.http import SafekeeperHttpClient +from fixtures.safekeeper.http import ( + MembershipConfiguration, + SafekeeperHttpClient, + SafekeeperId, + TimelineCreateRequest, +) from fixtures.safekeeper.utils import wait_walreceivers_absent from fixtures.utils import ( ATTACHMENT_NAME_REGEX, @@ -1980,10 +1986,13 @@ class NeonStorageController(MetricsGetter, LogUtils): tenant_shard_id: TenantId | TenantShardId, pageserver_id: int, generation_override: int | None = None, + config: None | dict[str, Any] = None, ) -> int: body = {"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id} if generation_override is not None: body["generation_override"] = generation_override + if config is not None: + body["config"] = config response = self.request( "POST", @@ -2878,13 +2887,14 @@ class NeonPageserver(PgProtocol, LogUtils): self, immediate: bool = False, timeout_in_seconds: int | None = None, + extra_env_vars: dict[str, str] | None = None, ): """ High level wrapper for restart: restarts the process, and waits for tenant state to stabilize. """ self.stop(immediate=immediate) - self.start(timeout_in_seconds=timeout_in_seconds) + self.start(timeout_in_seconds=timeout_in_seconds, extra_env_vars=extra_env_vars) self.quiesce_tenants() def quiesce_tenants(self): @@ -2973,11 +2983,12 @@ class NeonPageserver(PgProtocol, LogUtils): to call into the pageserver HTTP client. """ client = self.http_client() - if generation is None: - generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id) - elif override_storage_controller_generation: + if generation is None or override_storage_controller_generation: generation = self.env.storage_controller.attach_hook_issue( - tenant_id, self.id, generation + tenant_id, + self.id, + generation_override=generation if override_storage_controller_generation else None, + config=config, ) return client.tenant_attach( tenant_id, @@ -4291,28 +4302,29 @@ class Endpoint(PgProtocol, LogUtils): def respec_deep(self, **kwargs: Any) -> None: """ - Update the endpoint.json file taking into account nested keys. - It does one level deep update. Should enough for most cases. - Distinct method from respec() to do not break existing functionality. + Update the spec.json file taking into account nested keys. + Distinct method from respec() to not break existing functionality. NOTE: This method also updates the spec.json file, not endpoint.json. We need it because neon_local also writes to spec.json, so intended use-case is i) start endpoint with some config, ii) respec_deep(), iii) call reconfigure() to apply the changes. """ + + def update(curr, patch): + for k, v in patch.items(): + if isinstance(v, Mapping): + curr[k] = update(curr.get(k, {}), v) + else: + curr[k] = v + return curr + config_path = os.path.join(self.endpoint_path(), "spec.json") with open(config_path) as f: data_dict: dict[str, Any] = json.load(f) log.debug("Current compute spec: %s", json.dumps(data_dict, indent=4)) - for key, value in kwargs.items(): - if isinstance(value, dict): - if key not in data_dict: - data_dict[key] = value - else: - data_dict[key] = {**data_dict[key], **value} - else: - data_dict[key] = value + update(data_dict, kwargs) with open(config_path, "w") as file: log.debug("Updating compute spec to: %s", json.dumps(data_dict, indent=4)) @@ -4839,6 +4851,50 @@ class Safekeeper(LogUtils): wait_until(paused) + @staticmethod + def sks_to_safekeeper_ids(sks: list[Safekeeper]) -> list[SafekeeperId]: + return [SafekeeperId(sk.id, "localhost", sk.port.pg_tenant_only) for sk in sks] + + @staticmethod + def mconf_sks(env: NeonEnv, mconf: MembershipConfiguration) -> list[Safekeeper]: + """ + List of Safekeepers which are members in `mconf`. + """ + members_ids = [m.id for m in mconf.members] + new_members_ids = [m.id for m in mconf.new_members] if mconf.new_members is not None else [] + return [sk for sk in env.safekeepers if sk.id in members_ids or sk.id in new_members_ids] + + @staticmethod + def create_timeline( + tenant_id: TenantId, + timeline_id: TimelineId, + ps: NeonPageserver, + mconf: MembershipConfiguration, + members_sks: list[Safekeeper], + ): + """ + Manually create timeline on safekeepers with given (presumably inital) + mconf: figure out LSN from pageserver, bake request and execute it on + given safekeepers. + + Normally done by storcon, but some tests want to do it manually so far. + """ + ps_http_cli = ps.http_client() + # figure out initial LSN. + ps_timeline_detail = ps_http_cli.timeline_detail(tenant_id, timeline_id) + init_lsn = ps_timeline_detail["last_record_lsn"] + log.info(f"initial LSN: {init_lsn}") + # sk timeline creation request expects minor version + pg_version = ps_timeline_detail["pg_version"] * 10000 + # create inital mconf + create_r = TimelineCreateRequest( + tenant_id, timeline_id, mconf, pg_version, Lsn(init_lsn), commit_lsn=None + ) + log.info(f"sending timeline create: {create_r.to_json()}") + + for sk in members_sks: + sk.http_client().timeline_create(create_r) + class NeonBroker(LogUtils): """An object managing storage_broker instance""" diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py index eedb693e3d..71c750b9eb 100644 --- a/test_runner/fixtures/pageserver/many_tenants.py +++ b/test_runner/fixtures/pageserver/many_tenants.py @@ -65,13 +65,11 @@ def single_timeline( assert ps_http.tenant_list() == [] def attach(tenant): - # NB: create the new tenant in the storage controller with the correct tenant config. This - # will pick up the existing tenant data from remote storage. If we just attach it to the - # Pageserver, the storage controller will reset the tenant config to the default. - env.create_tenant( - tenant_id=tenant, - timeline_id=template_timeline, - conf=template_config, + env.pageserver.tenant_attach( + tenant, + config=template_config, + generation=100, + override_storage_controller_generation=True, ) with concurrent.futures.ThreadPoolExecutor(max_workers=22) as executor: diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index e409151b76..839e985419 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -25,7 +25,7 @@ class Walreceiver: @dataclass class SafekeeperTimelineStatus: - mconf: Configuration | None + mconf: MembershipConfiguration | None term: int last_log_term: int pg_version: int # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2 @@ -78,17 +78,17 @@ class SafekeeperId: @dataclass -class Configuration: +class MembershipConfiguration: generation: int members: list[SafekeeperId] new_members: list[SafekeeperId] | None @classmethod - def from_json(cls, d: dict[str, Any]) -> Configuration: + def from_json(cls, d: dict[str, Any]) -> MembershipConfiguration: generation = d["generation"] members = d["members"] new_members = d.get("new_members") - return Configuration(generation, members, new_members) + return MembershipConfiguration(generation, members, new_members) def to_json(self) -> str: return json.dumps(self, cls=EnhancedJSONEncoder) @@ -98,7 +98,7 @@ class Configuration: class TimelineCreateRequest: tenant_id: TenantId timeline_id: TimelineId - mconf: Configuration + mconf: MembershipConfiguration # not exactly PgVersion, for example 150002 for 15.2 pg_version: int start_lsn: Lsn @@ -110,13 +110,13 @@ class TimelineCreateRequest: @dataclass class TimelineMembershipSwitchResponse: - previous_conf: Configuration - current_conf: Configuration + previous_conf: MembershipConfiguration + current_conf: MembershipConfiguration @classmethod def from_json(cls, d: dict[str, Any]) -> TimelineMembershipSwitchResponse: - previous_conf = Configuration.from_json(d["previous_conf"]) - current_conf = Configuration.from_json(d["current_conf"]) + previous_conf = MembershipConfiguration.from_json(d["previous_conf"]) + current_conf = MembershipConfiguration.from_json(d["current_conf"]) return TimelineMembershipSwitchResponse(previous_conf, current_conf) @@ -194,7 +194,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): resj = res.json() walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]] # It is always normally not None, it is allowed only to make forward compat tests happy. - mconf = Configuration.from_json(resj["mconf"]) if "mconf" in resj else None + mconf = MembershipConfiguration.from_json(resj["mconf"]) if "mconf" in resj else None return SafekeeperTimelineStatus( mconf=mconf, term=resj["acceptor_state"]["term"], @@ -223,7 +223,9 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): return self.timeline_status(tenant_id, timeline_id).commit_lsn # Get timeline membership configuration. - def get_membership(self, tenant_id: TenantId, timeline_id: TimelineId) -> Configuration: + def get_membership( + self, tenant_id: TenantId, timeline_id: TimelineId + ) -> MembershipConfiguration: # make mypy happy return self.timeline_status(tenant_id, timeline_id).mconf # type: ignore @@ -275,7 +277,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): return res_json def timeline_exclude( - self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration + self, tenant_id: TenantId, timeline_id: TimelineId, to: MembershipConfiguration ) -> dict[str, Any]: res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/exclude", @@ -287,7 +289,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): return res_json def membership_switch( - self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration + self, tenant_id: TenantId, timeline_id: TimelineId, to: MembershipConfiguration ) -> TimelineMembershipSwitchResponse: res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/membership", diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py index efd423104d..8af52dcbd0 100644 --- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py +++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py @@ -66,11 +66,11 @@ def test_basebackup_with_high_slru_count( n_txns = 500000 - def setup_wrapper(env: NeonEnv): - return setup_tenant_template(env, n_txns) - env = setup_pageserver_with_tenants( - neon_env_builder, f"large_slru_count-{n_tenants}-{n_txns}", n_tenants, setup_wrapper + neon_env_builder, + f"large_slru_count-{n_tenants}-{n_txns}", + n_tenants, + lambda env: setup_tenant_template(env, n_txns), ) run_benchmark(env, pg_bin, record, duration) @@ -80,10 +80,6 @@ def setup_tenant_template(env: NeonEnv, n_txns: int): "gc_period": "0s", # disable periodic gc "checkpoint_timeout": "10 years", "compaction_period": "0s", # disable periodic compaction - "compaction_threshold": 10, - "compaction_target_size": 134217728, - "checkpoint_distance": 268435456, - "image_creation_threshold": 3, } template_tenant, template_timeline = env.create_tenant(set_default=True) diff --git a/test_runner/performance/pageserver/test_page_service_batching.py b/test_runner/performance/pageserver/test_page_service_batching.py index 2c27368001..5169add6cb 100644 --- a/test_runner/performance/pageserver/test_page_service_batching.py +++ b/test_runner/performance/pageserver/test_page_service_batching.py @@ -1,5 +1,8 @@ +import concurrent.futures import dataclasses import json +import re +import threading import time from dataclasses import dataclass from pathlib import Path @@ -31,15 +34,15 @@ class PageServicePipeliningConfigPipelined(PageServicePipeliningConfig): mode: str = "pipelined" -EXECUTION = ["concurrent-futures", "tasks"] +EXECUTION = ["concurrent-futures"] NON_BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()] for max_batch_size in [1, 32]: for execution in EXECUTION: NON_BATCHABLE.append(PageServicePipeliningConfigPipelined(max_batch_size, execution)) -BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()] -for max_batch_size in [1, 2, 4, 8, 16, 32]: +BATCHABLE: list[PageServicePipeliningConfig] = [] +for max_batch_size in [32]: for execution in EXECUTION: BATCHABLE.append(PageServicePipeliningConfigPipelined(max_batch_size, execution)) @@ -47,19 +50,6 @@ for max_batch_size in [1, 2, 4, 8, 16, 32]: @pytest.mark.parametrize( "tablesize_mib, pipelining_config, target_runtime, effective_io_concurrency, readhead_buffer_size, name", [ - # non-batchable workloads - # (A separate benchmark will consider latency). - *[ - ( - 50, - config, - TARGET_RUNTIME, - 1, - 128, - f"not batchable {dataclasses.asdict(config)}", - ) - for config in NON_BATCHABLE - ], # batchable workloads should show throughput and CPU efficiency improvements *[ ( @@ -137,7 +127,14 @@ def test_throughput( env = neon_env_builder.init_start() ps_http = env.pageserver.http_client() - endpoint = env.endpoints.create_start("main") + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + # minimal lfc & small shared buffers to force requests to pageserver + "neon.max_file_cache_size=1MB", + "shared_buffers=10MB", + ], + ) conn = endpoint.connect() cur = conn.cursor() @@ -155,7 +152,6 @@ def test_throughput( tablesize = tablesize_mib * 1024 * 1024 npages = tablesize // (8 * 1024) cur.execute("INSERT INTO t SELECT generate_series(1, %s)", (npages,)) - # TODO: can we force postgres to do sequential scans? # # Run the workload, collect `Metrics` before and after, calculate difference, normalize. @@ -211,31 +207,73 @@ def test_throughput( ).value, ) - def workload() -> Metrics: + def workload(disruptor_started: threading.Event) -> Metrics: + disruptor_started.wait() start = time.time() iters = 0 while time.time() - start < target_runtime or iters < 2: - log.info("Seqscan %d", iters) if iters == 1: # round zero for warming up before = get_metrics() - cur.execute( - "select clear_buffer_cache()" - ) # TODO: what about LFC? doesn't matter right now because LFC isn't enabled by default in tests cur.execute("select sum(data::bigint) from t") assert cur.fetchall()[0][0] == npages * (npages + 1) // 2 iters += 1 after = get_metrics() return (after - before).normalize(iters - 1) + def disruptor(disruptor_started: threading.Event, stop_disruptor: threading.Event): + conn = endpoint.connect() + cur = conn.cursor() + iters = 0 + while True: + cur.execute("SELECT pg_logical_emit_message(true, 'test', 'advancelsn')") + if stop_disruptor.is_set(): + break + disruptor_started.set() + iters += 1 + time.sleep(0.001) + return iters + env.pageserver.patch_config_toml_nonrecursive( {"page_service_pipelining": dataclasses.asdict(pipelining_config)} ) - env.pageserver.restart() - metrics = workload() + + # set trace for log analysis below + env.pageserver.restart(extra_env_vars={"RUST_LOG": "info,pageserver::page_service=trace"}) + + log.info("Starting workload") + + with concurrent.futures.ThreadPoolExecutor() as executor: + disruptor_started = threading.Event() + stop_disruptor = threading.Event() + disruptor_fut = executor.submit(disruptor, disruptor_started, stop_disruptor) + workload_fut = executor.submit(workload, disruptor_started) + metrics = workload_fut.result() + stop_disruptor.set() + ndisruptions = disruptor_fut.result() + log.info("Disruptor issued %d disrupting requests", ndisruptions) log.info("Results: %s", metrics) + since_last_start: list[str] = [] + for line in env.pageserver.logfile.read_text().splitlines(): + if "git:" in line: + since_last_start = [] + since_last_start.append(line) + + stopping_batching_because_re = re.compile( + r"stopping batching because (LSN changed|of batch size|timeline object mismatch|batch key changed|same page was requested at different LSNs|.*)" + ) + reasons_for_stopping_batching = {} + for line in since_last_start: + match = stopping_batching_because_re.search(line) + if match: + if match.group(1) not in reasons_for_stopping_batching: + reasons_for_stopping_batching[match.group(1)] = 0 + reasons_for_stopping_batching[match.group(1)] += 1 + + log.info("Reasons for stopping batching: %s", reasons_for_stopping_batching) + # # Sanity-checks on the collected data # diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py index 3d7204d883..5ef63e2fe9 100644 --- a/test_runner/regress/test_pageserver_getpage_throttle.py +++ b/test_runner/regress/test_pageserver_getpage_throttle.py @@ -1,6 +1,5 @@ from __future__ import annotations -import copy import json import uuid from typing import TYPE_CHECKING @@ -16,7 +15,6 @@ if TYPE_CHECKING: from fixtures.neon_fixtures import NeonEnvBuilder, PgBin -@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/11395") def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): env = neon_env_builder.init_start() @@ -44,7 +42,6 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P "refill_interval": "100ms", "refill_amount": int(rate_limit_rps / 10), "max": int(rate_limit_rps / 10), - "fair": True, }, }, ) @@ -98,17 +95,12 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P _, marker_offset = wait_until(lambda: env.pageserver.assert_log_contains(marker, offset=None)) log.info("run pagebench") - duration_secs = 10 + duration_secs = 20 actual_ncompleted = run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs) log.info("validate the client is capped at the configured rps limit") expect_ncompleted = duration_secs * rate_limit_rps - delta_abs = abs(expect_ncompleted - actual_ncompleted) - threshold = 0.05 * expect_ncompleted - assert threshold / rate_limit_rps < 0.1 * duration_secs, ( - "test self-test: unrealistic expecations regarding precision in this test" - ) - assert delta_abs < 0.05 * expect_ncompleted, ( + assert pytest.approx(expect_ncompleted, 0.05) == actual_ncompleted, ( "the throttling deviates more than 5percent from the expectation" ) @@ -122,6 +114,7 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P timeout=compaction_period, ) + log.info("validate the metrics") smgr_query_seconds_post = ps_http.get_metric_value(smgr_metric_name, smgr_metrics_query) assert smgr_query_seconds_post is not None throttled_usecs_post = ps_http.get_metric_value(throttle_metric_name, throttle_metrics_query) @@ -130,72 +123,13 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P actual_throttled_usecs = throttled_usecs_post - throttled_usecs_pre actual_throttled_secs = actual_throttled_usecs / 1_000_000 - log.info("validate that the metric doesn't include throttle wait time") - assert duration_secs >= 10 * actual_smgr_query_seconds, ( - "smgr metrics should not include throttle wait time" - ) - - log.info("validate that the throttling wait time metrics is correct") assert pytest.approx(actual_throttled_secs + actual_smgr_query_seconds, 0.1) == duration_secs, ( - "most of the time in this test is spent throttled because the rate-limit's contribution to latency dominates" + "throttling and processing latency = total request time; this assert validates thi holds on average" ) - -throttle_config_with_field_fair_set = { - "task_kinds": ["PageRequestHandler"], - "fair": True, - "initial": 27, - "refill_interval": "43s", - "refill_amount": 23, - "max": 42, -} - - -def assert_throttle_config_with_field_fair_set(conf): - """ - Field `fair` is ignored, so, responses don't contain it - """ - without_fair = copy.deepcopy(throttle_config_with_field_fair_set) - without_fair.pop("fair") - - assert conf == without_fair - - -def test_throttle_fair_config_is_settable_but_ignored_in_mgmt_api(neon_env_builder: NeonEnvBuilder): - """ - To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out. - """ - env = neon_env_builder.init_start() - vps_http = env.storage_controller.pageserver_api() - # with_fair config should still be settable - vps_http.set_tenant_config( - env.initial_tenant, - {"timeline_get_throttle": throttle_config_with_field_fair_set}, - ) - conf = vps_http.tenant_config(env.initial_tenant) - assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"]) - assert_throttle_config_with_field_fair_set( - conf.tenant_specific_overrides["timeline_get_throttle"] - ) - - -def test_throttle_fair_config_is_settable_but_ignored_in_config_toml( - neon_env_builder: NeonEnvBuilder, -): - """ - To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out. - """ - - def set_tenant_config(ps_cfg): - tenant_config = ps_cfg.setdefault("tenant_config", {}) - tenant_config["timeline_get_throttle"] = throttle_config_with_field_fair_set - - neon_env_builder.pageserver_config_override = set_tenant_config - env = neon_env_builder.init_start() - ps_http = env.pageserver.http_client() - conf = ps_http.tenant_config(env.initial_tenant) - assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"]) - - env.pageserver.allowed_errors.append( - r'.*ignoring unknown configuration item path="tenant_config\.timeline_get_throttle\.fair"*' + # without this assertion, the test would pass even if the throttling was completely broken + # but the request processing is so slow that it makes up for the latency that a correct throttling + # implementation would add + assert actual_smgr_query_seconds < 0.66 * duration_secs, ( + "test self-test: request processing is consuming most of the wall clock time; this risks that we're not actually testing throttling" ) diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index d03d05d33d..d48e731394 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -242,7 +242,13 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver, pageserver.tenant_location_configure(tenant_id, location_conf) last_state[pageserver.id] = (mode, generation) - if mode.startswith("Attached"): + # It's only valid to connect to the last generation. Newer generations may yank layer + # files used in older generations. + last_generation = max( + [s[1] for s in last_state.values() if s[1] is not None], default=None + ) + + if mode.startswith("Attached") and generation == last_generation: # This is a basic test: we are validating that he endpoint works properly _between_ # configuration changes. A stronger test would be to validate that clients see # no errors while we are making the changes. diff --git a/test_runner/regress/test_ssl.py b/test_runner/regress/test_ssl.py index 9a7204ca17..39c94c05a9 100644 --- a/test_runner/regress/test_ssl.py +++ b/test_runner/regress/test_ssl.py @@ -1,5 +1,6 @@ import os import ssl +from datetime import datetime, timedelta import pytest import requests @@ -151,3 +152,63 @@ def test_certificate_rotation(neon_env_builder: NeonEnvBuilder): requests.get(addr, verify=str(env.ssl_ca_file)).raise_for_status() cur_cert = ssl.get_server_certificate(("localhost", port)) assert cur_cert == sk_cert + + +def test_server_and_cert_metrics(neon_env_builder: NeonEnvBuilder): + """ + Test metrics exported from http/https server and tls cert reloader. + """ + neon_env_builder.use_https_pageserver_api = True + neon_env_builder.pageserver_config_override = "ssl_cert_reload_period='100 ms'" + env = neon_env_builder.init_start() + + env.pageserver.allowed_errors.append(".*Error reloading certificate.*") + + ps_client = env.pageserver.http_client() + + # 1. Test connection started metric. + filter_https = {"scheme": "https"} + old_https_conn_count = ( + ps_client.get_metric_value("http_server_connection_started_total", filter_https) or 0 + ) + + addr = f"https://localhost:{env.pageserver.service_port.https}/v1/status" + requests.get(addr, verify=str(env.ssl_ca_file)).raise_for_status() + + new_https_conn_count = ( + ps_client.get_metric_value("http_server_connection_started_total", filter_https) or 0 + ) + # The counter should increase after the request, + # but it may increase by more than one because of storcon requests. + assert new_https_conn_count > old_https_conn_count + + # 2. Test tls connection error. + # Request without specified CA cert file should fail. + with pytest.raises(requests.exceptions.SSLError): + requests.get(addr) + + tls_error_cnt = ( + ps_client.get_metric_value("http_server_connection_errors_total", {"type": "tls"}) or 0 + ) + assert tls_error_cnt == 1 + + # 3. Test expiration time metric. + expiration_time = datetime.fromtimestamp( + ps_client.get_metric_value("tls_certs_expiration_time_seconds") or 0 + ) + now = datetime.now() + # neon_local generates certs valid for 100 years. + # Compare with +-1 year to not care about leap years. + assert now + timedelta(days=365 * 99) < expiration_time < now + timedelta(days=365 * 101) + + # 4. Test cert reload failed metric. + reload_error_cnt = ps_client.get_metric_value("tls_certs_reload_failed_total") + assert reload_error_cnt == 0 + + os.remove(env.pageserver.workdir / "server.crt") + + def reload_failed(): + reload_error_cnt = ps_client.get_metric_value("tls_certs_reload_failed_total") or 0 + assert reload_error_cnt > 0 + + wait_until(reload_failed) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index ce73c9a738..b2c8415e9a 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -2892,10 +2892,12 @@ def test_storage_controller_leadership_transfer( ) +@pytest.mark.parametrize("step_down_times_out", [False, True]) def test_storage_controller_leadership_transfer_during_split( neon_env_builder: NeonEnvBuilder, storage_controller_proxy: StorageControllerProxy, port_distributor: PortDistributor, + step_down_times_out: bool, ): """ Exercise a race between shard splitting and graceful leadership transfer. This is @@ -2936,6 +2938,18 @@ def test_storage_controller_leadership_transfer_during_split( ) env.storage_controller.reconcile_until_idle() + # We are testing scenarios where the step down API does not complete: either because it is stuck + # doing a shard split, or because it totally times out on some other failpoint. + env.storage_controller.allowed_errors.extend( + [ + ".*step_down.*request was dropped before completing.*", + ".*step_down.*operation timed out.*", + ".*Send step down request failed, will retry.*", + ".*Send step down request still failed after.*retries.*", + ".*Leader .+ did not respond to step-down request.*", + ] + ) + with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: # Start a shard split env.storage_controller.allowed_errors.extend( @@ -2943,6 +2957,14 @@ def test_storage_controller_leadership_transfer_during_split( ) pause_failpoint = "shard-split-pre-complete" env.storage_controller.configure_failpoints((pause_failpoint, "pause")) + + if not step_down_times_out: + # Prevent the timeout self-terminate code from executing: we will block step down on the + # shard split itself + env.storage_controller.configure_failpoints( + ("step-down-delay-timeout", "return(3600000)") + ) + split_fut = executor.submit( env.storage_controller.tenant_shard_split, list(tenants)[0], shard_count * 2 ) @@ -2961,12 +2983,20 @@ def test_storage_controller_leadership_transfer_during_split( timeout_in_seconds=30, instance_id=2, base_port=storage_controller_2_port ) + if step_down_times_out: + # Step down will time out, original controller will terminate itself + env.storage_controller.allowed_errors.extend([".*terminating process.*"]) + else: + # Step down does not time out: original controller hits its shard split completion + # code path and realises that it must not purge the parent shards from the database. + env.storage_controller.allowed_errors.extend([".*Enqueuing background abort.*"]) + def passed_split_abort(): try: log.info("Checking log for pattern...") - assert env.storage_controller.log_contains( - ".*Using observed state received from leader.*" - ) + # This log is indicative of entering startup_reconcile, which happens + # after the point we would abort shard splits + assert env.storage_controller.log_contains(".*Populating tenant shards.*") except Exception: log.exception("Failed to find pattern in log") raise @@ -2975,34 +3005,42 @@ def test_storage_controller_leadership_transfer_during_split( wait_until(passed_split_abort, interval=0.1, status_interval=1.0) assert env.storage_controller.log_contains(".*Aborting shard split.*") - # Proxy is still talking to original controller here: disable its pause failpoint so - # that its shard split can run to completion. - log.info("Disabling failpoint") - # Bypass the proxy: the python test HTTPServer is single threaded and still blocked - # on handling the shard split request. - env.storage_controller.request( - "PUT", - f"http://127.0.0.1:{storage_controller_1_port}/debug/v1/failpoints", - json=[{"name": "shard-split-pre-complete", "actions": "off"}], - headers=env.storage_controller.headers(TokenScope.ADMIN), - ) + if step_down_times_out: + # We will let the old controller hit a timeout path where it terminates itself, rather than + # completing step_down and trying to complete a shard split + def old_controller_terminated(): + assert env.storage_controller.log_contains(".*terminating process.*") - def previous_stepped_down(): - assert ( - env.storage_controller.get_leadership_status() - == StorageControllerLeadershipStatus.STEPPED_DOWN + wait_until(old_controller_terminated) + else: + # Proxy is still talking to original controller here: disable its pause failpoint so + # that its shard split can run to completion. + log.info("Disabling failpoint") + # Bypass the proxy: the python test HTTPServer is single threaded and still blocked + # on handling the shard split request. + env.storage_controller.request( + "PUT", + f"http://127.0.0.1:{storage_controller_1_port}/debug/v1/failpoints", + json=[{"name": "shard-split-pre-complete", "actions": "off"}], + headers=env.storage_controller.headers(TokenScope.ADMIN), ) - log.info("Awaiting step down") - wait_until(previous_stepped_down) + def previous_stepped_down(): + assert ( + env.storage_controller.get_leadership_status() + == StorageControllerLeadershipStatus.STEPPED_DOWN + ) - # Let the shard split complete: this may happen _after_ the replacement has come up - # and tried to clean up the databases - log.info("Unblocking & awaiting shard split") - with pytest.raises(Exception, match="Unexpected child shard count"): - # This split fails when it tries to persist results, because it encounters - # changes already made by the new controller's abort-on-startup - split_fut.result() + log.info("Awaiting step down") + wait_until(previous_stepped_down) + + # Let the shard split complete: this may happen _after_ the replacement has come up + # and tried to clean up the databases + log.info("Unblocking & awaiting shard split") + with pytest.raises(Exception, match="Unexpected child shard count"): + # This split fails when it tries to persist results, because it encounters + # changes already made by the new controller's abort-on-startup + split_fut.result() log.info("Routing to new leader") storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_2_port}") @@ -3020,13 +3058,14 @@ def test_storage_controller_leadership_transfer_during_split( env.storage_controller.wait_until_ready() env.storage_controller.consistency_check() - # Check that the stepped down instance forwards requests - # to the new leader while it's still running. - storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}") - env.storage_controller.tenant_shard_dump() - env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"}) - status = env.storage_controller.node_status(env.pageservers[0].id) - assert status["scheduling"] == "Pause" + if not step_down_times_out: + # Check that the stepped down instance forwards requests + # to the new leader while it's still running. + storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}") + env.storage_controller.tenant_shard_dump() + env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"}) + status = env.storage_controller.node_status(env.pageservers[0].id) + assert status["scheduling"] == "Pause" def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvBuilder): diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index e3d39f9315..a9a6699e5c 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -45,7 +45,7 @@ from fixtures.remote_storage import ( s3_storage, ) from fixtures.safekeeper.http import ( - Configuration, + MembershipConfiguration, SafekeeperHttpClient, SafekeeperId, TimelineCreateRequest, @@ -589,7 +589,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re for sk in env.safekeepers: sk.start() cli = sk.http_client() - mconf = Configuration(generation=0, members=[], new_members=None) + mconf = MembershipConfiguration(generation=0, members=[], new_members=None) # set start_lsn to the beginning of the first segment to allow reading # WAL from there (could you intidb LSN as well). r = TimelineCreateRequest( @@ -1948,7 +1948,7 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder): sk_id_2 = SafekeeperId(11, "localhost", 5434) # just a mock # Request to switch before timeline creation should fail. - init_conf = Configuration(generation=1, members=[sk_id_1], new_members=None) + init_conf = MembershipConfiguration(generation=1, members=[sk_id_1], new_members=None) with pytest.raises(requests.exceptions.HTTPError): http_cli.membership_switch(tenant_id, timeline_id, init_conf) @@ -1960,7 +1960,7 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder): http_cli.timeline_create(create_r) # Switch into some conf. - joint_conf = Configuration(generation=4, members=[sk_id_1], new_members=[sk_id_2]) + joint_conf = MembershipConfiguration(generation=4, members=[sk_id_1], new_members=[sk_id_2]) resp = http_cli.membership_switch(tenant_id, timeline_id, joint_conf) log.info(f"joint switch resp: {resp}") assert resp.previous_conf.generation == 1 @@ -1973,24 +1973,26 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder): assert after_restart.generation == 4 # Switch into non joint conf of which sk is not a member, must fail. - non_joint_not_member = Configuration(generation=5, members=[sk_id_2], new_members=None) + non_joint_not_member = MembershipConfiguration( + generation=5, members=[sk_id_2], new_members=None + ) with pytest.raises(requests.exceptions.HTTPError): resp = http_cli.membership_switch(tenant_id, timeline_id, non_joint_not_member) # Switch into good non joint conf. - non_joint = Configuration(generation=6, members=[sk_id_1], new_members=None) + non_joint = MembershipConfiguration(generation=6, members=[sk_id_1], new_members=None) resp = http_cli.membership_switch(tenant_id, timeline_id, non_joint) log.info(f"non joint switch resp: {resp}") assert resp.previous_conf.generation == 4 assert resp.current_conf.generation == 6 # Switch request to lower conf should be rejected. - lower_conf = Configuration(generation=3, members=[sk_id_1], new_members=None) + lower_conf = MembershipConfiguration(generation=3, members=[sk_id_1], new_members=None) with pytest.raises(requests.exceptions.HTTPError): http_cli.membership_switch(tenant_id, timeline_id, lower_conf) # Now, exclude sk from the membership, timeline should be deleted. - excluded_conf = Configuration(generation=7, members=[sk_id_2], new_members=None) + excluded_conf = MembershipConfiguration(generation=7, members=[sk_id_2], new_members=None) http_cli.timeline_exclude(tenant_id, timeline_id, excluded_conf) with pytest.raises(requests.exceptions.HTTPError): http_cli.timeline_status(tenant_id, timeline_id) @@ -2010,11 +2012,6 @@ def test_explicit_timeline_creation(neon_env_builder: NeonEnvBuilder): tenant_id = env.initial_tenant timeline_id = env.initial_timeline - ps = env.pageservers[0] - ps_http_cli = ps.http_client() - - http_clis = [sk.http_client() for sk in env.safekeepers] - config_lines = [ "neon.safekeeper_proto_version = 3", ] @@ -2023,22 +2020,11 @@ def test_explicit_timeline_creation(neon_env_builder: NeonEnvBuilder): # expected to fail because timeline is not created on safekeepers with pytest.raises(Exception, match=r".*timed out.*"): ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3], timeout="2s") - # figure out initial LSN. - ps_timeline_detail = ps_http_cli.timeline_detail(tenant_id, timeline_id) - init_lsn = ps_timeline_detail["last_record_lsn"] - log.info(f"initial LSN: {init_lsn}") - # sk timeline creation request expects minor version - pg_version = ps_timeline_detail["pg_version"] * 10000 # create inital mconf - sk_ids = [SafekeeperId(sk.id, "localhost", sk.port.pg_tenant_only) for sk in env.safekeepers] - mconf = Configuration(generation=1, members=sk_ids, new_members=None) - create_r = TimelineCreateRequest( - tenant_id, timeline_id, mconf, pg_version, Lsn(init_lsn), commit_lsn=None + mconf = MembershipConfiguration( + generation=1, members=Safekeeper.sks_to_safekeeper_ids(env.safekeepers), new_members=None ) - log.info(f"sending timeline create: {create_r.to_json()}") - - for sk_http_cli in http_clis: - sk_http_cli.timeline_create(create_r) + Safekeeper.create_timeline(tenant_id, timeline_id, env.pageservers[0], mconf, env.safekeepers) # Once timeline created endpoint should start. ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3]) ep.safe_psql("CREATE TABLE IF NOT EXISTS t(key int, value text)") diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index b7c7478e78..c5dd34f64f 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -18,6 +18,7 @@ from fixtures.neon_fixtures import ( Safekeeper, ) from fixtures.remote_storage import RemoteStorageKind +from fixtures.safekeeper.http import MembershipConfiguration from fixtures.utils import skip_in_debug_build if TYPE_CHECKING: @@ -452,20 +453,24 @@ def test_concurrent_computes(neon_env_builder: NeonEnvBuilder): asyncio.run(run_concurrent_computes(env)) +async def assert_query_hangs(endpoint: Endpoint, query: str): + """ + Start on endpoint query which is expected to hang and check that it does. + """ + conn = await endpoint.connect_async() + bg_query = asyncio.create_task(conn.execute(query)) + await asyncio.sleep(2) + assert not bg_query.done() + return bg_query + + # Stop safekeeper and check that query cannot be executed while safekeeper is down. # Query will insert a single row into a table. -async def check_unavailability( - sk: Safekeeper, conn: asyncpg.Connection, key: int, start_delay_sec: int = 2 -): +async def check_unavailability(sk: Safekeeper, ep: Endpoint, key: int, start_delay_sec: int = 2): # shutdown one of two acceptors, that is, majority sk.stop() - bg_query = asyncio.create_task(conn.execute(f"INSERT INTO t values ({key}, 'payload')")) - - await asyncio.sleep(start_delay_sec) - # ensure that the query has not been executed yet - assert not bg_query.done() - + bg_query = await assert_query_hangs(ep, f"INSERT INTO t values ({key}, 'payload')") # start safekeeper and await the query sk.start() await bg_query @@ -480,10 +485,10 @@ async def run_unavailability(env: NeonEnv, endpoint: Endpoint): await conn.execute("INSERT INTO t values (1, 'payload')") # stop safekeeper and check that query cannot be executed while safekeeper is down - await check_unavailability(env.safekeepers[0], conn, 2) + await check_unavailability(env.safekeepers[0], endpoint, 2) # for the world's balance, do the same with second safekeeper - await check_unavailability(env.safekeepers[1], conn, 3) + await check_unavailability(env.safekeepers[1], endpoint, 3) # check that we can execute queries after restart await conn.execute("INSERT INTO t values (4, 'payload')") @@ -514,15 +519,7 @@ async def run_recovery_uncommitted(env: NeonEnv): # insert with only one safekeeper up to create tail of flushed but not committed WAL sk1.stop() sk2.stop() - conn = await ep.connect_async() - # query should hang, so execute in separate task - bg_query = asyncio.create_task( - conn.execute("insert into t select generate_series(1, 2000), 'payload'") - ) - sleep_sec = 2 - await asyncio.sleep(sleep_sec) - # it must still be not finished - assert not bg_query.done() + await assert_query_hangs(ep, "insert into t select generate_series(1, 2000), 'payload'") # note: destoy will kill compute_ctl, preventing it waiting for hanging sync-safekeepers. ep.stop_and_destroy() @@ -559,15 +556,7 @@ async def run_wal_truncation(env: NeonEnv, safekeeper_proto_version: int): # insert with only one sk3 up to create tail of flushed but not committed WAL on it sk1.stop() sk2.stop() - conn = await ep.connect_async() - # query should hang, so execute in separate task - bg_query = asyncio.create_task( - conn.execute("insert into t select generate_series(1, 180000), 'Papaya'") - ) - sleep_sec = 2 - await asyncio.sleep(sleep_sec) - # it must still be not finished - assert not bg_query.done() + await assert_query_hangs(ep, "insert into t select generate_series(1, 180000), 'Papaya'") # note: destoy will kill compute_ctl, preventing it waiting for hanging sync-safekeepers. ep.stop_and_destroy() @@ -607,6 +596,132 @@ def test_wal_truncation(neon_env_builder: NeonEnvBuilder, safekeeper_proto_versi asyncio.run(run_wal_truncation(env, safekeeper_proto_version)) +async def quorum_sanity_single( + env: NeonEnv, + compute_sks_ids: list[int], + members_sks_ids: list[int], + new_members_sks_ids: list[int] | None, + sks_to_stop_ids: list[int], + should_work_when_stopped: bool, +): + """ + *_ids params contain safekeeper node ids; it is assumed they are issued + from 1 and sequentially assigned to env.safekeepers. + """ + members_sks = [env.safekeepers[i - 1] for i in members_sks_ids] + new_members_sks = ( + [env.safekeepers[i - 1] for i in new_members_sks_ids] if new_members_sks_ids else None + ) + sks_to_stop = [env.safekeepers[i - 1] for i in sks_to_stop_ids] + + mconf = MembershipConfiguration( + generation=1, + members=Safekeeper.sks_to_safekeeper_ids(members_sks), + new_members=Safekeeper.sks_to_safekeeper_ids(new_members_sks) if new_members_sks else None, + ) + members_sks = Safekeeper.mconf_sks(env, mconf) + + tenant_id = env.initial_tenant + compute_sks_ids_str = "-".join([str(sk_id) for sk_id in compute_sks_ids]) + members_sks_ids_str = "-".join([str(sk.id) for sk in mconf.members]) + new_members_sks_ids_str = "-".join( + [str(sk.id) for sk in mconf.new_members] if mconf.new_members is not None else [] + ) + sks_to_stop_ids_str = "-".join([str(sk.id) for sk in sks_to_stop]) + log.info( + f"running quorum_sanity_single with compute_sks={compute_sks_ids_str}, members_sks={members_sks_ids_str}, new_members_sks={new_members_sks_ids_str}, sks_to_stop={sks_to_stop_ids_str}, should_work_when_stopped={should_work_when_stopped}" + ) + branch_name = f"test_quorum_single_c{compute_sks_ids_str}_m{members_sks_ids_str}_{new_members_sks_ids_str}_s{sks_to_stop_ids_str}" + timeline_id = env.create_branch(branch_name) + + # create timeline on `members_sks` + Safekeeper.create_timeline(tenant_id, timeline_id, env.pageservers[0], mconf, members_sks) + + config_lines = [ + "neon.safekeeper_proto_version = 3", + ] + ep = env.endpoints.create(branch_name, config_lines=config_lines) + ep.start(safekeeper_generation=1, safekeepers=compute_sks_ids) + ep.safe_psql("create table t(key int, value text)") + + # stop specified sks and check whether writes work + for sk in sks_to_stop: + sk.stop() + if should_work_when_stopped: + log.info("checking that writes still work") + ep.safe_psql("insert into t select generate_series(1, 100), 'Papaya'") + # restarting ep should also be fine + ep.stop() + ep.start() + ep.safe_psql("insert into t select generate_series(1, 100), 'plum'") + bg_query = None + else: + log.info("checking that writes hang") + bg_query = await assert_query_hangs( + ep, "insert into t select generate_series(1, 100), 'Papaya'" + ) + # start again; now they should work + for sk in sks_to_stop: + sk.start() + if bg_query: + log.info("awaiting query") + await bg_query + + +# It's a bit tempting to iterate over all possible combinations, but let's stick +# with this for now. +async def run_quorum_sanity(env: NeonEnv): + # 3 members, all up, should work + await quorum_sanity_single(env, [1, 2, 3], [1, 2, 3], None, [], True) + # 3 members, 2/3 up, should work + await quorum_sanity_single(env, [1, 2, 3], [1, 2, 3], None, [3], True) + # 3 members, 1/3 up, should not work + await quorum_sanity_single(env, [1, 2, 3], [1, 2, 3], None, [2, 3], False) + + # 3 members, all up, should work; wp redundantly talks to 4th. + await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], None, [], True) + # 3 members, all up, should work with wp talking to 2 of these 3 + plus one redundant + await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], None, [], True) + # 3 members, 2/3 up, could work but wp talks to different 3s, so it shouldn't + await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], None, [3], False) + + # joint conf of 1-2-3 and 4, all up, should work + await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [4], [], True) + # joint conf of 1-2-3 and 4, 4 down, shouldn't work + await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [4], [4], False) + + # joint conf of 1-2-3 and 2-3-4, all up, should work + await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [], True) + # joint conf of 1-2-3 and 2-3-4, 1 and 4 down, should work + await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [1, 4], True) + # joint conf of 1-2-3 and 2-3-4, 2 down, should work + await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [2], True) + # joint conf of 1-2-3 and 2-3-4, 3 down, should work + await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [3], True) + # joint conf of 1-2-3 and 2-3-4, 1 and 2 down, shouldn't work + await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [1, 2], False) + # joint conf of 1-2-3 and 2-3-4, 2 and 4 down, shouldn't work + await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [2, 4], False) + + # joint conf of 1-2-3 and 2-3-4 with wp talking to 2-3-4 only. + await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], [2, 3, 4], [], True) + # with 1 down should still be ok + await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], [2, 3, 4], [1], True) + # but with 2 down not ok + await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], [2, 3, 4], [2], False) + + +# Test various combinations of membership configurations / neon.safekeepers +# (list of safekeepers endpoint connects to) values / up & down safekeepers and +# check that endpont can start and write data when we have quorum and can't when +# we don't. +def test_quorum_sanity(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 4 + env = neon_env_builder.init_start() + + asyncio.run(run_quorum_sanity(env)) + + async def run_segment_init_failure(env: NeonEnv): env.create_branch("test_segment_init_failure") ep = env.endpoints.create_start("test_segment_init_failure")