Merge 2025-04-11 main commit 'c66444ea1538349d13ab5e87bca880394434004b' into yuchen/direct-io-delta-image-layer-write

This commit is contained in:
Christian Schwarz
2025-04-11 16:26:41 +02:00
44 changed files with 1694 additions and 519 deletions

1
Cargo.lock generated
View File

@@ -2837,6 +2837,7 @@ dependencies = [
"utils",
"uuid",
"workspace_hack",
"x509-cert",
]
[[package]]

View File

@@ -523,11 +523,14 @@ impl ComputeNode {
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
info!(
"starting compute for project {}, operation {}, tenant {}, timeline {}, features {:?}, spec.remote_extensions {:?}",
"starting compute for project {}, operation {}, tenant {}, timeline {}, project {}, branch {}, endpoint {}, features {:?}, spec.remote_extensions {:?}",
pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"),
pspec.spec.operation_uuid.as_deref().unwrap_or("None"),
pspec.tenant_id,
pspec.timeline_id,
pspec.spec.project_id.as_deref().unwrap_or("None"),
pspec.spec.branch_id.as_deref().unwrap_or("None"),
pspec.spec.endpoint_id.as_deref().unwrap_or("None"),
pspec.spec.features,
pspec.spec.remote_extensions,
);
@@ -631,19 +634,23 @@ impl ComputeNode {
});
}
// Configure and start rsyslog for HIPAA if necessary
if let ComputeAudit::Hipaa = pspec.spec.audit_log_level {
let remote_endpoint = std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string());
if remote_endpoint.is_empty() {
anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty");
// Configure and start rsyslog for compliance audit logging
match pspec.spec.audit_log_level {
ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
let remote_endpoint =
std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string());
if remote_endpoint.is_empty() {
anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty");
}
let log_directory_path = Path::new(&self.params.pgdata).join("log");
let log_directory_path = log_directory_path.to_string_lossy().to_string();
configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?;
// Launch a background task to clean up the audit logs
launch_pgaudit_gc(log_directory_path);
}
let log_directory_path = Path::new(&self.params.pgdata).join("log");
let log_directory_path = log_directory_path.to_string_lossy().to_string();
configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?;
// Launch a background task to clean up the audit logs
launch_pgaudit_gc(log_directory_path);
_ => {}
}
// Configure and start rsyslog for Postgres logs export

View File

@@ -89,6 +89,15 @@ pub fn write_postgres_conf(
escape_conf_value(&s.to_string())
)?;
}
if let Some(s) = &spec.project_id {
writeln!(file, "neon.project_id={}", escape_conf_value(s))?;
}
if let Some(s) = &spec.branch_id {
writeln!(file, "neon.branch_id={}", escape_conf_value(s))?;
}
if let Some(s) = &spec.endpoint_id {
writeln!(file, "neon.endpoint_id={}", escape_conf_value(s))?;
}
// tls
if let Some(tls_config) = tls_config {
@@ -169,7 +178,7 @@ pub fn write_postgres_conf(
// and don't allow the user or the control plane admin to change them.
match spec.audit_log_level {
ComputeAudit::Disabled => {}
ComputeAudit::Log => {
ComputeAudit::Log | ComputeAudit::Base => {
writeln!(file, "# Managed by compute_ctl base audit settings: start")?;
writeln!(file, "pgaudit.log='ddl,role'")?;
// Disable logging of catalog queries to reduce the noise
@@ -193,16 +202,20 @@ pub fn write_postgres_conf(
}
writeln!(file, "# Managed by compute_ctl base audit settings: end")?;
}
ComputeAudit::Hipaa => {
ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
writeln!(
file,
"# Managed by compute_ctl compliance audit settings: begin"
)?;
// This log level is very verbose
// but this is necessary for HIPAA compliance.
// Exclude 'misc' category, because it doesn't contain anythig relevant.
writeln!(file, "pgaudit.log='all, -misc'")?;
writeln!(file, "pgaudit.log_parameter=on")?;
// Enable logging of parameters.
// This is very verbose and may contain sensitive data.
if spec.audit_log_level == ComputeAudit::Full {
writeln!(file, "pgaudit.log_parameter=on")?;
writeln!(file, "pgaudit.log='all'")?;
} else {
writeln!(file, "pgaudit.log_parameter=off")?;
writeln!(file, "pgaudit.log='all, -misc'")?;
}
// Disable logging of catalog queries
// The catalog doesn't contain sensitive data, so we don't need to audit it.
writeln!(file, "pgaudit.log_catalog=off")?;

View File

@@ -278,12 +278,12 @@ impl ComputeNode {
// so that all config operations are audit logged.
match spec.audit_log_level
{
ComputeAudit::Hipaa => {
ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
phases.push(CreatePgauditExtension);
phases.push(CreatePgauditlogtofileExtension);
phases.push(DisablePostgresDBPgAudit);
}
ComputeAudit::Log => {
ComputeAudit::Log | ComputeAudit::Base => {
phases.push(CreatePgauditExtension);
phases.push(DisablePostgresDBPgAudit);
}

View File

@@ -658,6 +658,9 @@ impl Endpoint {
delta_operations: None,
tenant_id: Some(self.tenant_id),
timeline_id: Some(self.timeline_id),
project_id: None,
branch_id: None,
endpoint_id: Some(self.endpoint_id.clone()),
mode: self.mode,
pageserver_connstring: Some(pageserver_connstring),
safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),

View File

@@ -13,7 +13,9 @@ use pageserver_api::controller_api::{
NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
TenantCreateResponse, TenantLocateResponse,
};
use pageserver_api::models::{TenantConfigRequest, TimelineCreateRequest, TimelineInfo};
use pageserver_api::models::{
TenantConfig, TenantConfigRequest, TimelineCreateRequest, TimelineInfo,
};
use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
use postgres_backend::AuthType;
@@ -82,7 +84,8 @@ impl NeonStorageControllerStopArgs {
pub struct AttachHookRequest {
pub tenant_shard_id: TenantShardId,
pub node_id: Option<NodeId>,
pub generation_override: Option<i32>,
pub generation_override: Option<i32>, // only new tenants
pub config: Option<TenantConfig>, // only new tenants
}
#[derive(Serialize, Deserialize)]
@@ -805,6 +808,7 @@ impl StorageController {
tenant_shard_id,
node_id: Some(pageserver_id),
generation_override: None,
config: None,
};
let response = self

View File

@@ -104,6 +104,12 @@ pub struct ComputeSpec {
pub timeline_id: Option<TimelineId>,
pub pageserver_connstring: Option<String>,
// More neon ids that we expose to the compute_ctl
// and to postgres as neon extension GUCs.
pub project_id: Option<String>,
pub branch_id: Option<String>,
pub endpoint_id: Option<String>,
/// Safekeeper membership config generation. It is put in
/// neon.safekeepers GUC and serves two purposes:
/// 1) Non zero value forces walproposer to use membership configurations.
@@ -159,13 +165,7 @@ pub struct ComputeSpec {
#[serde(default)] // Default false
pub drop_subscriptions_before_start: bool,
/// Log level for audit logging:
///
/// Disabled - no audit logging. This is the default.
/// log - log masked statements to the postgres log using pgaudit extension
/// hipaa - log unmasked statements to the file using pgaudit and pgauditlogtofile extension
///
/// Extensions should be present in shared_preload_libraries
/// Log level for compute audit logging
#[serde(default)]
pub audit_log_level: ComputeAudit,
@@ -289,14 +289,25 @@ impl ComputeMode {
}
/// Log level for audit logging
/// Disabled, log, hipaa
/// Default is Disabled
#[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
pub enum ComputeAudit {
#[default]
Disabled,
// Deprecated, use Base instead
Log,
// (pgaudit.log = 'ddl', pgaudit.log_parameter='off')
// logged to the standard postgresql log stream
Base,
// Deprecated, use Full or Extended instead
Hipaa,
// (pgaudit.log = 'all, -misc', pgaudit.log_parameter='off')
// logged to separate files collected by rsyslog
// into dedicated log storage with strict access
Extended,
// (pgaudit.log='all', pgaudit.log_parameter='on'),
// logged to separate files collected by rsyslog
// into dedicated log storage with strict access.
Full,
}
#[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)]

View File

@@ -30,6 +30,7 @@ tokio.workspace = true
tracing.workspace = true
url.workspace = true
uuid.workspace = true
x509-cert.workspace = true
# to use tokio channels as streams, this is faster to compile than async_stream
# why is it only here? no other crate should use it, streams are rarely needed.

View File

@@ -4,6 +4,8 @@ use futures::StreamExt;
use futures::stream::FuturesUnordered;
use hyper0::Body;
use hyper0::server::conn::Http;
use metrics::{IntCounterVec, register_int_counter_vec};
use once_cell::sync::Lazy;
use routerify::{RequestService, RequestServiceBuilder};
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_rustls::TlsAcceptor;
@@ -26,6 +28,24 @@ pub struct Server {
tls_acceptor: Option<TlsAcceptor>,
}
static CONNECTION_STARTED_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"http_server_connection_started_total",
"Number of established http/https connections",
&["scheme"]
)
.expect("failed to define a metric")
});
static CONNECTION_ERROR_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"http_server_connection_errors_total",
"Number of occured connection errors by type",
&["type"]
)
.expect("failed to define a metric")
});
impl Server {
pub fn new(
request_service: Arc<RequestServiceBuilder<Body, ApiError>>,
@@ -60,6 +80,15 @@ impl Server {
false
}
let tcp_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tcp"]);
let tls_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tls"]);
let http_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["http"]);
let https_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["https"]);
let panic_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["panic"]);
let http_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["http"]);
let https_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["https"]);
let mut connections = FuturesUnordered::new();
loop {
tokio::select! {
@@ -67,6 +96,7 @@ impl Server {
let (tcp_stream, remote_addr) = match stream {
Ok(stream) => stream,
Err(err) => {
tcp_error_cnt.inc();
if !suppress_io_error(&err) {
info!("Failed to accept TCP connection: {err:#}");
}
@@ -78,11 +108,18 @@ impl Server {
let tls_acceptor = self.tls_acceptor.clone();
let cancel = cancel.clone();
let tls_error_cnt = tls_error_cnt.clone();
let http_error_cnt = http_error_cnt.clone();
let https_error_cnt = https_error_cnt.clone();
let http_connection_cnt = http_connection_cnt.clone();
let https_connection_cnt = https_connection_cnt.clone();
connections.push(tokio::spawn(
async move {
match tls_acceptor {
Some(tls_acceptor) => {
// Handle HTTPS connection.
https_connection_cnt.inc();
let tls_stream = tokio::select! {
tls_stream = tls_acceptor.accept(tcp_stream) => tls_stream,
_ = cancel.cancelled() => return,
@@ -90,6 +127,7 @@ impl Server {
let tls_stream = match tls_stream {
Ok(tls_stream) => tls_stream,
Err(err) => {
tls_error_cnt.inc();
if !suppress_io_error(&err) {
info!(%remote_addr, "Failed to accept TLS connection: {err:#}");
}
@@ -97,6 +135,7 @@ impl Server {
}
};
if let Err(err) = Self::serve_connection(tls_stream, service, cancel).await {
https_error_cnt.inc();
if !suppress_hyper_error(&err) {
info!(%remote_addr, "Failed to serve HTTPS connection: {err:#}");
}
@@ -104,7 +143,9 @@ impl Server {
}
None => {
// Handle HTTP connection.
http_connection_cnt.inc();
if let Err(err) = Self::serve_connection(tcp_stream, service, cancel).await {
http_error_cnt.inc();
if !suppress_hyper_error(&err) {
info!(%remote_addr, "Failed to serve HTTP connection: {err:#}");
}
@@ -115,6 +156,7 @@ impl Server {
}
Some(conn) = connections.next() => {
if let Err(err) = conn {
panic_error_cnt.inc();
error!("Connection panicked: {err:#}");
}
}
@@ -122,6 +164,7 @@ impl Server {
// Wait for graceful shutdown of all connections.
while let Some(conn) = connections.next().await {
if let Err(err) = conn {
panic_error_cnt.inc();
error!("Connection panicked: {err:#}");
}
}

View File

@@ -3,11 +3,14 @@ use std::{sync::Arc, time::Duration};
use anyhow::Context;
use arc_swap::ArcSwap;
use camino::Utf8Path;
use metrics::{IntCounterVec, UIntGaugeVec, register_int_counter_vec, register_uint_gauge_vec};
use once_cell::sync::Lazy;
use rustls::{
pki_types::{CertificateDer, PrivateKeyDer},
pki_types::{CertificateDer, PrivateKeyDer, UnixTime},
server::{ClientHello, ResolvesServerCert},
sign::CertifiedKey,
};
use x509_cert::der::Reader;
pub async fn load_cert_chain(filename: &Utf8Path) -> anyhow::Result<Vec<CertificateDer<'static>>> {
let cert_data = tokio::fs::read(filename)
@@ -53,6 +56,76 @@ pub async fn load_certified_key(
Ok(certified_key)
}
/// rustls's CertifiedKey with extra parsed fields used for metrics.
struct ParsedCertifiedKey {
certified_key: CertifiedKey,
expiration_time: UnixTime,
}
/// Parse expiration time from an X509 certificate.
fn parse_expiration_time(cert: &CertificateDer<'_>) -> anyhow::Result<UnixTime> {
let parsed_cert = x509_cert::der::SliceReader::new(cert)
.context("Failed to parse cerficiate")?
.decode::<x509_cert::Certificate>()
.context("Failed to parse cerficiate")?;
Ok(UnixTime::since_unix_epoch(
parsed_cert
.tbs_certificate
.validity
.not_after
.to_unix_duration(),
))
}
async fn load_and_parse_certified_key(
key_filename: &Utf8Path,
cert_filename: &Utf8Path,
) -> anyhow::Result<ParsedCertifiedKey> {
let certified_key = load_certified_key(key_filename, cert_filename).await?;
let expiration_time = parse_expiration_time(certified_key.end_entity_cert()?)?;
Ok(ParsedCertifiedKey {
certified_key,
expiration_time,
})
}
static CERT_EXPIRATION_TIME: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"tls_certs_expiration_time_seconds",
"Expiration time of the loaded certificate since unix epoch in seconds",
&["resolver_name"]
)
.expect("failed to define a metric")
});
static CERT_RELOAD_STARTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"tls_certs_reload_started_total",
"Number of certificate reload loop iterations started",
&["resolver_name"]
)
.expect("failed to define a metric")
});
static CERT_RELOAD_UPDATED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"tls_certs_reload_updated_total",
"Number of times the certificate was updated to the new one",
&["resolver_name"]
)
.expect("failed to define a metric")
});
static CERT_RELOAD_FAILED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"tls_certs_reload_failed_total",
"Number of times the certificate reload failed",
&["resolver_name"]
)
.expect("failed to define a metric")
});
/// Implementation of [`rustls::server::ResolvesServerCert`] which reloads certificates from
/// the disk periodically.
#[derive(Debug)]
@@ -63,16 +136,28 @@ pub struct ReloadingCertificateResolver {
impl ReloadingCertificateResolver {
/// Creates a new Resolver by loading certificate and private key from FS and
/// creating tokio::task to reload them with provided reload_period.
/// resolver_name is used as metric's label.
pub async fn new(
resolver_name: &str,
key_filename: &Utf8Path,
cert_filename: &Utf8Path,
reload_period: Duration,
) -> anyhow::Result<Arc<Self>> {
// Create metrics for current resolver.
let cert_expiration_time = CERT_EXPIRATION_TIME.with_label_values(&[resolver_name]);
let cert_reload_started_counter =
CERT_RELOAD_STARTED_COUNTER.with_label_values(&[resolver_name]);
let cert_reload_updated_counter =
CERT_RELOAD_UPDATED_COUNTER.with_label_values(&[resolver_name]);
let cert_reload_failed_counter =
CERT_RELOAD_FAILED_COUNTER.with_label_values(&[resolver_name]);
let parsed_key = load_and_parse_certified_key(key_filename, cert_filename).await?;
let this = Arc::new(Self {
certified_key: ArcSwap::from_pointee(
load_certified_key(key_filename, cert_filename).await?,
),
certified_key: ArcSwap::from_pointee(parsed_key.certified_key),
});
cert_expiration_time.set(parsed_key.expiration_time.as_secs());
tokio::spawn({
let weak_this = Arc::downgrade(&this);
@@ -88,17 +173,22 @@ impl ReloadingCertificateResolver {
Some(this) => this,
None => break, // Resolver has been destroyed, exit.
};
match load_certified_key(&key_filename, &cert_filename).await {
Ok(new_certified_key) => {
if new_certified_key.cert == this.certified_key.load().cert {
cert_reload_started_counter.inc();
match load_and_parse_certified_key(&key_filename, &cert_filename).await {
Ok(parsed_key) => {
if parsed_key.certified_key.cert == this.certified_key.load().cert {
tracing::debug!("Certificate has not changed since last reloading");
} else {
tracing::info!("Certificate has been reloaded");
this.certified_key.store(Arc::new(new_certified_key));
this.certified_key.store(Arc::new(parsed_key.certified_key));
cert_expiration_time.set(parsed_key.expiration_time.as_secs());
cert_reload_updated_counter.inc();
}
last_reload_failed = false;
}
Err(err) => {
cert_reload_failed_counter.inc();
// Note: Reloading certs may fail if it conflicts with the script updating
// the files at the same time. Warn only if the error is persistent.
if last_reload_failed {

View File

@@ -180,6 +180,7 @@ pub struct ConfigToml {
#[serde(skip_serializing_if = "Option::is_none")]
pub generate_unarchival_heatmap: Option<bool>,
pub tracing: Option<Tracing>,
pub enable_tls_page_service_api: bool,
}
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -631,6 +632,7 @@ impl Default for ConfigToml {
load_previous_heatmap: None,
generate_unarchival_heatmap: None,
tracing: None,
enable_tls_page_service_api: false,
}
}
}

View File

@@ -7,7 +7,8 @@ use std::time::{Duration, Instant};
/// API (`/control/v1` prefix). Implemented by the server
/// in [`storage_controller::http`]
use serde::{Deserialize, Serialize};
use utils::id::{NodeId, TenantId};
use utils::id::{NodeId, TenantId, TimelineId};
use utils::lsn::Lsn;
use crate::models::{PageserverUtilization, ShardParameters, TenantConfig};
use crate::shard::{ShardStripeSize, TenantShardId};
@@ -499,6 +500,15 @@ pub struct SafekeeperSchedulingPolicyRequest {
pub scheduling_policy: SkSchedulingPolicy,
}
/// Import request for safekeeper timelines.
#[derive(Serialize, Deserialize, Clone)]
pub struct TimelineImportRequest {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub start_lsn: Lsn,
pub sk_set: Vec<NodeId>,
}
#[cfg(test)]
mod test {
use serde_json;

View File

@@ -927,7 +927,7 @@ impl Key {
/// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`.
#[inline(always)]
pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> {
pub fn to_rel_block(self) -> Result<(RelTag, BlockNumber), ToRelBlockError> {
Ok(match self.field1 {
0x00 => (
RelTag {
@@ -938,7 +938,7 @@ impl Key {
},
self.field6,
),
_ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1),
_ => return Err(ToRelBlockError(self.field1)),
})
}
}
@@ -951,6 +951,17 @@ impl std::str::FromStr for Key {
}
}
#[derive(Debug)]
pub struct ToRelBlockError(u8);
impl fmt::Display for ToRelBlockError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "unexpected value kind 0x{:02x}", self.0)
}
}
impl std::error::Error for ToRelBlockError {}
#[cfg(test)]
mod tests {
use std::str::FromStr;

View File

@@ -452,6 +452,24 @@ fn start_pageserver(
info!("Using auth for http API: {:#?}", conf.http_auth_type);
info!("Using auth for pg connections: {:#?}", conf.pg_auth_type);
let tls_server_config = if conf.listen_https_addr.is_some() || conf.enable_tls_page_service_api
{
let resolver = BACKGROUND_RUNTIME.block_on(ReloadingCertificateResolver::new(
"main",
&conf.ssl_key_file,
&conf.ssl_cert_file,
conf.ssl_cert_reload_period,
))?;
let server_config = rustls::ServerConfig::builder()
.with_no_client_auth()
.with_cert_resolver(resolver);
Some(Arc::new(server_config))
} else {
None
};
match var("NEON_AUTH_TOKEN") {
Ok(v) => {
info!("Loaded JWT token for authentication with Safekeeper");
@@ -670,17 +688,11 @@ fn start_pageserver(
let https_task = match https_listener {
Some(https_listener) => {
let resolver = MGMT_REQUEST_RUNTIME.block_on(ReloadingCertificateResolver::new(
&conf.ssl_key_file,
&conf.ssl_cert_file,
conf.ssl_cert_reload_period,
))?;
let tls_server_config = tls_server_config
.clone()
.expect("tls_server_config is set earlier if https is enabled");
let server_config = rustls::ServerConfig::builder()
.with_no_client_auth()
.with_cert_resolver(resolver);
let tls_acceptor = tokio_rustls::TlsAcceptor::from(Arc::new(server_config));
let tls_acceptor = tokio_rustls::TlsAcceptor::from(tls_server_config);
let server =
http_utils::server::Server::new(service, https_listener, Some(tls_acceptor))?;
@@ -736,6 +748,11 @@ fn start_pageserver(
tokio::net::TcpListener::from_std(pageserver_listener)
.context("create tokio listener")?
},
if conf.enable_tls_page_service_api {
tls_server_config
} else {
None
},
);
// All started up! Now just sit and wait for shutdown signal.

View File

@@ -219,6 +219,11 @@ pub struct PageServerConf {
pub generate_unarchival_heatmap: bool,
pub tracing: Option<pageserver_api::config::Tracing>,
/// Enable TLS in page service API.
/// Does not force TLS: the client negotiates TLS usage during the handshake.
/// Uses key and certificate from ssl_key_file/ssl_cert_file.
pub enable_tls_page_service_api: bool,
}
/// Token for authentication to safekeepers
@@ -391,6 +396,7 @@ impl PageServerConf {
load_previous_heatmap,
generate_unarchival_heatmap,
tracing,
enable_tls_page_service_api,
} = config_toml;
let mut conf = PageServerConf {
@@ -441,6 +447,7 @@ impl PageServerConf {
page_service_pipelining,
get_vectored_concurrent_io,
tracing,
enable_tls_page_service_api,
// ------------------------------------------------------------
// fields that require additional validation or custom handling

View File

@@ -3253,7 +3253,7 @@ async fn ingest_aux_files(
modification
.put_file(&fname, content.as_bytes(), &ctx)
.await
.map_err(ApiError::InternalServerError)?;
.map_err(|e| ApiError::InternalServerError(e.into()))?;
}
modification
.commit(&ctx)

View File

@@ -27,7 +27,7 @@ use crate::context::RequestContext;
use crate::metrics::WAL_INGEST;
use crate::pgdatadir_mapping::*;
use crate::tenant::Timeline;
use crate::walingest::WalIngest;
use crate::walingest::{WalIngest, WalIngestErrorKind};
// Returns checkpoint LSN from controlfile
pub fn get_lsn_from_controlfile(path: &Utf8Path) -> Result<Lsn> {
@@ -157,9 +157,9 @@ async fn import_rel(
.put_rel_creation(rel, nblocks as u32, ctx)
.await
{
match e {
RelationError::AlreadyExists => {
debug!("Relation {} already exist. We must be extending it.", rel)
match e.kind {
WalIngestErrorKind::RelationAlreadyExists(rel) => {
debug!("Relation {rel} already exists. We must be extending it.")
}
_ => return Err(e.into()),
}

View File

@@ -105,6 +105,7 @@ pub fn spawn(
pg_auth: Option<Arc<SwappableJwtAuth>>,
perf_trace_dispatch: Option<Dispatch>,
tcp_listener: tokio::net::TcpListener,
tls_config: Option<Arc<rustls::ServerConfig>>,
) -> Listener {
let cancel = CancellationToken::new();
let libpq_ctx = RequestContext::todo_child(
@@ -124,6 +125,7 @@ pub fn spawn(
perf_trace_dispatch,
tcp_listener,
conf.pg_auth_type,
tls_config,
conf.page_service_pipelining.clone(),
libpq_ctx,
cancel.clone(),
@@ -181,6 +183,7 @@ pub async fn libpq_listener_main(
perf_trace_dispatch: Option<Dispatch>,
listener: tokio::net::TcpListener,
auth_type: AuthType,
tls_config: Option<Arc<rustls::ServerConfig>>,
pipelining_config: PageServicePipeliningConfig,
listener_ctx: RequestContext,
listener_cancel: CancellationToken,
@@ -223,6 +226,7 @@ pub async fn libpq_listener_main(
local_auth,
socket,
auth_type,
tls_config.clone(),
pipelining_config.clone(),
connection_ctx,
connections_cancel.child_token(),
@@ -264,6 +268,7 @@ async fn page_service_conn_main(
auth: Option<Arc<SwappableJwtAuth>>,
socket: tokio::net::TcpStream,
auth_type: AuthType,
tls_config: Option<Arc<rustls::ServerConfig>>,
pipelining_config: PageServicePipeliningConfig,
connection_ctx: RequestContext,
cancel: CancellationToken,
@@ -334,7 +339,8 @@ async fn page_service_conn_main(
cancel.clone(),
gate_guard,
);
let pgbackend = PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, None)?;
let pgbackend =
PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, tls_config)?;
match pgbackend.run(&mut conn_handler, &cancel).await {
Ok(()) => {

View File

@@ -9,8 +9,9 @@
use std::collections::{BTreeMap, HashMap, HashSet, hash_map};
use std::ops::{ControlFlow, Range};
use crate::PERF_TRACE_TARGET;
use anyhow::{Context, ensure};
use crate::walingest::{WalIngestError, WalIngestErrorKind};
use crate::{PERF_TRACE_TARGET, ensure_walingest};
use anyhow::Context;
use bytes::{Buf, Bytes, BytesMut};
use enum_map::Enum;
use itertools::Itertools;
@@ -136,12 +137,8 @@ impl From<PageReconstructError> for CalculateLogicalSizeError {
#[derive(Debug, thiserror::Error)]
pub enum RelationError {
#[error("Relation Already Exists")]
AlreadyExists,
#[error("invalid relnode")]
InvalidRelnode,
#[error(transparent)]
Other(#[from] anyhow::Error),
}
///
@@ -1478,8 +1475,8 @@ impl DatadirModification<'_> {
}
/// Set the current lsn
pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
ensure!(
pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> Result<(), WalIngestError> {
ensure_walingest!(
lsn >= self.lsn,
"setting an older lsn {} than {} is not allowed",
lsn,
@@ -1578,7 +1575,7 @@ impl DatadirModification<'_> {
&mut self,
rel: RelTag,
ctx: &RequestContext,
) -> Result<u32, PageReconstructError> {
) -> Result<u32, WalIngestError> {
// Get current size and put rel creation if rel doesn't exist
//
// NOTE: we check the cache first even though get_rel_exists and get_rel_size would
@@ -1593,14 +1590,13 @@ impl DatadirModification<'_> {
.await?
{
// create it with 0 size initially, the logic below will extend it
self.put_rel_creation(rel, 0, ctx)
.await
.context("Relation Error")?;
self.put_rel_creation(rel, 0, ctx).await?;
Ok(0)
} else {
self.tline
Ok(self
.tline
.get_rel_size(rel, Version::Modified(self), ctx)
.await
.await?)
}
}
@@ -1637,11 +1633,14 @@ impl DatadirModification<'_> {
// TODO(vlad): remove this argument and replace the shard check with is_key_local
shard: &ShardIdentity,
ctx: &RequestContext,
) -> anyhow::Result<()> {
) -> Result<(), WalIngestError> {
let mut gaps_at_lsns = Vec::default();
for meta in batch.metadata.iter() {
let (rel, blkno) = Key::from_compact(meta.key()).to_rel_block()?;
let key = Key::from_compact(meta.key());
let (rel, blkno) = key
.to_rel_block()
.map_err(|_| WalIngestErrorKind::InvalidKey(key, meta.lsn()))?;
let new_nblocks = blkno + 1;
let old_nblocks = self.create_relation_if_required(rel, ctx).await?;
@@ -1683,8 +1682,8 @@ impl DatadirModification<'_> {
rel: RelTag,
blknum: BlockNumber,
rec: NeonWalRecord,
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
) -> Result<(), WalIngestError> {
ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec));
Ok(())
}
@@ -1696,7 +1695,7 @@ impl DatadirModification<'_> {
segno: u32,
blknum: BlockNumber,
rec: NeonWalRecord,
) -> anyhow::Result<()> {
) -> Result<(), WalIngestError> {
if !self.tline.tenant_shard_id.is_shard_zero() {
return Ok(());
}
@@ -1714,14 +1713,11 @@ impl DatadirModification<'_> {
rel: RelTag,
blknum: BlockNumber,
img: Bytes,
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
) -> Result<(), WalIngestError> {
ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
let key = rel_block_to_key(rel, blknum);
if !key.is_valid_key_on_write_path() {
anyhow::bail!(
"the request contains data not supported by pageserver at {}",
key
);
Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
}
self.put(rel_block_to_key(rel, blknum), Value::Image(img));
Ok(())
@@ -1733,15 +1729,12 @@ impl DatadirModification<'_> {
segno: u32,
blknum: BlockNumber,
img: Bytes,
) -> anyhow::Result<()> {
) -> Result<(), WalIngestError> {
assert!(self.tline.tenant_shard_id.is_shard_zero());
let key = slru_block_to_key(kind, segno, blknum);
if !key.is_valid_key_on_write_path() {
anyhow::bail!(
"the request contains data not supported by pageserver at {}",
key
);
Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
}
self.put(key, Value::Image(img));
Ok(())
@@ -1751,15 +1744,11 @@ impl DatadirModification<'_> {
&mut self,
rel: RelTag,
blknum: BlockNumber,
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
) -> Result<(), WalIngestError> {
ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
let key = rel_block_to_key(rel, blknum);
if !key.is_valid_key_on_write_path() {
anyhow::bail!(
"the request contains data not supported by pageserver: {} @ {}",
key,
self.lsn
);
Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
}
let batch = self
@@ -1776,15 +1765,11 @@ impl DatadirModification<'_> {
kind: SlruKind,
segno: u32,
blknum: BlockNumber,
) -> anyhow::Result<()> {
) -> Result<(), WalIngestError> {
assert!(self.tline.tenant_shard_id.is_shard_zero());
let key = slru_block_to_key(kind, segno, blknum);
if !key.is_valid_key_on_write_path() {
anyhow::bail!(
"the request contains data not supported by pageserver: {} @ {}",
key,
self.lsn
);
Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
}
let batch = self
@@ -1832,8 +1817,10 @@ impl DatadirModification<'_> {
dbnode: Oid,
img: Bytes,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let v2_enabled = self.maybe_enable_rel_size_v2()?;
) -> Result<(), WalIngestError> {
let v2_enabled = self
.maybe_enable_rel_size_v2()
.map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;
// Add it to the directory (if it doesn't exist already)
let buf = self.get(DBDIR_KEY, ctx).await?;
@@ -1874,13 +1861,13 @@ impl DatadirModification<'_> {
xid: u64,
img: Bytes,
ctx: &RequestContext,
) -> anyhow::Result<()> {
) -> Result<(), WalIngestError> {
// Add it to the directory entry
let dirbuf = self.get(TWOPHASEDIR_KEY, ctx).await?;
let newdirbuf = if self.tline.pg_version >= 17 {
let mut dir = TwoPhaseDirectoryV17::des(&dirbuf)?;
if !dir.xids.insert(xid) {
anyhow::bail!("twophase file for xid {} already exists", xid);
Err(WalIngestErrorKind::FileAlreadyExists(xid))?;
}
self.pending_directory_entries.push((
DirectoryKind::TwoPhase,
@@ -1891,7 +1878,7 @@ impl DatadirModification<'_> {
let xid = xid as u32;
let mut dir = TwoPhaseDirectory::des(&dirbuf)?;
if !dir.xids.insert(xid) {
anyhow::bail!("twophase file for xid {} already exists", xid);
Err(WalIngestErrorKind::FileAlreadyExists(xid.into()))?;
}
self.pending_directory_entries.push((
DirectoryKind::TwoPhase,
@@ -1909,22 +1896,22 @@ impl DatadirModification<'_> {
&mut self,
origin_id: RepOriginId,
origin_lsn: Lsn,
) -> anyhow::Result<()> {
) -> Result<(), WalIngestError> {
let key = repl_origin_key(origin_id);
self.put(key, Value::Image(origin_lsn.ser().unwrap().into()));
Ok(())
}
pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> anyhow::Result<()> {
pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> Result<(), WalIngestError> {
self.set_replorigin(origin_id, Lsn::INVALID).await
}
pub fn put_control_file(&mut self, img: Bytes) -> anyhow::Result<()> {
pub fn put_control_file(&mut self, img: Bytes) -> Result<(), WalIngestError> {
self.put(CONTROLFILE_KEY, Value::Image(img));
Ok(())
}
pub fn put_checkpoint(&mut self, img: Bytes) -> anyhow::Result<()> {
pub fn put_checkpoint(&mut self, img: Bytes) -> Result<(), WalIngestError> {
self.put(CHECKPOINT_KEY, Value::Image(img));
Ok(())
}
@@ -1934,7 +1921,7 @@ impl DatadirModification<'_> {
spcnode: Oid,
dbnode: Oid,
ctx: &RequestContext,
) -> anyhow::Result<()> {
) -> Result<(), WalIngestError> {
let total_blocks = self
.tline
.get_db_size(spcnode, dbnode, Version::Modified(self), ctx)
@@ -1973,20 +1960,21 @@ impl DatadirModification<'_> {
rel: RelTag,
nblocks: BlockNumber,
ctx: &RequestContext,
) -> Result<(), RelationError> {
) -> Result<(), WalIngestError> {
if rel.relnode == 0 {
return Err(RelationError::InvalidRelnode);
Err(WalIngestErrorKind::LogicalError(anyhow::anyhow!(
"invalid relnode"
)))?;
}
// It's possible that this is the first rel for this db in this
// tablespace. Create the reldir entry for it if so.
let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?)
.context("deserialize db")?;
let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?;
let dbdir_exists =
if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) {
// Didn't exist. Update dbdir
e.insert(false);
let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
let buf = DbDirectory::ser(&dbdir)?;
self.pending_directory_entries.push((
DirectoryKind::Db,
MetricsUpdate::Set(dbdir.dbdirs.len() as u64),
@@ -2003,27 +1991,25 @@ impl DatadirModification<'_> {
RelDirectory::default()
} else {
// reldir already exists, fetch it
RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
.context("deserialize db")?
RelDirectory::des(&self.get(rel_dir_key, ctx).await?)?
};
let v2_enabled = self.maybe_enable_rel_size_v2()?;
let v2_enabled = self
.maybe_enable_rel_size_v2()
.map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;
if v2_enabled {
if rel_dir.rels.contains(&(rel.relnode, rel.forknum)) {
return Err(RelationError::AlreadyExists);
Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
}
let sparse_rel_dir_key =
rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum);
// check if the rel_dir_key exists in v2
let val = self
.sparse_get(sparse_rel_dir_key, ctx)
.await
.map_err(|e| RelationError::Other(e.into()))?;
let val = self.sparse_get(sparse_rel_dir_key, ctx).await?;
let val = RelDirExists::decode_option(val)
.map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?;
.map_err(|_| WalIngestErrorKind::InvalidRelDirKey(sparse_rel_dir_key))?;
if val == RelDirExists::Exists {
return Err(RelationError::AlreadyExists);
Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
}
self.put(
sparse_rel_dir_key,
@@ -2039,9 +2025,7 @@ impl DatadirModification<'_> {
// will be key not found errors if we don't create an empty one for rel_size_v2.
self.put(
rel_dir_key,
Value::Image(Bytes::from(
RelDirectory::ser(&RelDirectory::default()).context("serialize")?,
)),
Value::Image(Bytes::from(RelDirectory::ser(&RelDirectory::default())?)),
);
}
self.pending_directory_entries
@@ -2049,7 +2033,7 @@ impl DatadirModification<'_> {
} else {
// Add the new relation to the rel directory entry, and write it back
if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
return Err(RelationError::AlreadyExists);
Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
}
if !dbdir_exists {
self.pending_directory_entries
@@ -2059,9 +2043,7 @@ impl DatadirModification<'_> {
.push((DirectoryKind::Rel, MetricsUpdate::Add(1)));
self.put(
rel_dir_key,
Value::Image(Bytes::from(
RelDirectory::ser(&rel_dir).context("serialize")?,
)),
Value::Image(Bytes::from(RelDirectory::ser(&rel_dir)?)),
);
}
@@ -2086,8 +2068,8 @@ impl DatadirModification<'_> {
rel: RelTag,
nblocks: BlockNumber,
ctx: &RequestContext,
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
) -> Result<(), WalIngestError> {
ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
if self
.tline
.get_rel_exists(rel, Version::Modified(self), ctx)
@@ -2117,8 +2099,8 @@ impl DatadirModification<'_> {
rel: RelTag,
nblocks: BlockNumber,
ctx: &RequestContext,
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
) -> Result<(), WalIngestError> {
ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
// Put size
let size_key = rel_size_to_key(rel);
@@ -2142,8 +2124,10 @@ impl DatadirModification<'_> {
&mut self,
drop_relations: HashMap<(u32, u32), Vec<RelTag>>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let v2_enabled = self.maybe_enable_rel_size_v2()?;
) -> Result<(), WalIngestError> {
let v2_enabled = self
.maybe_enable_rel_size_v2()
.map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;
for ((spc_node, db_node), rel_tags) in drop_relations {
let dir_key = rel_dir_to_key(spc_node, db_node);
let buf = self.get(dir_key, ctx).await?;
@@ -2163,7 +2147,7 @@ impl DatadirModification<'_> {
let key =
rel_tag_sparse_key(spc_node, db_node, rel_tag.relnode, rel_tag.forknum);
let val = RelDirExists::decode_option(self.sparse_get(key, ctx).await?)
.map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?;
.map_err(|_| WalIngestErrorKind::InvalidKey(key, self.lsn))?;
if val == RelDirExists::Exists {
self.pending_directory_entries
.push((DirectoryKind::RelV2, MetricsUpdate::Sub(1)));
@@ -2206,7 +2190,7 @@ impl DatadirModification<'_> {
segno: u32,
nblocks: BlockNumber,
ctx: &RequestContext,
) -> anyhow::Result<()> {
) -> Result<(), WalIngestError> {
assert!(self.tline.tenant_shard_id.is_shard_zero());
// Add it to the directory entry
@@ -2215,7 +2199,7 @@ impl DatadirModification<'_> {
let mut dir = SlruSegmentDirectory::des(&buf)?;
if !dir.segments.insert(segno) {
anyhow::bail!("slru segment {kind:?}/{segno} already exists");
Err(WalIngestErrorKind::SlruAlreadyExists(kind, segno))?;
}
self.pending_directory_entries.push((
DirectoryKind::SlruSegment(kind),
@@ -2242,7 +2226,7 @@ impl DatadirModification<'_> {
kind: SlruKind,
segno: u32,
nblocks: BlockNumber,
) -> anyhow::Result<()> {
) -> Result<(), WalIngestError> {
assert!(self.tline.tenant_shard_id.is_shard_zero());
// Put size
@@ -2258,7 +2242,7 @@ impl DatadirModification<'_> {
kind: SlruKind,
segno: u32,
ctx: &RequestContext,
) -> anyhow::Result<()> {
) -> Result<(), WalIngestError> {
// Remove it from the directory entry
let dir_key = slru_dir_to_key(kind);
let buf = self.get(dir_key, ctx).await?;
@@ -2283,7 +2267,7 @@ impl DatadirModification<'_> {
}
/// Drop a relmapper file (pg_filenode.map)
pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> anyhow::Result<()> {
pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> Result<(), WalIngestError> {
// TODO
Ok(())
}
@@ -2293,7 +2277,7 @@ impl DatadirModification<'_> {
&mut self,
xid: u64,
ctx: &RequestContext,
) -> anyhow::Result<()> {
) -> Result<(), WalIngestError> {
// Remove it from the directory entry
let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
let newdirbuf = if self.tline.pg_version >= 17 {
@@ -2308,7 +2292,8 @@ impl DatadirModification<'_> {
));
Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?)
} else {
let xid: u32 = u32::try_from(xid)?;
let xid: u32 = u32::try_from(xid)
.map_err(|e| WalIngestErrorKind::LogicalError(anyhow::Error::from(e)))?;
let mut dir = TwoPhaseDirectory::des(&buf)?;
if !dir.xids.remove(&xid) {
@@ -2333,7 +2318,7 @@ impl DatadirModification<'_> {
path: &str,
content: &[u8],
ctx: &RequestContext,
) -> anyhow::Result<()> {
) -> Result<(), WalIngestError> {
let key = aux_file::encode_aux_file_key(path);
// retrieve the key from the engine
let old_val = match self.get(key, ctx).await {
@@ -2342,7 +2327,7 @@ impl DatadirModification<'_> {
Err(e) => return Err(e.into()),
};
let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val {
aux_file::decode_file_value(old_val)?
aux_file::decode_file_value(old_val).map_err(WalIngestErrorKind::EncodeAuxFileError)?
} else {
Vec::new()
};
@@ -2387,7 +2372,8 @@ impl DatadirModification<'_> {
}
(None, true) => warn!("removing non-existing aux file: {}", path),
}
let new_val = aux_file::encode_file_value(&new_files)?;
let new_val = aux_file::encode_file_value(&new_files)
.map_err(WalIngestErrorKind::EncodeAuxFileError)?;
self.put(key, Value::Image(new_val.into()));
Ok(())

View File

@@ -11571,6 +11571,99 @@ mod tests {
Ok(())
}
#[cfg(feature = "testing")]
#[tokio::test]
async fn test_bottom_most_compation_redo_failure() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_bottom_most_compation_redo_failure").await?;
let (tenant, ctx) = harness.load().await;
fn get_key(id: u32) -> Key {
// using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
key.field6 = id;
key
}
let img_layer = (0..10)
.map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
.collect_vec();
let delta1 = vec![
(
get_key(1),
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
),
(
get_key(1),
Lsn(0x24),
Value::WalRecord(NeonWalRecord::wal_append("@0x24")),
),
(
get_key(1),
Lsn(0x28),
// This record will fail to redo
Value::WalRecord(NeonWalRecord::wal_append_conditional("@0x28", "???")),
),
];
let tline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
vec![], // in-memory layers
vec![DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x20)..Lsn(0x30),
delta1,
)], // delta layers
vec![(Lsn(0x10), img_layer)], // image layers
Lsn(0x50),
)
.await?;
{
tline
.applied_gc_cutoff_lsn
.lock_for_write()
.store_and_unlock(Lsn(0x30))
.wait()
.await;
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
*guard = GcInfo {
retain_lsns: vec![],
cutoffs: GcCutoffs {
time: Lsn(0x30),
space: Lsn(0x30),
},
leases: Default::default(),
within_ancestor_pitr: false,
};
}
let cancel = CancellationToken::new();
// Compaction will fail, but should not fire any critical error.
// Gc-compaction currently cannot figure out what keys are not in the keyspace during the compaction
// process. It will always try to redo the logs it reads and if it doesn't work, fail the entire
// compaction job. Tracked in <https://github.com/neondatabase/neon/issues/10395>.
let res = tline
.compact_with_gc(
&cancel,
CompactOptions {
compact_key_range: None,
compact_lsn_range: None,
..Default::default()
},
&ctx,
)
.await;
assert!(res.is_err());
Ok(())
}
#[cfg(feature = "testing")]
#[tokio::test]
async fn test_synthetic_size_calculation_with_invisible_branches() -> anyhow::Result<()> {

View File

@@ -1269,7 +1269,7 @@ mod test {
// This key range contains several 0x8000 page stripes, only one of which belongs to shard zero
let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
let input_end = Key::from_hex("000000067f00000001000000ae0000002000").unwrap();
let range = input_start..input_end;
// Build an image layer to filter
@@ -1314,7 +1314,7 @@ mod test {
let shard_identity = ShardIdentity::new(
ShardNumber(shard_number),
shard_count,
ShardStripeSize(0x8000),
ShardStripeSize(0x800),
)
.unwrap();
let harness = TenantHarness::create_custom(
@@ -1368,12 +1368,12 @@ mod test {
// This exact size and those below will need updating as/when the layer encoding changes, but
// should be deterministic for a given version of the format, as we used no randomness generating the input.
assert_eq!(original_size, 1597440);
assert_eq!(original_size, 122880);
match shard_number {
0 => {
// We should have written out just one stripe for our shard identity
assert_eq!(wrote_keys, 0x8000);
assert_eq!(wrote_keys, 0x800);
let replacement = replacement.unwrap();
// We should have dropped some of the data
@@ -1381,7 +1381,7 @@ mod test {
assert!(replacement.metadata().file_size > 0);
// Assert that we dropped ~3/4 of the data.
assert_eq!(replacement.metadata().file_size, 417792);
assert_eq!(replacement.metadata().file_size, 49152);
}
1 => {
// Shard 1 has no keys in our input range
@@ -1390,19 +1390,19 @@ mod test {
}
2 => {
// Shard 2 has one stripes in the input range
assert_eq!(wrote_keys, 0x8000);
assert_eq!(wrote_keys, 0x800);
let replacement = replacement.unwrap();
assert!(replacement.metadata().file_size < original_size);
assert!(replacement.metadata().file_size > 0);
assert_eq!(replacement.metadata().file_size, 417792);
assert_eq!(replacement.metadata().file_size, 49152);
}
3 => {
// Shard 3 has two stripes in the input range
assert_eq!(wrote_keys, 0x10000);
assert_eq!(wrote_keys, 0x1000);
let replacement = replacement.unwrap();
assert!(replacement.metadata().file_size < original_size);
assert!(replacement.metadata().file_size > 0);
assert_eq!(replacement.metadata().file_size, 811008);
assert_eq!(replacement.metadata().file_size, 73728);
}
_ => unreachable!(),
}

View File

@@ -1141,6 +1141,7 @@ impl Timeline {
) -> Result<(), CompactionError> {
let mut drop_layers = Vec::new();
let mut layers_to_rewrite: Vec<Layer> = Vec::new();
let mut rewrite_max_exceeded: bool = false;
// We will use the Lsn cutoff of the last GC as a threshold for rewriting layers: if a
// layer is behind this Lsn, it indicates that the layer is being retained beyond the
@@ -1149,12 +1150,7 @@ impl Timeline {
// Holding this read guard also blocks [`Self::gc_timeline`] from entering while we
// are rewriting layers.
let latest_gc_cutoff = self.get_applied_gc_cutoff_lsn();
tracing::info!(
"starting shard ancestor compaction, latest_gc_cutoff: {}, pitr cutoff {}",
*latest_gc_cutoff,
self.gc_info.read().unwrap().cutoffs.time
);
let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.time;
let layers = self.layers.read().await;
for layer_desc in layers.layer_map()?.iter_historic_layers() {
@@ -1172,8 +1168,8 @@ impl Timeline {
// This ancestral layer only covers keys that belong to other shards.
// We include the full metadata in the log: if we had some critical bug that caused
// us to incorrectly drop layers, this would simplify manually debugging + reinstating those layers.
info!(%layer, old_metadata=?layer.metadata(),
"dropping layer after shard split, contains no keys for this shard.",
debug!(%layer, old_metadata=?layer.metadata(),
"dropping layer after shard split, contains no keys for this shard",
);
if cfg!(debug_assertions) {
@@ -1235,9 +1231,10 @@ impl Timeline {
}
if layers_to_rewrite.len() >= rewrite_max {
tracing::info!(%layer, "Will rewrite layer on a future compaction, already rewrote {}",
debug!(%layer, "Will rewrite layer on a future compaction, already rewrote {}",
layers_to_rewrite.len()
);
rewrite_max_exceeded = true;
continue;
}
@@ -1245,9 +1242,24 @@ impl Timeline {
layers_to_rewrite.push(layer);
}
// Drop read lock on layer map before we start doing time-consuming I/O
// Drop read lock on layer map before we start doing time-consuming I/O.
drop(layers);
// Drop out early if there's nothing to do.
if layers_to_rewrite.is_empty() && drop_layers.is_empty() {
return Ok(());
}
info!(
"starting shard ancestor compaction, rewriting {} layers and dropping {} layers \
(latest_gc_cutoff={} pitr_cutoff={})",
layers_to_rewrite.len(),
drop_layers.len(),
*latest_gc_cutoff,
pitr_cutoff,
);
let started = Instant::now();
let mut replace_image_layers = Vec::new();
for layer in layers_to_rewrite {
@@ -1255,7 +1267,7 @@ impl Timeline {
return Err(CompactionError::ShuttingDown);
}
tracing::info!(layer=%layer, "Rewriting layer after shard split...");
info!(layer=%layer, "rewriting layer after shard split");
let mut image_layer_writer = ImageLayerWriter::new(
self.conf,
self.timeline_id,
@@ -1295,7 +1307,7 @@ impl Timeline {
.map_err(CompactionError::Other)?;
let new_layer = Layer::finish_creating(self.conf, self, desc, &path)
.map_err(CompactionError::Other)?;
tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes",
info!(layer=%new_layer, "rewrote layer, {} -> {} bytes",
layer.metadata().file_size,
new_layer.metadata().file_size);
@@ -1307,6 +1319,12 @@ impl Timeline {
}
}
for layer in &drop_layers {
info!(%layer, old_metadata=?layer.metadata(),
"dropping layer after shard split (no keys for this shard)",
);
}
// At this point, we have replaced local layer files with their rewritten form, but not yet uploaded
// metadata to reflect that. If we restart here, the replaced layer files will look invalid (size mismatch
// to remote index) and be removed. This is inefficient but safe.
@@ -1322,6 +1340,7 @@ impl Timeline {
// necessary for correctness, but it simplifies testing, and avoids proceeding with another
// Timeline's compaction while this timeline's uploads may be generating lots of disk I/O
// load.
info!("shard ancestor compaction waiting for uploads");
match self.remote_client.wait_completion().await {
Ok(()) => (),
Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)),
@@ -1330,6 +1349,15 @@ impl Timeline {
}
}
info!(
"shard ancestor compaction done in {:.3}s{}",
started.elapsed().as_secs_f64(),
match rewrite_max_exceeded {
true => format!(", more work pending due to rewrite_max={rewrite_max}"),
false => String::new(),
}
);
fail::fail_point!("compact-shard-ancestors-persistent");
Ok(())

View File

@@ -580,6 +580,7 @@ impl ConnectionManagerState {
);
Ok(())
}
WalReceiverError::Cancelled => Ok(()),
WalReceiverError::Other(e) => {
// give out an error to have task_mgr give it a really verbose logging
if cancellation.is_cancelled() {

View File

@@ -73,6 +73,7 @@ pub(super) enum WalReceiverError {
/// Generic error
Other(anyhow::Error),
ClosedGate,
Cancelled,
}
impl From<tokio_postgres::Error> for WalReceiverError {
@@ -200,6 +201,9 @@ pub(super) async fn handle_walreceiver_connection(
// with a similar error.
},
WalReceiverError::SuccessfulCompletion(_) => {}
WalReceiverError::Cancelled => {
debug!("Connection cancelled")
}
WalReceiverError::ClosedGate => {
// doesn't happen at runtime
}
@@ -273,7 +277,12 @@ pub(super) async fn handle_walreceiver_connection(
let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version);
let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx).await?;
let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx)
.await
.map_err(|e| match e.kind {
crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled,
_ => WalReceiverError::Other(e.into()),
})?;
let shard = vec![*timeline.get_shard_identity()];

View File

@@ -21,13 +21,13 @@
//! redo Postgres process, but some records it can handle directly with
//! bespoken Rust code.
use std::backtrace::Backtrace;
use std::collections::HashMap;
use std::sync::{Arc, OnceLock};
use std::time::{Duration, Instant, SystemTime};
use anyhow::{Result, bail};
use bytes::{Buf, Bytes};
use pageserver_api::key::rel_block_to_key;
use pageserver_api::key::{Key, rel_block_to_key};
use pageserver_api::record::NeonWalRecord;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use pageserver_api::shard::ShardIdentity;
@@ -38,7 +38,7 @@ use postgres_ffi::{
fsm_logical_to_physical, pg_constants,
};
use tracing::*;
use utils::bin_ser::SerializeError;
use utils::bin_ser::{DeserializeError, SerializeError};
use utils::lsn::Lsn;
use utils::rate_limit::RateLimit;
use utils::{critical, failpoint_support};
@@ -104,12 +104,101 @@ struct WarnIngestLag {
timestamp_invalid_msg_ratelimit: RateLimit,
}
pub struct WalIngestError {
pub backtrace: std::backtrace::Backtrace,
pub kind: WalIngestErrorKind,
}
#[derive(thiserror::Error, Debug)]
pub enum WalIngestErrorKind {
#[error(transparent)]
#[allow(private_interfaces)]
PageReconstructError(#[from] PageReconstructError),
#[error(transparent)]
DeserializationFailure(#[from] DeserializeError),
#[error(transparent)]
SerializationFailure(#[from] SerializeError),
#[error("the request contains data not supported by pageserver: {0} @ {1}")]
InvalidKey(Key, Lsn),
#[error("twophase file for xid {0} already exists")]
FileAlreadyExists(u64),
#[error("slru segment {0:?}/{1} already exists")]
SlruAlreadyExists(SlruKind, u32),
#[error("relation already exists")]
RelationAlreadyExists(RelTag),
#[error("invalid reldir key {0}")]
InvalidRelDirKey(Key),
#[error(transparent)]
LogicalError(anyhow::Error),
#[error(transparent)]
EncodeAuxFileError(anyhow::Error),
#[error(transparent)]
MaybeRelSizeV2Error(anyhow::Error),
#[error("timeline shutting down")]
Cancelled,
}
impl<T> From<T> for WalIngestError
where
WalIngestErrorKind: From<T>,
{
fn from(value: T) -> Self {
WalIngestError {
backtrace: Backtrace::capture(),
kind: WalIngestErrorKind::from(value),
}
}
}
impl std::error::Error for WalIngestError {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
self.kind.source()
}
}
impl core::fmt::Display for WalIngestError {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
self.kind.fmt(f)
}
}
impl core::fmt::Debug for WalIngestError {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
if f.alternate() {
f.debug_map()
.key(&"backtrace")
.value(&self.backtrace)
.key(&"kind")
.value(&self.kind)
.finish()
} else {
writeln!(f, "Error: {:?}", self.kind)?;
if self.backtrace.status() == std::backtrace::BacktraceStatus::Captured {
writeln!(f, "Stack backtrace: {:?}", self.backtrace)?;
}
Ok(())
}
}
}
#[macro_export]
macro_rules! ensure_walingest {
($($t:tt)*) => {
_ = || -> Result<(), anyhow::Error> {
anyhow::ensure!($($t)*);
Ok(())
}().map_err(WalIngestErrorKind::LogicalError)?;
};
}
impl WalIngest {
pub async fn new(
timeline: &Timeline,
startpoint: Lsn,
ctx: &RequestContext,
) -> anyhow::Result<WalIngest> {
) -> Result<WalIngest, WalIngestError> {
// Fetch the latest checkpoint into memory, so that we can compare with it
// quickly in `ingest_record` and update it when it changes.
let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?;
@@ -145,7 +234,7 @@ impl WalIngest {
interpreted: InterpretedWalRecord,
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
) -> anyhow::Result<bool> {
) -> Result<bool, WalIngestError> {
WAL_INGEST.records_received.inc();
let prev_len = modification.len();
@@ -288,7 +377,7 @@ impl WalIngest {
}
/// This is the same as AdjustToFullTransactionId(xid) in PostgreSQL
fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result<u64> {
fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result<u64, WalIngestError> {
let next_full_xid =
enum_pgversion_dispatch!(&self.checkpoint, CheckPoint, cp, { cp.nextXid.value });
@@ -298,9 +387,9 @@ impl WalIngest {
if xid > next_xid {
// Wraparound occurred, must be from a prev epoch.
if epoch == 0 {
bail!(
Err(WalIngestErrorKind::LogicalError(anyhow::anyhow!(
"apparent XID wraparound with prepared transaction XID {xid}, nextXid is {next_full_xid}"
);
)))?;
}
epoch -= 1;
}
@@ -313,7 +402,7 @@ impl WalIngest {
clear_vm_bits: ClearVmBits,
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
) -> Result<(), WalIngestError> {
let ClearVmBits {
new_heap_blkno,
old_heap_blkno,
@@ -402,7 +491,7 @@ impl WalIngest {
create: DbaseCreate,
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
) -> Result<(), WalIngestError> {
let DbaseCreate {
db_id,
tablespace_id,
@@ -505,7 +594,7 @@ impl WalIngest {
dbase_drop: DbaseDrop,
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
) -> Result<(), WalIngestError> {
let DbaseDrop {
db_id,
tablespace_ids,
@@ -523,7 +612,7 @@ impl WalIngest {
create: SmgrCreate,
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
) -> Result<(), WalIngestError> {
let SmgrCreate { rel } = create;
self.put_rel_creation(modification, rel, ctx).await?;
Ok(())
@@ -537,7 +626,7 @@ impl WalIngest {
truncate: XlSmgrTruncate,
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
) -> Result<(), WalIngestError> {
let XlSmgrTruncate {
blkno,
rnode,
@@ -689,7 +778,7 @@ impl WalIngest {
record: XactRecord,
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
) -> Result<(), WalIngestError> {
let (xact_common, is_commit, is_prepared) = match record {
XactRecord::Prepare(XactPrepare { xl_xid, data }) => {
let xid: u64 = if modification.tline.pg_version >= 17 {
@@ -813,7 +902,7 @@ impl WalIngest {
truncate: ClogTruncate,
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
) -> Result<(), WalIngestError> {
let ClogTruncate {
pageno,
oldest_xid,
@@ -889,7 +978,7 @@ impl WalIngest {
zero_page: ClogZeroPage,
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
) -> Result<(), WalIngestError> {
let ClogZeroPage { segno, rpageno } = zero_page;
self.put_slru_page_image(
@@ -907,7 +996,7 @@ impl WalIngest {
&mut self,
modification: &mut DatadirModification,
xlrec: &XlMultiXactCreate,
) -> Result<()> {
) -> Result<(), WalIngestError> {
// Create WAL record for updating the multixact-offsets page
let pageno = xlrec.mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
@@ -1010,7 +1099,7 @@ impl WalIngest {
modification: &mut DatadirModification<'_>,
xlrec: &XlMultiXactTruncate,
ctx: &RequestContext,
) -> Result<()> {
) -> Result<(), WalIngestError> {
let (maxsegment, startsegment, endsegment) =
enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
cp.oldestMulti = xlrec.end_trunc_off;
@@ -1058,7 +1147,7 @@ impl WalIngest {
zero_page: MultiXactZeroPage,
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
) -> Result<()> {
) -> Result<(), WalIngestError> {
let MultiXactZeroPage {
slru_kind,
segno,
@@ -1080,7 +1169,7 @@ impl WalIngest {
update: RelmapUpdate,
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
) -> Result<()> {
) -> Result<(), WalIngestError> {
let RelmapUpdate { update, buf } = update;
modification
@@ -1093,7 +1182,7 @@ impl WalIngest {
raw_record: RawXlogRecord,
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
) -> Result<()> {
) -> Result<(), WalIngestError> {
let RawXlogRecord { info, lsn, mut buf } = raw_record;
let pg_version = modification.tline.pg_version;
@@ -1235,12 +1324,12 @@ impl WalIngest {
put: PutLogicalMessage,
modification: &mut DatadirModification<'_>,
ctx: &RequestContext,
) -> Result<()> {
) -> Result<(), WalIngestError> {
let PutLogicalMessage { path, buf } = put;
modification.put_file(path.as_str(), &buf, ctx).await
}
fn ingest_standby_record(&mut self, record: StandbyRecord) -> Result<()> {
fn ingest_standby_record(&mut self, record: StandbyRecord) -> Result<(), WalIngestError> {
match record {
StandbyRecord::RunningXacts(running_xacts) => {
enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
@@ -1258,7 +1347,7 @@ impl WalIngest {
&mut self,
record: ReploriginRecord,
modification: &mut DatadirModification<'_>,
) -> Result<()> {
) -> Result<(), WalIngestError> {
match record {
ReploriginRecord::Set(set) => {
modification
@@ -1278,7 +1367,7 @@ impl WalIngest {
modification: &mut DatadirModification<'_>,
rel: RelTag,
ctx: &RequestContext,
) -> Result<()> {
) -> Result<(), WalIngestError> {
modification.put_rel_creation(rel, 0, ctx).await?;
Ok(())
}
@@ -1291,7 +1380,7 @@ impl WalIngest {
blknum: BlockNumber,
img: Bytes,
ctx: &RequestContext,
) -> Result<(), PageReconstructError> {
) -> Result<(), WalIngestError> {
self.handle_rel_extend(modification, rel, blknum, ctx)
.await?;
modification.put_rel_page_image(rel, blknum, img)?;
@@ -1305,7 +1394,7 @@ impl WalIngest {
blknum: BlockNumber,
rec: NeonWalRecord,
ctx: &RequestContext,
) -> Result<()> {
) -> Result<(), WalIngestError> {
self.handle_rel_extend(modification, rel, blknum, ctx)
.await?;
modification.put_rel_wal_record(rel, blknum, rec)?;
@@ -1318,7 +1407,7 @@ impl WalIngest {
rel: RelTag,
nblocks: BlockNumber,
ctx: &RequestContext,
) -> anyhow::Result<()> {
) -> Result<(), WalIngestError> {
modification.put_rel_truncation(rel, nblocks, ctx).await?;
Ok(())
}
@@ -1329,7 +1418,7 @@ impl WalIngest {
rel: RelTag,
blknum: BlockNumber,
ctx: &RequestContext,
) -> Result<(), PageReconstructError> {
) -> Result<(), WalIngestError> {
let new_nblocks = blknum + 1;
// Check if the relation exists. We implicitly create relations on first
// record.
@@ -1423,7 +1512,7 @@ impl WalIngest {
blknum: BlockNumber,
img: Bytes,
ctx: &RequestContext,
) -> Result<()> {
) -> Result<(), WalIngestError> {
if !self.shard.is_shard_zero() {
return Ok(());
}
@@ -1441,7 +1530,7 @@ impl WalIngest {
segno: u32,
blknum: BlockNumber,
ctx: &RequestContext,
) -> anyhow::Result<()> {
) -> Result<(), WalIngestError> {
// we don't use a cache for this like we do for relations. SLRUS are explcitly
// extended with ZEROPAGE records, not with commit records, so it happens
// a lot less frequently.
@@ -1509,6 +1598,7 @@ async fn get_relsize(
#[allow(clippy::bool_assert_comparison)]
#[cfg(test)]
mod tests {
use anyhow::Result;
use postgres_ffi::RELSEG_SIZE;
use super::*;
@@ -1530,7 +1620,7 @@ mod tests {
}
#[tokio::test]
async fn test_zeroed_checkpoint_decodes_correctly() -> Result<()> {
async fn test_zeroed_checkpoint_decodes_correctly() -> Result<(), anyhow::Error> {
for i in 14..=16 {
dispatch_pgversion!(i, {
pgv::CheckPoint::decode(&pgv::ZERO_CHECKPOINT)?;

View File

@@ -65,6 +65,9 @@ static const struct config_enum_entry neon_compute_modes[] = {
/* GUCs */
char *neon_timeline;
char *neon_tenant;
char *neon_project_id;
char *neon_branch_id;
char *neon_endpoint_id;
int32 max_cluster_size;
char *page_server_connstring;
char *neon_auth_token;
@@ -1352,6 +1355,31 @@ pg_init_libpagestore(void)
0, /* no flags required */
check_neon_id, NULL, NULL);
DefineCustomStringVariable("neon.project_id",
"Neon project_id the server is running on",
NULL,
&neon_project_id,
"",
PGC_POSTMASTER,
0, /* no flags required */
check_neon_id, NULL, NULL);
DefineCustomStringVariable("neon.branch_id",
"Neon branch_id the server is running on",
NULL,
&neon_branch_id,
"",
PGC_POSTMASTER,
0, /* no flags required */
check_neon_id, NULL, NULL);
DefineCustomStringVariable("neon.endpoint_id",
"Neon endpoint_id the server is running on",
NULL,
&neon_endpoint_id,
"",
PGC_POSTMASTER,
0, /* no flags required */
check_neon_id, NULL, NULL);
DefineCustomIntVariable("neon.stripe_size",
"sharding stripe size",
NULL,

View File

@@ -99,6 +99,9 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
wp->config = config;
wp->api = api;
wp->state = WPS_COLLECTING_TERMS;
wp->mconf.generation = INVALID_GENERATION;
wp->mconf.members.len = 0;
wp->mconf.new_members.len = 0;
wp_log(LOG, "neon.safekeepers=%s", wp->config->safekeepers_list);
@@ -170,6 +173,8 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
if (wp->config->proto_version != 2 && wp->config->proto_version != 3)
wp_log(FATAL, "unsupported safekeeper protocol version %d", wp->config->proto_version);
if (wp->safekeepers_generation > INVALID_GENERATION && wp->config->proto_version < 3)
wp_log(FATAL, "enabling generations requires protocol version 3");
wp_log(LOG, "using safekeeper protocol version %d", wp->config->proto_version);
/* Fill the greeting package */
@@ -214,7 +219,7 @@ WalProposerFree(WalProposer *wp)
static bool
WalProposerGenerationsEnabled(WalProposer *wp)
{
return wp->safekeepers_generation != 0;
return wp->safekeepers_generation != INVALID_GENERATION;
}
/*
@@ -723,13 +728,176 @@ SendProposerGreeting(Safekeeper *sk)
BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_HANDSHAKE_RECV);
}
/*
* Assuming `sk` sent its node id, find such member(s) in wp->mconf and set ptr in
* members_safekeepers & new_members_safekeepers to sk.
*/
static void
UpdateMemberSafekeeperPtr(WalProposer *wp, Safekeeper *sk)
{
/* members_safekeepers etc are fixed size, sanity check mconf size */
if (wp->mconf.members.len > MAX_SAFEKEEPERS)
wp_log(FATAL, "too many members %d in mconf", wp->mconf.members.len);
if (wp->mconf.new_members.len > MAX_SAFEKEEPERS)
wp_log(FATAL, "too many new_members %d in mconf", wp->mconf.new_members.len);
/* node id is not known until greeting is received */
if (sk->state < SS_WAIT_VOTING)
return;
/* 0 is assumed to be invalid node id, should never happen */
if (sk->greetResponse.nodeId == 0)
{
wp_log(WARNING, "safekeeper %s:%s sent zero node id", sk->host, sk->port);
return;
}
for (uint32 i = 0; i < wp->mconf.members.len; i++)
{
SafekeeperId *sk_id = &wp->mconf.members.m[i];
if (wp->mconf.members.m[i].node_id == sk->greetResponse.nodeId)
{
/*
* If mconf or list of safekeepers to connect to changed (the
* latter always currently goes through restart though),
* ResetMemberSafekeeperPtrs is expected to be called before
* UpdateMemberSafekeeperPtr. So, other value suggests that we are
* connected to the same sk under different host name, complain
* about that.
*/
if (wp->members_safekeepers[i] != NULL && wp->members_safekeepers[i] != sk)
{
wp_log(WARNING, "safekeeper {id = %lu, ep = %s:%u } in members[%u] is already mapped to connection slot %lu",
sk_id->node_id, sk_id->host, sk_id->port, i, wp->members_safekeepers[i] - wp->safekeeper);
}
wp_log(LOG, "safekeeper {id = %lu, ep = %s:%u } in members[%u] mapped to connection slot %lu",
sk_id->node_id, sk_id->host, sk_id->port, i, sk - wp->safekeeper);
wp->members_safekeepers[i] = sk;
}
}
/* repeat for new_members */
for (uint32 i = 0; i < wp->mconf.new_members.len; i++)
{
SafekeeperId *sk_id = &wp->mconf.new_members.m[i];
if (wp->mconf.new_members.m[i].node_id == sk->greetResponse.nodeId)
{
if (wp->new_members_safekeepers[i] != NULL && wp->new_members_safekeepers[i] != sk)
{
wp_log(WARNING, "safekeeper {id = %lu, ep = %s:%u } in new_members[%u] is already mapped to connection slot %lu",
sk_id->node_id, sk_id->host, sk_id->port, i, wp->new_members_safekeepers[i] - wp->safekeeper);
}
wp_log(LOG, "safekeeper {id = %lu, ep = %s:%u } in new_members[%u] mapped to connection slot %lu",
sk_id->node_id, sk_id->host, sk_id->port, i, sk - wp->safekeeper);
wp->new_members_safekeepers[i] = sk;
}
}
}
/*
* Reset wp->members_safekeepers & new_members_safekeepers and refill them.
* Called after wp changes mconf.
*/
static void
ResetMemberSafekeeperPtrs(WalProposer *wp)
{
memset(&wp->members_safekeepers, 0, sizeof(Safekeeper *) * MAX_SAFEKEEPERS);
memset(&wp->new_members_safekeepers, 0, sizeof(Safekeeper *) * MAX_SAFEKEEPERS);
for (int i = 0; i < wp->n_safekeepers; i++)
{
if (wp->safekeeper[i].state >= SS_WAIT_VOTING)
UpdateMemberSafekeeperPtr(wp, &wp->safekeeper[i]);
}
}
static uint32
MsetQuorum(MemberSet *mset)
{
Assert(mset->len > 0);
return mset->len / 2 + 1;
}
/* Does n forms quorum in mset? */
static bool
MsetHasQuorum(MemberSet *mset, uint32 n)
{
return n >= MsetQuorum(mset);
}
/*
* TermsCollected helper for a single member set `mset`.
*
* `msk` is the member -> safekeeper mapping for mset, i.e. members_safekeepers
* or new_members_safekeepers.
*/
static bool
TermsCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInfo s)
{
uint32 n_greeted = 0;
for (uint32 i = 0; i < wp->mconf.members.len; i++)
{
Safekeeper *sk = msk[i];
if (sk != NULL && sk->state == SS_WAIT_VOTING)
{
if (n_greeted > 0)
appendStringInfoString(s, ", ");
appendStringInfo(s, "{id = %lu, ep = %s:%s}", sk->greetResponse.nodeId, sk->host, sk->port);
n_greeted++;
}
}
appendStringInfo(s, ", %u/%u total", n_greeted, mset->len);
return MsetHasQuorum(mset, n_greeted);
}
/*
* Have we received greeting from enough (quorum) safekeepers to start voting?
*/
static bool
TermsCollected(WalProposer *wp)
{
return wp->n_connected >= wp->quorum;
StringInfoData s; /* str for logging */
bool collected = false;
/* legacy: generations disabled */
if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION)
{
collected = wp->n_connected >= wp->quorum;
if (collected)
{
wp->propTerm++;
wp_log(LOG, "walproposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT ", starting voting", wp->quorum, wp->propTerm);
}
return collected;
}
/*
* With generations enabled, we start campaign only when 1) some mconf is
* actually received 2) we have greetings from majority of members as well
* as from majority of new_members if it exists.
*/
if (wp->mconf.generation == INVALID_GENERATION)
return false;
initStringInfo(&s);
appendStringInfoString(&s, "mset greeters: ");
if (!TermsCollectedMset(wp, &wp->mconf.members, wp->members_safekeepers, &s))
goto res;
if (wp->mconf.new_members.len > 0)
{
appendStringInfoString(&s, ", new_mset greeters: ");
if (!TermsCollectedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers, &s))
goto res;
}
wp->propTerm++;
wp_log(LOG, "walproposer connected to quorum of safekeepers: %s, propTerm=" INT64_FORMAT ", starting voting", s.data, wp->propTerm);
collected = true;
res:
pfree(s.data);
return collected;
}
static void
@@ -753,13 +921,41 @@ RecvAcceptorGreeting(Safekeeper *sk)
pfree(mconf_toml);
/*
* Adopt mconf of safekeepers if it is higher. TODO: mconf change should
* restart wp if it started voting.
* Adopt mconf of safekeepers if it is higher.
*/
if (sk->greetResponse.mconf.generation > wp->mconf.generation)
{
/* sanity check before adopting, should never happen */
if (sk->greetResponse.mconf.members.len == 0)
{
wp_log(FATAL, "mconf %u has zero members", sk->greetResponse.mconf.generation);
}
/*
* If we at least started campaign, restart wp to get elected in the
* new mconf. Note: in principle once wp is already elected
* re-election is not required, but being conservative here is not
* bad.
*
* TODO: put mconf to shmem to immediately pick it up on start,
* otherwise if some safekeeper(s) misses latest mconf and gets
* connected the first, it may cause redundant restarts here.
*
* More generally, it would be nice to restart walproposer (wiping
* election state) without restarting the process. In particular, that
* would allow sync-safekeepers not to die here if it intersected with
* sk migration (as well as remove 1s delay).
*
* Note that assign_neon_safekeepers also currently restarts the
* process, so during normal migration walproposer may restart twice.
*/
if (wp->state >= WPS_CAMPAIGN)
{
wp_log(FATAL, "restarting to adopt mconf generation %d", sk->greetResponse.mconf.generation);
}
MembershipConfigurationFree(&wp->mconf);
MembershipConfigurationCopy(&sk->greetResponse.mconf, &wp->mconf);
ResetMemberSafekeeperPtrs(wp);
/* full conf was just logged above */
wp_log(LOG, "changed mconf to generation %u", wp->mconf.generation);
}
@@ -767,6 +963,9 @@ RecvAcceptorGreeting(Safekeeper *sk)
/* Protocol is all good, move to voting. */
sk->state = SS_WAIT_VOTING;
/* In greeting safekeeper sent its id; update mappings accordingly. */
UpdateMemberSafekeeperPtr(wp, sk);
/*
* Note: it would be better to track the counter on per safekeeper basis,
* but at worst walproposer would restart with 'term rejected', so leave
@@ -778,12 +977,9 @@ RecvAcceptorGreeting(Safekeeper *sk)
/* We're still collecting terms from the majority. */
wp->propTerm = Max(sk->greetResponse.term, wp->propTerm);
/* Quorum is acquried, prepare the vote request. */
/* Quorum is acquired, prepare the vote request. */
if (TermsCollected(wp))
{
wp->propTerm++;
wp_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm);
wp->state = WPS_CAMPAIGN;
wp->voteRequest.pam.tag = 'v';
wp->voteRequest.generation = wp->mconf.generation;
@@ -832,8 +1028,8 @@ SendVoteRequest(Safekeeper *sk)
&sk->outbuf, wp->config->proto_version);
/* We have quorum for voting, send our vote request */
wp_log(LOG, "requesting vote from %s:%s for generation %u term " UINT64_FORMAT, sk->host, sk->port,
wp->voteRequest.generation, wp->voteRequest.term);
wp_log(LOG, "requesting vote from sk {id = %lu, ep = %s:%s} for generation %u term " UINT64_FORMAT,
sk->greetResponse.nodeId, sk->host, sk->port, wp->voteRequest.generation, wp->voteRequest.term);
/* On failure, logging & resetting is handled */
BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_WAIT_VERDICT);
/* If successful, wait for read-ready with SS_WAIT_VERDICT */
@@ -851,8 +1047,8 @@ RecvVoteResponse(Safekeeper *sk)
return;
wp_log(LOG,
"got VoteResponse from acceptor %s:%s, generation=%u, term=%lu, voteGiven=%u, last_log_term=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X",
sk->host, sk->port, sk->voteResponse.generation, sk->voteResponse.term,
"got VoteResponse from sk {id = %lu, ep = %s:%s}, generation=%u, term=%lu, voteGiven=%u, last_log_term=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X",
sk->greetResponse.nodeId, sk->host, sk->port, sk->voteResponse.generation, sk->voteResponse.term,
sk->voteResponse.voteGiven,
GetHighestTerm(&sk->voteResponse.termHistory),
LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
@@ -899,6 +1095,53 @@ RecvVoteResponse(Safekeeper *sk)
}
}
/*
* VotesCollected helper for a single member set `mset`.
*
* `msk` is the member -> safekeeper mapping for mset, i.e. members_safekeepers
* or new_members_safekeepers.
*/
static bool
VotesCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInfo s)
{
uint32 n_votes = 0;
for (uint32 i = 0; i < wp->mconf.members.len; i++)
{
Safekeeper *sk = msk[i];
if (sk != NULL && sk->state == SS_WAIT_ELECTED)
{
Assert(sk->voteResponse.voteGiven);
/*
* Find the highest vote. NULL check is for the legacy case where
* safekeeper might be not initialized with LSN at all and return
* 0 LSN in the vote response; we still want to set donor to
* something in this case.
*/
if (GetLastLogTerm(sk) > wp->donorLastLogTerm ||
(GetLastLogTerm(sk) == wp->donorLastLogTerm &&
sk->voteResponse.flushLsn > wp->propTermStartLsn) ||
wp->donor == NULL)
{
wp->donorLastLogTerm = GetLastLogTerm(sk);
wp->propTermStartLsn = sk->voteResponse.flushLsn;
wp->donor = sk;
}
wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn);
if (n_votes > 0)
appendStringInfoString(s, ", ");
appendStringInfo(s, "{id = %lu, ep = %s:%s}", sk->greetResponse.nodeId, sk->host, sk->port);
n_votes++;
}
}
appendStringInfo(s, ", %u/%u total", n_votes, mset->len);
return MsetHasQuorum(mset, n_votes);
}
/*
* Checks if enough votes has been collected to get elected and if that's the
* case finds the highest vote, setting donor, donorLastLogTerm,
@@ -907,7 +1150,8 @@ RecvVoteResponse(Safekeeper *sk)
static bool
VotesCollected(WalProposer *wp)
{
int n_ready = 0;
StringInfoData s; /* str for logging */
bool collected = false;
/* assumed to be called only when not elected yet */
Assert(wp->state == WPS_CAMPAIGN);
@@ -916,25 +1160,62 @@ VotesCollected(WalProposer *wp)
wp->donorLastLogTerm = 0;
wp->truncateLsn = InvalidXLogRecPtr;
for (int i = 0; i < wp->n_safekeepers; i++)
/* legacy: generations disabled */
if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION)
{
if (wp->safekeeper[i].state == SS_WAIT_ELECTED)
{
n_ready++;
int n_ready = 0;
if (GetLastLogTerm(&wp->safekeeper[i]) > wp->donorLastLogTerm ||
(GetLastLogTerm(&wp->safekeeper[i]) == wp->donorLastLogTerm &&
wp->safekeeper[i].voteResponse.flushLsn > wp->propTermStartLsn))
for (int i = 0; i < wp->n_safekeepers; i++)
{
if (wp->safekeeper[i].state == SS_WAIT_ELECTED)
{
wp->donorLastLogTerm = GetLastLogTerm(&wp->safekeeper[i]);
wp->propTermStartLsn = wp->safekeeper[i].voteResponse.flushLsn;
wp->donor = i;
n_ready++;
if (GetLastLogTerm(&wp->safekeeper[i]) > wp->donorLastLogTerm ||
(GetLastLogTerm(&wp->safekeeper[i]) == wp->donorLastLogTerm &&
wp->safekeeper[i].voteResponse.flushLsn > wp->propTermStartLsn) ||
wp->donor == NULL)
{
wp->donorLastLogTerm = GetLastLogTerm(&wp->safekeeper[i]);
wp->propTermStartLsn = wp->safekeeper[i].voteResponse.flushLsn;
wp->donor = &wp->safekeeper[i];
}
wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn);
}
wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn);
}
collected = n_ready >= wp->quorum;
if (collected)
{
wp_log(LOG, "walproposer elected with %d/%d votes", n_ready, wp->n_safekeepers);
}
return collected;
}
return n_ready >= wp->quorum;
/*
* if generations are enabled we're expected to get to voting only when
* mconf is established.
*/
Assert(wp->mconf.generation != INVALID_GENERATION);
/*
* We must get votes from both msets if both are present.
*/
initStringInfo(&s);
appendStringInfoString(&s, "mset voters: ");
if (!VotesCollectedMset(wp, &wp->mconf.members, wp->members_safekeepers, &s))
goto res;
if (wp->mconf.new_members.len > 0)
{
appendStringInfoString(&s, ", new_mset voters: ");
if (!VotesCollectedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers, &s))
goto res;
}
wp_log(LOG, "walproposer elected, %s", s.data);
collected = true;
res:
pfree(s.data);
return collected;
}
/*
@@ -955,7 +1236,7 @@ HandleElectedProposer(WalProposer *wp)
* that only for logical replication (and switching logical walsenders to
* neon_walreader is a todo.)
*/
if (!wp->api.recovery_download(wp, &wp->safekeeper[wp->donor]))
if (!wp->api.recovery_download(wp, wp->donor))
{
wp_log(FATAL, "failed to download WAL for logical replicaiton");
}
@@ -1078,7 +1359,7 @@ ProcessPropStartPos(WalProposer *wp)
/*
* Proposer's term history is the donor's + its own entry.
*/
dth = &wp->safekeeper[wp->donor].voteResponse.termHistory;
dth = &wp->donor->voteResponse.termHistory;
wp->propTermHistory.n_entries = dth->n_entries + 1;
wp->propTermHistory.entries = palloc(sizeof(TermSwitchEntry) * wp->propTermHistory.n_entries);
if (dth->n_entries > 0)
@@ -1086,11 +1367,10 @@ ProcessPropStartPos(WalProposer *wp)
wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].term = wp->propTerm;
wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propTermStartLsn;
wp_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
wp->quorum,
wp_log(LOG, "walproposer elected in term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
wp->propTerm,
LSN_FORMAT_ARGS(wp->propTermStartLsn),
wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port,
wp->donor->host, wp->donor->port,
LSN_FORMAT_ARGS(wp->truncateLsn));
/*
@@ -1508,6 +1788,14 @@ RecvAppendResponses(Safekeeper *sk)
readAnything = true;
/* should never happen: sk is expected to send ERROR instead */
if (sk->appendResponse.generation != wp->mconf.generation)
{
wp_log(FATAL, "safekeeper {id = %lu, ep = %s:%s} sent response with generation %u, expected %u",
sk->greetResponse.nodeId, sk->host, sk->port,
sk->appendResponse.generation, wp->mconf.generation);
}
if (sk->appendResponse.term > wp->propTerm)
{
/*
@@ -1624,30 +1912,101 @@ CalculateMinFlushLsn(WalProposer *wp)
}
/*
* Calculate WAL position acknowledged by quorum
* GetAcknowledgedByQuorumWALPosition for a single member set `mset`.
*
* `msk` is the member -> safekeeper mapping for mset, i.e. members_safekeepers
* or new_members_safekeepers.
*/
static XLogRecPtr
GetAcknowledgedByQuorumWALPosition(WalProposer *wp)
GetCommittedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk)
{
XLogRecPtr responses[MAX_SAFEKEEPERS];
/*
* Sort acknowledged LSNs
* Ascending sort acknowledged LSNs.
*/
for (int i = 0; i < wp->n_safekeepers; i++)
Assert(mset->len <= MAX_SAFEKEEPERS);
for (uint32 i = 0; i < mset->len; i++)
{
Safekeeper *sk = msk[i];
/*
* Like in Raft, we aren't allowed to commit entries from previous
* terms, so ignore reported LSN until it gets to epochStartLsn.
* terms, so ignore reported LSN until it gets to propTermStartLsn.
*
* Note: we ignore sk state, which is ok: before first ack flushLsn is
* 0, and later we just preserve value across reconnections. It would
* be ok to check for SS_ACTIVE as well.
*/
responses[i] = wp->safekeeper[i].appendResponse.flushLsn >= wp->propTermStartLsn ? wp->safekeeper[i].appendResponse.flushLsn : 0;
if (sk != NULL && sk->appendResponse.flushLsn >= wp->propTermStartLsn)
{
responses[i] = sk->appendResponse.flushLsn;
}
else
{
responses[i] = 0;
}
}
qsort(responses, wp->n_safekeepers, sizeof(XLogRecPtr), CompareLsn);
qsort(responses, mset->len, sizeof(XLogRecPtr), CompareLsn);
/*
* Get the smallest LSN committed by quorum
* And get value committed by the quorum. A way to view this: to get the
* highest value committed on the quorum, in the ordered array we skip n -
* n_quorum elements to get to the first (lowest) value present on all sks
* of the highest quorum.
*/
return responses[wp->n_safekeepers - wp->quorum];
return responses[mset->len - MsetQuorum(mset)];
}
/*
* Calculate WAL position acknowledged by quorum, i.e. which may be regarded
* committed.
*
* Zero may be returned when there is no quorum of nodes recovered to term start
* lsn which sent feedback yet.
*/
static XLogRecPtr
GetAcknowledgedByQuorumWALPosition(WalProposer *wp)
{
XLogRecPtr committed;
/* legacy: generations disabled */
if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION)
{
XLogRecPtr responses[MAX_SAFEKEEPERS];
/*
* Sort acknowledged LSNs
*/
for (int i = 0; i < wp->n_safekeepers; i++)
{
/*
* Like in Raft, we aren't allowed to commit entries from previous
* terms, so ignore reported LSN until it gets to
* propTermStartLsn.
*
* Note: we ignore sk state, which is ok: before first ack
* flushLsn is 0, and later we just preserve value across
* reconnections. It would be ok to check for SS_ACTIVE as well.
*/
responses[i] = wp->safekeeper[i].appendResponse.flushLsn >= wp->propTermStartLsn ? wp->safekeeper[i].appendResponse.flushLsn : 0;
}
qsort(responses, wp->n_safekeepers, sizeof(XLogRecPtr), CompareLsn);
/*
* Get the smallest LSN committed by quorum
*/
return responses[wp->n_safekeepers - wp->quorum];
}
committed = GetCommittedMset(wp, &wp->mconf.members, wp->members_safekeepers);
if (wp->mconf.new_members.len > 0)
{
XLogRecPtr new_mset_committed = GetCommittedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers);
committed = Min(committed, new_mset_committed);
}
return committed;
}
/*
@@ -1662,7 +2021,7 @@ UpdateDonorShmem(WalProposer *wp)
int i;
XLogRecPtr donor_lsn = InvalidXLogRecPtr;
if (wp->n_votes < wp->quorum)
if (wp->state < WPS_ELECTED)
{
wp_log(WARNING, "UpdateDonorShmem called before elections are won");
return;
@@ -1673,9 +2032,9 @@ UpdateDonorShmem(WalProposer *wp)
* about its position immediately after election before any feedbacks are
* sent.
*/
if (wp->safekeeper[wp->donor].state >= SS_WAIT_ELECTED)
if (wp->donor->state >= SS_WAIT_ELECTED)
{
donor = &wp->safekeeper[wp->donor];
donor = wp->donor;
donor_lsn = wp->propTermStartLsn;
}
@@ -1746,13 +2105,13 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk)
}
/*
* Generally sync is done when majority switched the epoch so we committed
* epochStartLsn and made the majority aware of it, ensuring they are
* ready to give all WAL to pageserver. It would mean whichever majority
* is alive, there will be at least one safekeeper who is able to stream
* WAL to pageserver to make basebackup possible. However, since at the
* moment we don't have any good mechanism of defining the healthy and
* most advanced safekeeper who should push the wal into pageserver and
* Generally sync is done when majority reached propTermStartLsn so we
* committed it and made the majority aware of it, ensuring they are ready
* to give all WAL to pageserver. It would mean whichever majority is
* alive, there will be at least one safekeeper who is able to stream WAL
* to pageserver to make basebackup possible. However, since at the moment
* we don't have any good mechanism of defining the healthy and most
* advanced safekeeper who should push the wal into pageserver and
* basically the random one gets connected, to prevent hanging basebackup
* (due to pageserver connecting to not-synced-safekeeper) we currently
* wait for all seemingly alive safekeepers to get synced.
@@ -1774,7 +2133,7 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk)
n_synced++;
}
if (n_synced >= wp->quorum)
if (newCommitLsn >= wp->propTermStartLsn)
{
/* A quorum of safekeepers has been synced! */

View File

@@ -145,6 +145,7 @@ typedef uint64 NNodeId;
* This and following structs pair ones in membership.rs.
*/
typedef uint32 Generation;
#define INVALID_GENERATION 0
typedef struct SafekeeperId
{
@@ -771,7 +772,17 @@ typedef struct WalProposer
/* Current walproposer membership configuration */
MembershipConfiguration mconf;
/* (n_safekeepers / 2) + 1 */
/*
* Parallels mconf.members with pointers to the member's slot in
* safekeepers array of connections, or NULL if such member is not
* connected. Helps to avoid looking slot per id through all
* .safekeepers[] when doing quorum checks.
*/
Safekeeper *members_safekeepers[MAX_SAFEKEEPERS];
/* As above, but for new_members. */
Safekeeper *new_members_safekeepers[MAX_SAFEKEEPERS];
/* (n_safekeepers / 2) + 1. Used for static pre-generations quorum checks. */
int quorum;
/*
@@ -829,7 +840,7 @@ typedef struct WalProposer
term_t donorLastLogTerm;
/* Most advanced acceptor */
int donor;
Safekeeper *donor;
/* timeline globally starts at this LSN */
XLogRecPtr timelineStartLsn;

View File

@@ -31,6 +31,7 @@ pub async fn task_main_https(
global_timelines: Arc<GlobalTimelines>,
) -> anyhow::Result<()> {
let cert_resolver = ReloadingCertificateResolver::new(
"main",
&conf.ssl_key_file,
&conf.ssl_cert_file,
conf.ssl_cert_reload_period,

View File

@@ -22,6 +22,7 @@ use pageserver_api::controller_api::{
MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse,
NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, SafekeeperSchedulingPolicyRequest,
ShardsPreferredAzsRequest, TenantCreateRequest, TenantPolicyRequest, TenantShardMigrateRequest,
TimelineImportRequest,
};
use pageserver_api::models::{
DetachBehavior, LsnLeaseRequest, TenantConfigPatchRequest, TenantConfigRequest,
@@ -1235,8 +1236,18 @@ async fn handle_step_down(req: Request<Body>) -> Result<Response<Body>, ApiError
ForwardOutcome::NotForwarded(req) => req,
};
let state = get_state(&req);
json_response(StatusCode::OK, state.service.step_down().await)
// Spawn a background task: once we start stepping down, we must finish: if the client drops
// their request we should avoid stopping in some part-stepped-down state.
let handle = tokio::spawn(async move {
let state = get_state(&req);
state.service.step_down().await
});
let result = handle
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
json_response(StatusCode::OK, result)
}
async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -1276,6 +1287,37 @@ async fn handle_tenant_import(req: Request<Body>) -> Result<Response<Body>, ApiE
)
}
async fn handle_timeline_import(req: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
check_permissions(&req, Scope::PageServerApi)?;
maybe_rate_limit(&req, tenant_id).await;
let mut req = match maybe_forward(req).await {
ForwardOutcome::Forwarded(res) => {
return res;
}
ForwardOutcome::NotForwarded(req) => req,
};
let import_req = json_request::<TimelineImportRequest>(&mut req).await?;
let state = get_state(&req);
if import_req.tenant_id != tenant_id || import_req.timeline_id != timeline_id {
return Err(ApiError::BadRequest(anyhow::anyhow!(
"tenant id or timeline id mismatch: url={tenant_id}/{timeline_id}, body={}/{}",
import_req.tenant_id,
import_req.timeline_id
)));
}
json_response(
StatusCode::OK,
state.service.timeline_import(import_req).await?,
)
}
async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
@@ -1949,6 +1991,16 @@ pub fn make_router(
RequestName("debug_v1_tenant_locate"),
)
})
.post(
"/debug/v1/tenant/:tenant_id/timeline/:timeline_id/import",
|r| {
named_request_span(
r,
handle_timeline_import,
RequestName("debug_v1_timeline_import"),
)
},
)
.get("/debug/v1/scheduler", |r| {
named_request_span(r, handle_scheduler_dump, RequestName("debug_v1_scheduler"))
})

View File

@@ -472,6 +472,7 @@ async fn async_main() -> anyhow::Result<()> {
let https_listener = tcp_listener::bind(https_addr)?;
let resolver = ReloadingCertificateResolver::new(
"main",
&args.ssl_key_file,
&args.ssl_cert_file,
*args.ssl_cert_reload_period,

View File

@@ -61,7 +61,7 @@ use utils::completion::Barrier;
use utils::generation::Generation;
use utils::id::{NodeId, TenantId, TimelineId};
use utils::lsn::Lsn;
use utils::sync::gate::Gate;
use utils::sync::gate::{Gate, GateGuard};
use utils::{failpoint_support, pausable_failpoint};
use crate::background_node_operations::{
@@ -594,6 +594,8 @@ struct TenantShardSplitAbort {
new_stripe_size: Option<ShardStripeSize>,
/// Until this abort op is complete, no other operations may be done on the tenant
_tenant_lock: TracingExclusiveGuard<TenantOperations>,
/// The reconciler gate for the duration of the split operation, and any included abort.
_gate: GateGuard,
}
#[derive(thiserror::Error, Debug)]
@@ -1460,7 +1462,7 @@ impl Service {
// Retry until shutdown: we must keep this request object alive until it is properly
// processed, as it holds a lock guard that prevents other operations trying to do things
// to the tenant while it is in a weird part-split state.
while !self.cancel.is_cancelled() {
while !self.reconcilers_cancel.is_cancelled() {
match self.abort_tenant_shard_split(&op).await {
Ok(_) => break,
Err(e) => {
@@ -1473,9 +1475,12 @@ impl Service {
// when we retry, so that the abort op will succeed. If the abort op is failing
// for some other reason, we will keep retrying forever, or until a human notices
// and does something about it (either fixing a pageserver or restarting the controller).
tokio::time::timeout(Duration::from_secs(5), self.cancel.cancelled())
.await
.ok();
tokio::time::timeout(
Duration::from_secs(5),
self.reconcilers_cancel.cancelled(),
)
.await
.ok();
}
}
}
@@ -1847,6 +1852,7 @@ impl Service {
};
if insert {
let config = attach_req.config.clone().unwrap_or_default();
let tsp = TenantShardPersistence {
tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(),
shard_number: attach_req.tenant_shard_id.shard_number.0 as i32,
@@ -1855,7 +1861,7 @@ impl Service {
generation: attach_req.generation_override.or(Some(0)),
generation_pageserver: None,
placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(),
config: serde_json::to_string(&TenantConfig::default()).unwrap(),
config: serde_json::to_string(&config).unwrap(),
splitting: SplitState::default(),
scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
.unwrap(),
@@ -1878,16 +1884,16 @@ impl Service {
Ok(()) => {
tracing::info!("Inserted shard {} in database", attach_req.tenant_shard_id);
let mut locked = self.inner.write().unwrap();
locked.tenants.insert(
let mut shard = TenantShard::new(
attach_req.tenant_shard_id,
TenantShard::new(
attach_req.tenant_shard_id,
ShardIdentity::unsharded(),
PlacementPolicy::Attached(0),
None,
),
ShardIdentity::unsharded(),
PlacementPolicy::Attached(0),
None,
);
shard.config = config;
let mut locked = self.inner.write().unwrap();
locked.tenants.insert(attach_req.tenant_shard_id, shard);
tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id);
}
}
@@ -1972,11 +1978,12 @@ impl Service {
.set_attached(scheduler, attach_req.node_id);
tracing::info!(
"attach_hook: tenant {} set generation {:?}, pageserver {}",
"attach_hook: tenant {} set generation {:?}, pageserver {}, config {:?}",
attach_req.tenant_shard_id,
tenant_shard.generation,
// TODO: this is an odd number of 0xf's
attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff)),
attach_req.config,
);
// Trick the reconciler into not doing anything for this tenant: this helps
@@ -4910,7 +4917,7 @@ impl Service {
1,
10,
Duration::from_secs(5),
&self.cancel,
&self.reconcilers_cancel,
)
.await
{
@@ -5161,6 +5168,11 @@ impl Service {
)
.await;
let _gate = self
.reconcilers_gate
.enter()
.map_err(|_| ApiError::ShuttingDown)?;
let new_shard_count = ShardCount::new(split_req.new_shard_count);
let new_stripe_size = split_req.new_stripe_size;
@@ -5188,6 +5200,7 @@ impl Service {
new_shard_count,
new_stripe_size,
_tenant_lock,
_gate,
})
// Ignore error sending: that just means we're shutting down: aborts are ephemeral so it's fine to drop it.
.ok();
@@ -5527,7 +5540,10 @@ impl Service {
"failpoint".to_string()
)));
failpoint_support::sleep_millis_async!("shard-split-post-remote-sleep", &self.cancel);
failpoint_support::sleep_millis_async!(
"shard-split-post-remote-sleep",
&self.reconcilers_cancel
);
tracing::info!(
"Split {} into {}",
@@ -5585,7 +5601,7 @@ impl Service {
stripe_size,
preferred_az: preferred_az_id.as_ref().map(Cow::Borrowed),
},
&self.cancel,
&self.reconcilers_cancel,
)
.await
{
@@ -8670,9 +8686,24 @@ impl Service {
failpoint_support::sleep_millis_async!("sleep-on-step-down-handling");
self.inner.write().unwrap().step_down();
// TODO: would it make sense to have a time-out for this?
self.stop_reconciliations(StopReconciliationsReason::SteppingDown)
.await;
// Wait for reconciliations to stop, or terminate this process if they
// fail to stop in time (this indicates a bug in shutdown)
tokio::select! {
_ = self.stop_reconciliations(StopReconciliationsReason::SteppingDown) => {
tracing::info!("Reconciliations stopped, proceeding with step down");
}
_ = async {
failpoint_support::sleep_millis_async!("step-down-delay-timeout");
tokio::time::sleep(Duration::from_secs(10)).await
} => {
tracing::warn!("Step down timed out while waiting for reconciliation gate, terminating process");
// The caller may proceed to act as leader when it sees this request fail: reduce the chance
// of a split-brain situation by terminating this controller instead of leaving it up in a partially-shut-down state.
std::process::exit(1);
}
}
let mut global_observed = GlobalObservedState::default();
let locked = self.inner.read().unwrap();

View File

@@ -12,13 +12,16 @@ use crate::persistence::{
use crate::safekeeper::Safekeeper;
use anyhow::Context;
use http_utils::error::ApiError;
use pageserver_api::controller_api::{SafekeeperDescribeResponse, SkSchedulingPolicy};
use pageserver_api::controller_api::{
SafekeeperDescribeResponse, SkSchedulingPolicy, TimelineImportRequest,
};
use pageserver_api::models::{self, SafekeeperInfo, SafekeepersInfo, TimelineInfo};
use safekeeper_api::membership::{MemberSet, SafekeeperId};
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
use utils::id::{NodeId, TenantId, TimelineId};
use utils::logging::SecretString;
use utils::lsn::Lsn;
use super::Service;
@@ -298,6 +301,31 @@ impl Service {
timeline_id,
})
}
/// Directly insert the timeline into the database without reconciling it with safekeepers.
///
/// Useful if the timeline already exists on the specified safekeepers,
/// but we want to make it storage controller managed.
pub(crate) async fn timeline_import(&self, req: TimelineImportRequest) -> Result<(), ApiError> {
let persistence = TimelinePersistence {
tenant_id: req.tenant_id.to_string(),
timeline_id: req.timeline_id.to_string(),
start_lsn: Lsn::INVALID.into(),
generation: 1,
sk_set: req.sk_set.iter().map(|sk_id| sk_id.0 as i64).collect(),
new_sk_set: None,
cplane_notified_generation: 1,
deleted_at: None,
};
let inserted = self.persistence.insert_timeline(persistence).await?;
if inserted {
tracing::info!("imported timeline into db");
} else {
tracing::info!("didn't import timeline into db, as it is already present in db");
}
Ok(())
}
/// Perform timeline deletion on safekeepers. Will return success: we persist the deletion into the reconciler.
pub(super) async fn tenant_timeline_delete_safekeepers(
self: &Arc<Self>,

View File

@@ -14,6 +14,7 @@ import threading
import time
import uuid
from collections import defaultdict
from collections.abc import Mapping
from contextlib import closing, contextmanager
from dataclasses import dataclass
from datetime import datetime
@@ -79,7 +80,12 @@ from fixtures.remote_storage import (
default_remote_storage,
remote_storage_to_toml_dict,
)
from fixtures.safekeeper.http import SafekeeperHttpClient
from fixtures.safekeeper.http import (
MembershipConfiguration,
SafekeeperHttpClient,
SafekeeperId,
TimelineCreateRequest,
)
from fixtures.safekeeper.utils import wait_walreceivers_absent
from fixtures.utils import (
ATTACHMENT_NAME_REGEX,
@@ -1980,10 +1986,13 @@ class NeonStorageController(MetricsGetter, LogUtils):
tenant_shard_id: TenantId | TenantShardId,
pageserver_id: int,
generation_override: int | None = None,
config: None | dict[str, Any] = None,
) -> int:
body = {"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id}
if generation_override is not None:
body["generation_override"] = generation_override
if config is not None:
body["config"] = config
response = self.request(
"POST",
@@ -2878,13 +2887,14 @@ class NeonPageserver(PgProtocol, LogUtils):
self,
immediate: bool = False,
timeout_in_seconds: int | None = None,
extra_env_vars: dict[str, str] | None = None,
):
"""
High level wrapper for restart: restarts the process, and waits for
tenant state to stabilize.
"""
self.stop(immediate=immediate)
self.start(timeout_in_seconds=timeout_in_seconds)
self.start(timeout_in_seconds=timeout_in_seconds, extra_env_vars=extra_env_vars)
self.quiesce_tenants()
def quiesce_tenants(self):
@@ -2973,11 +2983,12 @@ class NeonPageserver(PgProtocol, LogUtils):
to call into the pageserver HTTP client.
"""
client = self.http_client()
if generation is None:
generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id)
elif override_storage_controller_generation:
if generation is None or override_storage_controller_generation:
generation = self.env.storage_controller.attach_hook_issue(
tenant_id, self.id, generation
tenant_id,
self.id,
generation_override=generation if override_storage_controller_generation else None,
config=config,
)
return client.tenant_attach(
tenant_id,
@@ -4291,28 +4302,29 @@ class Endpoint(PgProtocol, LogUtils):
def respec_deep(self, **kwargs: Any) -> None:
"""
Update the endpoint.json file taking into account nested keys.
It does one level deep update. Should enough for most cases.
Distinct method from respec() to do not break existing functionality.
Update the spec.json file taking into account nested keys.
Distinct method from respec() to not break existing functionality.
NOTE: This method also updates the spec.json file, not endpoint.json.
We need it because neon_local also writes to spec.json, so intended
use-case is i) start endpoint with some config, ii) respec_deep(),
iii) call reconfigure() to apply the changes.
"""
def update(curr, patch):
for k, v in patch.items():
if isinstance(v, Mapping):
curr[k] = update(curr.get(k, {}), v)
else:
curr[k] = v
return curr
config_path = os.path.join(self.endpoint_path(), "spec.json")
with open(config_path) as f:
data_dict: dict[str, Any] = json.load(f)
log.debug("Current compute spec: %s", json.dumps(data_dict, indent=4))
for key, value in kwargs.items():
if isinstance(value, dict):
if key not in data_dict:
data_dict[key] = value
else:
data_dict[key] = {**data_dict[key], **value}
else:
data_dict[key] = value
update(data_dict, kwargs)
with open(config_path, "w") as file:
log.debug("Updating compute spec to: %s", json.dumps(data_dict, indent=4))
@@ -4839,6 +4851,50 @@ class Safekeeper(LogUtils):
wait_until(paused)
@staticmethod
def sks_to_safekeeper_ids(sks: list[Safekeeper]) -> list[SafekeeperId]:
return [SafekeeperId(sk.id, "localhost", sk.port.pg_tenant_only) for sk in sks]
@staticmethod
def mconf_sks(env: NeonEnv, mconf: MembershipConfiguration) -> list[Safekeeper]:
"""
List of Safekeepers which are members in `mconf`.
"""
members_ids = [m.id for m in mconf.members]
new_members_ids = [m.id for m in mconf.new_members] if mconf.new_members is not None else []
return [sk for sk in env.safekeepers if sk.id in members_ids or sk.id in new_members_ids]
@staticmethod
def create_timeline(
tenant_id: TenantId,
timeline_id: TimelineId,
ps: NeonPageserver,
mconf: MembershipConfiguration,
members_sks: list[Safekeeper],
):
"""
Manually create timeline on safekeepers with given (presumably inital)
mconf: figure out LSN from pageserver, bake request and execute it on
given safekeepers.
Normally done by storcon, but some tests want to do it manually so far.
"""
ps_http_cli = ps.http_client()
# figure out initial LSN.
ps_timeline_detail = ps_http_cli.timeline_detail(tenant_id, timeline_id)
init_lsn = ps_timeline_detail["last_record_lsn"]
log.info(f"initial LSN: {init_lsn}")
# sk timeline creation request expects minor version
pg_version = ps_timeline_detail["pg_version"] * 10000
# create inital mconf
create_r = TimelineCreateRequest(
tenant_id, timeline_id, mconf, pg_version, Lsn(init_lsn), commit_lsn=None
)
log.info(f"sending timeline create: {create_r.to_json()}")
for sk in members_sks:
sk.http_client().timeline_create(create_r)
class NeonBroker(LogUtils):
"""An object managing storage_broker instance"""

View File

@@ -65,13 +65,11 @@ def single_timeline(
assert ps_http.tenant_list() == []
def attach(tenant):
# NB: create the new tenant in the storage controller with the correct tenant config. This
# will pick up the existing tenant data from remote storage. If we just attach it to the
# Pageserver, the storage controller will reset the tenant config to the default.
env.create_tenant(
tenant_id=tenant,
timeline_id=template_timeline,
conf=template_config,
env.pageserver.tenant_attach(
tenant,
config=template_config,
generation=100,
override_storage_controller_generation=True,
)
with concurrent.futures.ThreadPoolExecutor(max_workers=22) as executor:

View File

@@ -25,7 +25,7 @@ class Walreceiver:
@dataclass
class SafekeeperTimelineStatus:
mconf: Configuration | None
mconf: MembershipConfiguration | None
term: int
last_log_term: int
pg_version: int # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2
@@ -78,17 +78,17 @@ class SafekeeperId:
@dataclass
class Configuration:
class MembershipConfiguration:
generation: int
members: list[SafekeeperId]
new_members: list[SafekeeperId] | None
@classmethod
def from_json(cls, d: dict[str, Any]) -> Configuration:
def from_json(cls, d: dict[str, Any]) -> MembershipConfiguration:
generation = d["generation"]
members = d["members"]
new_members = d.get("new_members")
return Configuration(generation, members, new_members)
return MembershipConfiguration(generation, members, new_members)
def to_json(self) -> str:
return json.dumps(self, cls=EnhancedJSONEncoder)
@@ -98,7 +98,7 @@ class Configuration:
class TimelineCreateRequest:
tenant_id: TenantId
timeline_id: TimelineId
mconf: Configuration
mconf: MembershipConfiguration
# not exactly PgVersion, for example 150002 for 15.2
pg_version: int
start_lsn: Lsn
@@ -110,13 +110,13 @@ class TimelineCreateRequest:
@dataclass
class TimelineMembershipSwitchResponse:
previous_conf: Configuration
current_conf: Configuration
previous_conf: MembershipConfiguration
current_conf: MembershipConfiguration
@classmethod
def from_json(cls, d: dict[str, Any]) -> TimelineMembershipSwitchResponse:
previous_conf = Configuration.from_json(d["previous_conf"])
current_conf = Configuration.from_json(d["current_conf"])
previous_conf = MembershipConfiguration.from_json(d["previous_conf"])
current_conf = MembershipConfiguration.from_json(d["current_conf"])
return TimelineMembershipSwitchResponse(previous_conf, current_conf)
@@ -194,7 +194,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
resj = res.json()
walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]]
# It is always normally not None, it is allowed only to make forward compat tests happy.
mconf = Configuration.from_json(resj["mconf"]) if "mconf" in resj else None
mconf = MembershipConfiguration.from_json(resj["mconf"]) if "mconf" in resj else None
return SafekeeperTimelineStatus(
mconf=mconf,
term=resj["acceptor_state"]["term"],
@@ -223,7 +223,9 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
return self.timeline_status(tenant_id, timeline_id).commit_lsn
# Get timeline membership configuration.
def get_membership(self, tenant_id: TenantId, timeline_id: TimelineId) -> Configuration:
def get_membership(
self, tenant_id: TenantId, timeline_id: TimelineId
) -> MembershipConfiguration:
# make mypy happy
return self.timeline_status(tenant_id, timeline_id).mconf # type: ignore
@@ -275,7 +277,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
return res_json
def timeline_exclude(
self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration
self, tenant_id: TenantId, timeline_id: TimelineId, to: MembershipConfiguration
) -> dict[str, Any]:
res = self.put(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/exclude",
@@ -287,7 +289,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
return res_json
def membership_switch(
self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration
self, tenant_id: TenantId, timeline_id: TimelineId, to: MembershipConfiguration
) -> TimelineMembershipSwitchResponse:
res = self.put(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/membership",

View File

@@ -66,11 +66,11 @@ def test_basebackup_with_high_slru_count(
n_txns = 500000
def setup_wrapper(env: NeonEnv):
return setup_tenant_template(env, n_txns)
env = setup_pageserver_with_tenants(
neon_env_builder, f"large_slru_count-{n_tenants}-{n_txns}", n_tenants, setup_wrapper
neon_env_builder,
f"large_slru_count-{n_tenants}-{n_txns}",
n_tenants,
lambda env: setup_tenant_template(env, n_txns),
)
run_benchmark(env, pg_bin, record, duration)
@@ -80,10 +80,6 @@ def setup_tenant_template(env: NeonEnv, n_txns: int):
"gc_period": "0s", # disable periodic gc
"checkpoint_timeout": "10 years",
"compaction_period": "0s", # disable periodic compaction
"compaction_threshold": 10,
"compaction_target_size": 134217728,
"checkpoint_distance": 268435456,
"image_creation_threshold": 3,
}
template_tenant, template_timeline = env.create_tenant(set_default=True)

View File

@@ -1,5 +1,8 @@
import concurrent.futures
import dataclasses
import json
import re
import threading
import time
from dataclasses import dataclass
from pathlib import Path
@@ -31,15 +34,15 @@ class PageServicePipeliningConfigPipelined(PageServicePipeliningConfig):
mode: str = "pipelined"
EXECUTION = ["concurrent-futures", "tasks"]
EXECUTION = ["concurrent-futures"]
NON_BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()]
for max_batch_size in [1, 32]:
for execution in EXECUTION:
NON_BATCHABLE.append(PageServicePipeliningConfigPipelined(max_batch_size, execution))
BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()]
for max_batch_size in [1, 2, 4, 8, 16, 32]:
BATCHABLE: list[PageServicePipeliningConfig] = []
for max_batch_size in [32]:
for execution in EXECUTION:
BATCHABLE.append(PageServicePipeliningConfigPipelined(max_batch_size, execution))
@@ -47,19 +50,6 @@ for max_batch_size in [1, 2, 4, 8, 16, 32]:
@pytest.mark.parametrize(
"tablesize_mib, pipelining_config, target_runtime, effective_io_concurrency, readhead_buffer_size, name",
[
# non-batchable workloads
# (A separate benchmark will consider latency).
*[
(
50,
config,
TARGET_RUNTIME,
1,
128,
f"not batchable {dataclasses.asdict(config)}",
)
for config in NON_BATCHABLE
],
# batchable workloads should show throughput and CPU efficiency improvements
*[
(
@@ -137,7 +127,14 @@ def test_throughput(
env = neon_env_builder.init_start()
ps_http = env.pageserver.http_client()
endpoint = env.endpoints.create_start("main")
endpoint = env.endpoints.create_start(
"main",
config_lines=[
# minimal lfc & small shared buffers to force requests to pageserver
"neon.max_file_cache_size=1MB",
"shared_buffers=10MB",
],
)
conn = endpoint.connect()
cur = conn.cursor()
@@ -155,7 +152,6 @@ def test_throughput(
tablesize = tablesize_mib * 1024 * 1024
npages = tablesize // (8 * 1024)
cur.execute("INSERT INTO t SELECT generate_series(1, %s)", (npages,))
# TODO: can we force postgres to do sequential scans?
#
# Run the workload, collect `Metrics` before and after, calculate difference, normalize.
@@ -211,31 +207,73 @@ def test_throughput(
).value,
)
def workload() -> Metrics:
def workload(disruptor_started: threading.Event) -> Metrics:
disruptor_started.wait()
start = time.time()
iters = 0
while time.time() - start < target_runtime or iters < 2:
log.info("Seqscan %d", iters)
if iters == 1:
# round zero for warming up
before = get_metrics()
cur.execute(
"select clear_buffer_cache()"
) # TODO: what about LFC? doesn't matter right now because LFC isn't enabled by default in tests
cur.execute("select sum(data::bigint) from t")
assert cur.fetchall()[0][0] == npages * (npages + 1) // 2
iters += 1
after = get_metrics()
return (after - before).normalize(iters - 1)
def disruptor(disruptor_started: threading.Event, stop_disruptor: threading.Event):
conn = endpoint.connect()
cur = conn.cursor()
iters = 0
while True:
cur.execute("SELECT pg_logical_emit_message(true, 'test', 'advancelsn')")
if stop_disruptor.is_set():
break
disruptor_started.set()
iters += 1
time.sleep(0.001)
return iters
env.pageserver.patch_config_toml_nonrecursive(
{"page_service_pipelining": dataclasses.asdict(pipelining_config)}
)
env.pageserver.restart()
metrics = workload()
# set trace for log analysis below
env.pageserver.restart(extra_env_vars={"RUST_LOG": "info,pageserver::page_service=trace"})
log.info("Starting workload")
with concurrent.futures.ThreadPoolExecutor() as executor:
disruptor_started = threading.Event()
stop_disruptor = threading.Event()
disruptor_fut = executor.submit(disruptor, disruptor_started, stop_disruptor)
workload_fut = executor.submit(workload, disruptor_started)
metrics = workload_fut.result()
stop_disruptor.set()
ndisruptions = disruptor_fut.result()
log.info("Disruptor issued %d disrupting requests", ndisruptions)
log.info("Results: %s", metrics)
since_last_start: list[str] = []
for line in env.pageserver.logfile.read_text().splitlines():
if "git:" in line:
since_last_start = []
since_last_start.append(line)
stopping_batching_because_re = re.compile(
r"stopping batching because (LSN changed|of batch size|timeline object mismatch|batch key changed|same page was requested at different LSNs|.*)"
)
reasons_for_stopping_batching = {}
for line in since_last_start:
match = stopping_batching_because_re.search(line)
if match:
if match.group(1) not in reasons_for_stopping_batching:
reasons_for_stopping_batching[match.group(1)] = 0
reasons_for_stopping_batching[match.group(1)] += 1
log.info("Reasons for stopping batching: %s", reasons_for_stopping_batching)
#
# Sanity-checks on the collected data
#

View File

@@ -1,6 +1,5 @@
from __future__ import annotations
import copy
import json
import uuid
from typing import TYPE_CHECKING
@@ -16,7 +15,6 @@ if TYPE_CHECKING:
from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/11395")
def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
env = neon_env_builder.init_start()
@@ -44,7 +42,6 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
"refill_interval": "100ms",
"refill_amount": int(rate_limit_rps / 10),
"max": int(rate_limit_rps / 10),
"fair": True,
},
},
)
@@ -98,17 +95,12 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
_, marker_offset = wait_until(lambda: env.pageserver.assert_log_contains(marker, offset=None))
log.info("run pagebench")
duration_secs = 10
duration_secs = 20
actual_ncompleted = run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs)
log.info("validate the client is capped at the configured rps limit")
expect_ncompleted = duration_secs * rate_limit_rps
delta_abs = abs(expect_ncompleted - actual_ncompleted)
threshold = 0.05 * expect_ncompleted
assert threshold / rate_limit_rps < 0.1 * duration_secs, (
"test self-test: unrealistic expecations regarding precision in this test"
)
assert delta_abs < 0.05 * expect_ncompleted, (
assert pytest.approx(expect_ncompleted, 0.05) == actual_ncompleted, (
"the throttling deviates more than 5percent from the expectation"
)
@@ -122,6 +114,7 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
timeout=compaction_period,
)
log.info("validate the metrics")
smgr_query_seconds_post = ps_http.get_metric_value(smgr_metric_name, smgr_metrics_query)
assert smgr_query_seconds_post is not None
throttled_usecs_post = ps_http.get_metric_value(throttle_metric_name, throttle_metrics_query)
@@ -130,72 +123,13 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
actual_throttled_usecs = throttled_usecs_post - throttled_usecs_pre
actual_throttled_secs = actual_throttled_usecs / 1_000_000
log.info("validate that the metric doesn't include throttle wait time")
assert duration_secs >= 10 * actual_smgr_query_seconds, (
"smgr metrics should not include throttle wait time"
)
log.info("validate that the throttling wait time metrics is correct")
assert pytest.approx(actual_throttled_secs + actual_smgr_query_seconds, 0.1) == duration_secs, (
"most of the time in this test is spent throttled because the rate-limit's contribution to latency dominates"
"throttling and processing latency = total request time; this assert validates thi holds on average"
)
throttle_config_with_field_fair_set = {
"task_kinds": ["PageRequestHandler"],
"fair": True,
"initial": 27,
"refill_interval": "43s",
"refill_amount": 23,
"max": 42,
}
def assert_throttle_config_with_field_fair_set(conf):
"""
Field `fair` is ignored, so, responses don't contain it
"""
without_fair = copy.deepcopy(throttle_config_with_field_fair_set)
without_fair.pop("fair")
assert conf == without_fair
def test_throttle_fair_config_is_settable_but_ignored_in_mgmt_api(neon_env_builder: NeonEnvBuilder):
"""
To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out.
"""
env = neon_env_builder.init_start()
vps_http = env.storage_controller.pageserver_api()
# with_fair config should still be settable
vps_http.set_tenant_config(
env.initial_tenant,
{"timeline_get_throttle": throttle_config_with_field_fair_set},
)
conf = vps_http.tenant_config(env.initial_tenant)
assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"])
assert_throttle_config_with_field_fair_set(
conf.tenant_specific_overrides["timeline_get_throttle"]
)
def test_throttle_fair_config_is_settable_but_ignored_in_config_toml(
neon_env_builder: NeonEnvBuilder,
):
"""
To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out.
"""
def set_tenant_config(ps_cfg):
tenant_config = ps_cfg.setdefault("tenant_config", {})
tenant_config["timeline_get_throttle"] = throttle_config_with_field_fair_set
neon_env_builder.pageserver_config_override = set_tenant_config
env = neon_env_builder.init_start()
ps_http = env.pageserver.http_client()
conf = ps_http.tenant_config(env.initial_tenant)
assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"])
env.pageserver.allowed_errors.append(
r'.*ignoring unknown configuration item path="tenant_config\.timeline_get_throttle\.fair"*'
# without this assertion, the test would pass even if the throttling was completely broken
# but the request processing is so slow that it makes up for the latency that a correct throttling
# implementation would add
assert actual_smgr_query_seconds < 0.66 * duration_secs, (
"test self-test: request processing is consuming most of the wall clock time; this risks that we're not actually testing throttling"
)

View File

@@ -242,7 +242,13 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver,
pageserver.tenant_location_configure(tenant_id, location_conf)
last_state[pageserver.id] = (mode, generation)
if mode.startswith("Attached"):
# It's only valid to connect to the last generation. Newer generations may yank layer
# files used in older generations.
last_generation = max(
[s[1] for s in last_state.values() if s[1] is not None], default=None
)
if mode.startswith("Attached") and generation == last_generation:
# This is a basic test: we are validating that he endpoint works properly _between_
# configuration changes. A stronger test would be to validate that clients see
# no errors while we are making the changes.

View File

@@ -1,5 +1,6 @@
import os
import ssl
from datetime import datetime, timedelta
import pytest
import requests
@@ -151,3 +152,63 @@ def test_certificate_rotation(neon_env_builder: NeonEnvBuilder):
requests.get(addr, verify=str(env.ssl_ca_file)).raise_for_status()
cur_cert = ssl.get_server_certificate(("localhost", port))
assert cur_cert == sk_cert
def test_server_and_cert_metrics(neon_env_builder: NeonEnvBuilder):
"""
Test metrics exported from http/https server and tls cert reloader.
"""
neon_env_builder.use_https_pageserver_api = True
neon_env_builder.pageserver_config_override = "ssl_cert_reload_period='100 ms'"
env = neon_env_builder.init_start()
env.pageserver.allowed_errors.append(".*Error reloading certificate.*")
ps_client = env.pageserver.http_client()
# 1. Test connection started metric.
filter_https = {"scheme": "https"}
old_https_conn_count = (
ps_client.get_metric_value("http_server_connection_started_total", filter_https) or 0
)
addr = f"https://localhost:{env.pageserver.service_port.https}/v1/status"
requests.get(addr, verify=str(env.ssl_ca_file)).raise_for_status()
new_https_conn_count = (
ps_client.get_metric_value("http_server_connection_started_total", filter_https) or 0
)
# The counter should increase after the request,
# but it may increase by more than one because of storcon requests.
assert new_https_conn_count > old_https_conn_count
# 2. Test tls connection error.
# Request without specified CA cert file should fail.
with pytest.raises(requests.exceptions.SSLError):
requests.get(addr)
tls_error_cnt = (
ps_client.get_metric_value("http_server_connection_errors_total", {"type": "tls"}) or 0
)
assert tls_error_cnt == 1
# 3. Test expiration time metric.
expiration_time = datetime.fromtimestamp(
ps_client.get_metric_value("tls_certs_expiration_time_seconds") or 0
)
now = datetime.now()
# neon_local generates certs valid for 100 years.
# Compare with +-1 year to not care about leap years.
assert now + timedelta(days=365 * 99) < expiration_time < now + timedelta(days=365 * 101)
# 4. Test cert reload failed metric.
reload_error_cnt = ps_client.get_metric_value("tls_certs_reload_failed_total")
assert reload_error_cnt == 0
os.remove(env.pageserver.workdir / "server.crt")
def reload_failed():
reload_error_cnt = ps_client.get_metric_value("tls_certs_reload_failed_total") or 0
assert reload_error_cnt > 0
wait_until(reload_failed)

View File

@@ -2892,10 +2892,12 @@ def test_storage_controller_leadership_transfer(
)
@pytest.mark.parametrize("step_down_times_out", [False, True])
def test_storage_controller_leadership_transfer_during_split(
neon_env_builder: NeonEnvBuilder,
storage_controller_proxy: StorageControllerProxy,
port_distributor: PortDistributor,
step_down_times_out: bool,
):
"""
Exercise a race between shard splitting and graceful leadership transfer. This is
@@ -2936,6 +2938,18 @@ def test_storage_controller_leadership_transfer_during_split(
)
env.storage_controller.reconcile_until_idle()
# We are testing scenarios where the step down API does not complete: either because it is stuck
# doing a shard split, or because it totally times out on some other failpoint.
env.storage_controller.allowed_errors.extend(
[
".*step_down.*request was dropped before completing.*",
".*step_down.*operation timed out.*",
".*Send step down request failed, will retry.*",
".*Send step down request still failed after.*retries.*",
".*Leader .+ did not respond to step-down request.*",
]
)
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
# Start a shard split
env.storage_controller.allowed_errors.extend(
@@ -2943,6 +2957,14 @@ def test_storage_controller_leadership_transfer_during_split(
)
pause_failpoint = "shard-split-pre-complete"
env.storage_controller.configure_failpoints((pause_failpoint, "pause"))
if not step_down_times_out:
# Prevent the timeout self-terminate code from executing: we will block step down on the
# shard split itself
env.storage_controller.configure_failpoints(
("step-down-delay-timeout", "return(3600000)")
)
split_fut = executor.submit(
env.storage_controller.tenant_shard_split, list(tenants)[0], shard_count * 2
)
@@ -2961,12 +2983,20 @@ def test_storage_controller_leadership_transfer_during_split(
timeout_in_seconds=30, instance_id=2, base_port=storage_controller_2_port
)
if step_down_times_out:
# Step down will time out, original controller will terminate itself
env.storage_controller.allowed_errors.extend([".*terminating process.*"])
else:
# Step down does not time out: original controller hits its shard split completion
# code path and realises that it must not purge the parent shards from the database.
env.storage_controller.allowed_errors.extend([".*Enqueuing background abort.*"])
def passed_split_abort():
try:
log.info("Checking log for pattern...")
assert env.storage_controller.log_contains(
".*Using observed state received from leader.*"
)
# This log is indicative of entering startup_reconcile, which happens
# after the point we would abort shard splits
assert env.storage_controller.log_contains(".*Populating tenant shards.*")
except Exception:
log.exception("Failed to find pattern in log")
raise
@@ -2975,34 +3005,42 @@ def test_storage_controller_leadership_transfer_during_split(
wait_until(passed_split_abort, interval=0.1, status_interval=1.0)
assert env.storage_controller.log_contains(".*Aborting shard split.*")
# Proxy is still talking to original controller here: disable its pause failpoint so
# that its shard split can run to completion.
log.info("Disabling failpoint")
# Bypass the proxy: the python test HTTPServer is single threaded and still blocked
# on handling the shard split request.
env.storage_controller.request(
"PUT",
f"http://127.0.0.1:{storage_controller_1_port}/debug/v1/failpoints",
json=[{"name": "shard-split-pre-complete", "actions": "off"}],
headers=env.storage_controller.headers(TokenScope.ADMIN),
)
if step_down_times_out:
# We will let the old controller hit a timeout path where it terminates itself, rather than
# completing step_down and trying to complete a shard split
def old_controller_terminated():
assert env.storage_controller.log_contains(".*terminating process.*")
def previous_stepped_down():
assert (
env.storage_controller.get_leadership_status()
== StorageControllerLeadershipStatus.STEPPED_DOWN
wait_until(old_controller_terminated)
else:
# Proxy is still talking to original controller here: disable its pause failpoint so
# that its shard split can run to completion.
log.info("Disabling failpoint")
# Bypass the proxy: the python test HTTPServer is single threaded and still blocked
# on handling the shard split request.
env.storage_controller.request(
"PUT",
f"http://127.0.0.1:{storage_controller_1_port}/debug/v1/failpoints",
json=[{"name": "shard-split-pre-complete", "actions": "off"}],
headers=env.storage_controller.headers(TokenScope.ADMIN),
)
log.info("Awaiting step down")
wait_until(previous_stepped_down)
def previous_stepped_down():
assert (
env.storage_controller.get_leadership_status()
== StorageControllerLeadershipStatus.STEPPED_DOWN
)
# Let the shard split complete: this may happen _after_ the replacement has come up
# and tried to clean up the databases
log.info("Unblocking & awaiting shard split")
with pytest.raises(Exception, match="Unexpected child shard count"):
# This split fails when it tries to persist results, because it encounters
# changes already made by the new controller's abort-on-startup
split_fut.result()
log.info("Awaiting step down")
wait_until(previous_stepped_down)
# Let the shard split complete: this may happen _after_ the replacement has come up
# and tried to clean up the databases
log.info("Unblocking & awaiting shard split")
with pytest.raises(Exception, match="Unexpected child shard count"):
# This split fails when it tries to persist results, because it encounters
# changes already made by the new controller's abort-on-startup
split_fut.result()
log.info("Routing to new leader")
storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_2_port}")
@@ -3020,13 +3058,14 @@ def test_storage_controller_leadership_transfer_during_split(
env.storage_controller.wait_until_ready()
env.storage_controller.consistency_check()
# Check that the stepped down instance forwards requests
# to the new leader while it's still running.
storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}")
env.storage_controller.tenant_shard_dump()
env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"})
status = env.storage_controller.node_status(env.pageservers[0].id)
assert status["scheduling"] == "Pause"
if not step_down_times_out:
# Check that the stepped down instance forwards requests
# to the new leader while it's still running.
storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}")
env.storage_controller.tenant_shard_dump()
env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"})
status = env.storage_controller.node_status(env.pageservers[0].id)
assert status["scheduling"] == "Pause"
def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvBuilder):

View File

@@ -45,7 +45,7 @@ from fixtures.remote_storage import (
s3_storage,
)
from fixtures.safekeeper.http import (
Configuration,
MembershipConfiguration,
SafekeeperHttpClient,
SafekeeperId,
TimelineCreateRequest,
@@ -589,7 +589,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
for sk in env.safekeepers:
sk.start()
cli = sk.http_client()
mconf = Configuration(generation=0, members=[], new_members=None)
mconf = MembershipConfiguration(generation=0, members=[], new_members=None)
# set start_lsn to the beginning of the first segment to allow reading
# WAL from there (could you intidb LSN as well).
r = TimelineCreateRequest(
@@ -1948,7 +1948,7 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder):
sk_id_2 = SafekeeperId(11, "localhost", 5434) # just a mock
# Request to switch before timeline creation should fail.
init_conf = Configuration(generation=1, members=[sk_id_1], new_members=None)
init_conf = MembershipConfiguration(generation=1, members=[sk_id_1], new_members=None)
with pytest.raises(requests.exceptions.HTTPError):
http_cli.membership_switch(tenant_id, timeline_id, init_conf)
@@ -1960,7 +1960,7 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder):
http_cli.timeline_create(create_r)
# Switch into some conf.
joint_conf = Configuration(generation=4, members=[sk_id_1], new_members=[sk_id_2])
joint_conf = MembershipConfiguration(generation=4, members=[sk_id_1], new_members=[sk_id_2])
resp = http_cli.membership_switch(tenant_id, timeline_id, joint_conf)
log.info(f"joint switch resp: {resp}")
assert resp.previous_conf.generation == 1
@@ -1973,24 +1973,26 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder):
assert after_restart.generation == 4
# Switch into non joint conf of which sk is not a member, must fail.
non_joint_not_member = Configuration(generation=5, members=[sk_id_2], new_members=None)
non_joint_not_member = MembershipConfiguration(
generation=5, members=[sk_id_2], new_members=None
)
with pytest.raises(requests.exceptions.HTTPError):
resp = http_cli.membership_switch(tenant_id, timeline_id, non_joint_not_member)
# Switch into good non joint conf.
non_joint = Configuration(generation=6, members=[sk_id_1], new_members=None)
non_joint = MembershipConfiguration(generation=6, members=[sk_id_1], new_members=None)
resp = http_cli.membership_switch(tenant_id, timeline_id, non_joint)
log.info(f"non joint switch resp: {resp}")
assert resp.previous_conf.generation == 4
assert resp.current_conf.generation == 6
# Switch request to lower conf should be rejected.
lower_conf = Configuration(generation=3, members=[sk_id_1], new_members=None)
lower_conf = MembershipConfiguration(generation=3, members=[sk_id_1], new_members=None)
with pytest.raises(requests.exceptions.HTTPError):
http_cli.membership_switch(tenant_id, timeline_id, lower_conf)
# Now, exclude sk from the membership, timeline should be deleted.
excluded_conf = Configuration(generation=7, members=[sk_id_2], new_members=None)
excluded_conf = MembershipConfiguration(generation=7, members=[sk_id_2], new_members=None)
http_cli.timeline_exclude(tenant_id, timeline_id, excluded_conf)
with pytest.raises(requests.exceptions.HTTPError):
http_cli.timeline_status(tenant_id, timeline_id)
@@ -2010,11 +2012,6 @@ def test_explicit_timeline_creation(neon_env_builder: NeonEnvBuilder):
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
ps = env.pageservers[0]
ps_http_cli = ps.http_client()
http_clis = [sk.http_client() for sk in env.safekeepers]
config_lines = [
"neon.safekeeper_proto_version = 3",
]
@@ -2023,22 +2020,11 @@ def test_explicit_timeline_creation(neon_env_builder: NeonEnvBuilder):
# expected to fail because timeline is not created on safekeepers
with pytest.raises(Exception, match=r".*timed out.*"):
ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3], timeout="2s")
# figure out initial LSN.
ps_timeline_detail = ps_http_cli.timeline_detail(tenant_id, timeline_id)
init_lsn = ps_timeline_detail["last_record_lsn"]
log.info(f"initial LSN: {init_lsn}")
# sk timeline creation request expects minor version
pg_version = ps_timeline_detail["pg_version"] * 10000
# create inital mconf
sk_ids = [SafekeeperId(sk.id, "localhost", sk.port.pg_tenant_only) for sk in env.safekeepers]
mconf = Configuration(generation=1, members=sk_ids, new_members=None)
create_r = TimelineCreateRequest(
tenant_id, timeline_id, mconf, pg_version, Lsn(init_lsn), commit_lsn=None
mconf = MembershipConfiguration(
generation=1, members=Safekeeper.sks_to_safekeeper_ids(env.safekeepers), new_members=None
)
log.info(f"sending timeline create: {create_r.to_json()}")
for sk_http_cli in http_clis:
sk_http_cli.timeline_create(create_r)
Safekeeper.create_timeline(tenant_id, timeline_id, env.pageservers[0], mconf, env.safekeepers)
# Once timeline created endpoint should start.
ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3])
ep.safe_psql("CREATE TABLE IF NOT EXISTS t(key int, value text)")

View File

@@ -18,6 +18,7 @@ from fixtures.neon_fixtures import (
Safekeeper,
)
from fixtures.remote_storage import RemoteStorageKind
from fixtures.safekeeper.http import MembershipConfiguration
from fixtures.utils import skip_in_debug_build
if TYPE_CHECKING:
@@ -452,20 +453,24 @@ def test_concurrent_computes(neon_env_builder: NeonEnvBuilder):
asyncio.run(run_concurrent_computes(env))
async def assert_query_hangs(endpoint: Endpoint, query: str):
"""
Start on endpoint query which is expected to hang and check that it does.
"""
conn = await endpoint.connect_async()
bg_query = asyncio.create_task(conn.execute(query))
await asyncio.sleep(2)
assert not bg_query.done()
return bg_query
# Stop safekeeper and check that query cannot be executed while safekeeper is down.
# Query will insert a single row into a table.
async def check_unavailability(
sk: Safekeeper, conn: asyncpg.Connection, key: int, start_delay_sec: int = 2
):
async def check_unavailability(sk: Safekeeper, ep: Endpoint, key: int, start_delay_sec: int = 2):
# shutdown one of two acceptors, that is, majority
sk.stop()
bg_query = asyncio.create_task(conn.execute(f"INSERT INTO t values ({key}, 'payload')"))
await asyncio.sleep(start_delay_sec)
# ensure that the query has not been executed yet
assert not bg_query.done()
bg_query = await assert_query_hangs(ep, f"INSERT INTO t values ({key}, 'payload')")
# start safekeeper and await the query
sk.start()
await bg_query
@@ -480,10 +485,10 @@ async def run_unavailability(env: NeonEnv, endpoint: Endpoint):
await conn.execute("INSERT INTO t values (1, 'payload')")
# stop safekeeper and check that query cannot be executed while safekeeper is down
await check_unavailability(env.safekeepers[0], conn, 2)
await check_unavailability(env.safekeepers[0], endpoint, 2)
# for the world's balance, do the same with second safekeeper
await check_unavailability(env.safekeepers[1], conn, 3)
await check_unavailability(env.safekeepers[1], endpoint, 3)
# check that we can execute queries after restart
await conn.execute("INSERT INTO t values (4, 'payload')")
@@ -514,15 +519,7 @@ async def run_recovery_uncommitted(env: NeonEnv):
# insert with only one safekeeper up to create tail of flushed but not committed WAL
sk1.stop()
sk2.stop()
conn = await ep.connect_async()
# query should hang, so execute in separate task
bg_query = asyncio.create_task(
conn.execute("insert into t select generate_series(1, 2000), 'payload'")
)
sleep_sec = 2
await asyncio.sleep(sleep_sec)
# it must still be not finished
assert not bg_query.done()
await assert_query_hangs(ep, "insert into t select generate_series(1, 2000), 'payload'")
# note: destoy will kill compute_ctl, preventing it waiting for hanging sync-safekeepers.
ep.stop_and_destroy()
@@ -559,15 +556,7 @@ async def run_wal_truncation(env: NeonEnv, safekeeper_proto_version: int):
# insert with only one sk3 up to create tail of flushed but not committed WAL on it
sk1.stop()
sk2.stop()
conn = await ep.connect_async()
# query should hang, so execute in separate task
bg_query = asyncio.create_task(
conn.execute("insert into t select generate_series(1, 180000), 'Papaya'")
)
sleep_sec = 2
await asyncio.sleep(sleep_sec)
# it must still be not finished
assert not bg_query.done()
await assert_query_hangs(ep, "insert into t select generate_series(1, 180000), 'Papaya'")
# note: destoy will kill compute_ctl, preventing it waiting for hanging sync-safekeepers.
ep.stop_and_destroy()
@@ -607,6 +596,132 @@ def test_wal_truncation(neon_env_builder: NeonEnvBuilder, safekeeper_proto_versi
asyncio.run(run_wal_truncation(env, safekeeper_proto_version))
async def quorum_sanity_single(
env: NeonEnv,
compute_sks_ids: list[int],
members_sks_ids: list[int],
new_members_sks_ids: list[int] | None,
sks_to_stop_ids: list[int],
should_work_when_stopped: bool,
):
"""
*_ids params contain safekeeper node ids; it is assumed they are issued
from 1 and sequentially assigned to env.safekeepers.
"""
members_sks = [env.safekeepers[i - 1] for i in members_sks_ids]
new_members_sks = (
[env.safekeepers[i - 1] for i in new_members_sks_ids] if new_members_sks_ids else None
)
sks_to_stop = [env.safekeepers[i - 1] for i in sks_to_stop_ids]
mconf = MembershipConfiguration(
generation=1,
members=Safekeeper.sks_to_safekeeper_ids(members_sks),
new_members=Safekeeper.sks_to_safekeeper_ids(new_members_sks) if new_members_sks else None,
)
members_sks = Safekeeper.mconf_sks(env, mconf)
tenant_id = env.initial_tenant
compute_sks_ids_str = "-".join([str(sk_id) for sk_id in compute_sks_ids])
members_sks_ids_str = "-".join([str(sk.id) for sk in mconf.members])
new_members_sks_ids_str = "-".join(
[str(sk.id) for sk in mconf.new_members] if mconf.new_members is not None else []
)
sks_to_stop_ids_str = "-".join([str(sk.id) for sk in sks_to_stop])
log.info(
f"running quorum_sanity_single with compute_sks={compute_sks_ids_str}, members_sks={members_sks_ids_str}, new_members_sks={new_members_sks_ids_str}, sks_to_stop={sks_to_stop_ids_str}, should_work_when_stopped={should_work_when_stopped}"
)
branch_name = f"test_quorum_single_c{compute_sks_ids_str}_m{members_sks_ids_str}_{new_members_sks_ids_str}_s{sks_to_stop_ids_str}"
timeline_id = env.create_branch(branch_name)
# create timeline on `members_sks`
Safekeeper.create_timeline(tenant_id, timeline_id, env.pageservers[0], mconf, members_sks)
config_lines = [
"neon.safekeeper_proto_version = 3",
]
ep = env.endpoints.create(branch_name, config_lines=config_lines)
ep.start(safekeeper_generation=1, safekeepers=compute_sks_ids)
ep.safe_psql("create table t(key int, value text)")
# stop specified sks and check whether writes work
for sk in sks_to_stop:
sk.stop()
if should_work_when_stopped:
log.info("checking that writes still work")
ep.safe_psql("insert into t select generate_series(1, 100), 'Papaya'")
# restarting ep should also be fine
ep.stop()
ep.start()
ep.safe_psql("insert into t select generate_series(1, 100), 'plum'")
bg_query = None
else:
log.info("checking that writes hang")
bg_query = await assert_query_hangs(
ep, "insert into t select generate_series(1, 100), 'Papaya'"
)
# start again; now they should work
for sk in sks_to_stop:
sk.start()
if bg_query:
log.info("awaiting query")
await bg_query
# It's a bit tempting to iterate over all possible combinations, but let's stick
# with this for now.
async def run_quorum_sanity(env: NeonEnv):
# 3 members, all up, should work
await quorum_sanity_single(env, [1, 2, 3], [1, 2, 3], None, [], True)
# 3 members, 2/3 up, should work
await quorum_sanity_single(env, [1, 2, 3], [1, 2, 3], None, [3], True)
# 3 members, 1/3 up, should not work
await quorum_sanity_single(env, [1, 2, 3], [1, 2, 3], None, [2, 3], False)
# 3 members, all up, should work; wp redundantly talks to 4th.
await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], None, [], True)
# 3 members, all up, should work with wp talking to 2 of these 3 + plus one redundant
await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], None, [], True)
# 3 members, 2/3 up, could work but wp talks to different 3s, so it shouldn't
await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], None, [3], False)
# joint conf of 1-2-3 and 4, all up, should work
await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [4], [], True)
# joint conf of 1-2-3 and 4, 4 down, shouldn't work
await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [4], [4], False)
# joint conf of 1-2-3 and 2-3-4, all up, should work
await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [], True)
# joint conf of 1-2-3 and 2-3-4, 1 and 4 down, should work
await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [1, 4], True)
# joint conf of 1-2-3 and 2-3-4, 2 down, should work
await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [2], True)
# joint conf of 1-2-3 and 2-3-4, 3 down, should work
await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [3], True)
# joint conf of 1-2-3 and 2-3-4, 1 and 2 down, shouldn't work
await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [1, 2], False)
# joint conf of 1-2-3 and 2-3-4, 2 and 4 down, shouldn't work
await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [2, 4], False)
# joint conf of 1-2-3 and 2-3-4 with wp talking to 2-3-4 only.
await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], [2, 3, 4], [], True)
# with 1 down should still be ok
await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], [2, 3, 4], [1], True)
# but with 2 down not ok
await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], [2, 3, 4], [2], False)
# Test various combinations of membership configurations / neon.safekeepers
# (list of safekeepers endpoint connects to) values / up & down safekeepers and
# check that endpont can start and write data when we have quorum and can't when
# we don't.
def test_quorum_sanity(neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_safekeepers = 4
env = neon_env_builder.init_start()
asyncio.run(run_quorum_sanity(env))
async def run_segment_init_failure(env: NeonEnv):
env.create_branch("test_segment_init_failure")
ep = env.endpoints.create_start("test_segment_init_failure")