mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-21 07:00:38 +00:00
Merge 2025-04-11 main commit 'c66444ea1538349d13ab5e87bca880394434004b' into yuchen/direct-io-delta-image-layer-write
This commit is contained in:
1
Cargo.lock
generated
1
Cargo.lock
generated
@@ -2837,6 +2837,7 @@ dependencies = [
|
||||
"utils",
|
||||
"uuid",
|
||||
"workspace_hack",
|
||||
"x509-cert",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
||||
@@ -523,11 +523,14 @@ impl ComputeNode {
|
||||
|
||||
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
info!(
|
||||
"starting compute for project {}, operation {}, tenant {}, timeline {}, features {:?}, spec.remote_extensions {:?}",
|
||||
"starting compute for project {}, operation {}, tenant {}, timeline {}, project {}, branch {}, endpoint {}, features {:?}, spec.remote_extensions {:?}",
|
||||
pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"),
|
||||
pspec.spec.operation_uuid.as_deref().unwrap_or("None"),
|
||||
pspec.tenant_id,
|
||||
pspec.timeline_id,
|
||||
pspec.spec.project_id.as_deref().unwrap_or("None"),
|
||||
pspec.spec.branch_id.as_deref().unwrap_or("None"),
|
||||
pspec.spec.endpoint_id.as_deref().unwrap_or("None"),
|
||||
pspec.spec.features,
|
||||
pspec.spec.remote_extensions,
|
||||
);
|
||||
@@ -631,19 +634,23 @@ impl ComputeNode {
|
||||
});
|
||||
}
|
||||
|
||||
// Configure and start rsyslog for HIPAA if necessary
|
||||
if let ComputeAudit::Hipaa = pspec.spec.audit_log_level {
|
||||
let remote_endpoint = std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string());
|
||||
if remote_endpoint.is_empty() {
|
||||
anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty");
|
||||
// Configure and start rsyslog for compliance audit logging
|
||||
match pspec.spec.audit_log_level {
|
||||
ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
|
||||
let remote_endpoint =
|
||||
std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string());
|
||||
if remote_endpoint.is_empty() {
|
||||
anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty");
|
||||
}
|
||||
|
||||
let log_directory_path = Path::new(&self.params.pgdata).join("log");
|
||||
let log_directory_path = log_directory_path.to_string_lossy().to_string();
|
||||
configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?;
|
||||
|
||||
// Launch a background task to clean up the audit logs
|
||||
launch_pgaudit_gc(log_directory_path);
|
||||
}
|
||||
|
||||
let log_directory_path = Path::new(&self.params.pgdata).join("log");
|
||||
let log_directory_path = log_directory_path.to_string_lossy().to_string();
|
||||
configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?;
|
||||
|
||||
// Launch a background task to clean up the audit logs
|
||||
launch_pgaudit_gc(log_directory_path);
|
||||
_ => {}
|
||||
}
|
||||
|
||||
// Configure and start rsyslog for Postgres logs export
|
||||
|
||||
@@ -89,6 +89,15 @@ pub fn write_postgres_conf(
|
||||
escape_conf_value(&s.to_string())
|
||||
)?;
|
||||
}
|
||||
if let Some(s) = &spec.project_id {
|
||||
writeln!(file, "neon.project_id={}", escape_conf_value(s))?;
|
||||
}
|
||||
if let Some(s) = &spec.branch_id {
|
||||
writeln!(file, "neon.branch_id={}", escape_conf_value(s))?;
|
||||
}
|
||||
if let Some(s) = &spec.endpoint_id {
|
||||
writeln!(file, "neon.endpoint_id={}", escape_conf_value(s))?;
|
||||
}
|
||||
|
||||
// tls
|
||||
if let Some(tls_config) = tls_config {
|
||||
@@ -169,7 +178,7 @@ pub fn write_postgres_conf(
|
||||
// and don't allow the user or the control plane admin to change them.
|
||||
match spec.audit_log_level {
|
||||
ComputeAudit::Disabled => {}
|
||||
ComputeAudit::Log => {
|
||||
ComputeAudit::Log | ComputeAudit::Base => {
|
||||
writeln!(file, "# Managed by compute_ctl base audit settings: start")?;
|
||||
writeln!(file, "pgaudit.log='ddl,role'")?;
|
||||
// Disable logging of catalog queries to reduce the noise
|
||||
@@ -193,16 +202,20 @@ pub fn write_postgres_conf(
|
||||
}
|
||||
writeln!(file, "# Managed by compute_ctl base audit settings: end")?;
|
||||
}
|
||||
ComputeAudit::Hipaa => {
|
||||
ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
|
||||
writeln!(
|
||||
file,
|
||||
"# Managed by compute_ctl compliance audit settings: begin"
|
||||
)?;
|
||||
// This log level is very verbose
|
||||
// but this is necessary for HIPAA compliance.
|
||||
// Exclude 'misc' category, because it doesn't contain anythig relevant.
|
||||
writeln!(file, "pgaudit.log='all, -misc'")?;
|
||||
writeln!(file, "pgaudit.log_parameter=on")?;
|
||||
// Enable logging of parameters.
|
||||
// This is very verbose and may contain sensitive data.
|
||||
if spec.audit_log_level == ComputeAudit::Full {
|
||||
writeln!(file, "pgaudit.log_parameter=on")?;
|
||||
writeln!(file, "pgaudit.log='all'")?;
|
||||
} else {
|
||||
writeln!(file, "pgaudit.log_parameter=off")?;
|
||||
writeln!(file, "pgaudit.log='all, -misc'")?;
|
||||
}
|
||||
// Disable logging of catalog queries
|
||||
// The catalog doesn't contain sensitive data, so we don't need to audit it.
|
||||
writeln!(file, "pgaudit.log_catalog=off")?;
|
||||
|
||||
@@ -278,12 +278,12 @@ impl ComputeNode {
|
||||
// so that all config operations are audit logged.
|
||||
match spec.audit_log_level
|
||||
{
|
||||
ComputeAudit::Hipaa => {
|
||||
ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
|
||||
phases.push(CreatePgauditExtension);
|
||||
phases.push(CreatePgauditlogtofileExtension);
|
||||
phases.push(DisablePostgresDBPgAudit);
|
||||
}
|
||||
ComputeAudit::Log => {
|
||||
ComputeAudit::Log | ComputeAudit::Base => {
|
||||
phases.push(CreatePgauditExtension);
|
||||
phases.push(DisablePostgresDBPgAudit);
|
||||
}
|
||||
|
||||
@@ -658,6 +658,9 @@ impl Endpoint {
|
||||
delta_operations: None,
|
||||
tenant_id: Some(self.tenant_id),
|
||||
timeline_id: Some(self.timeline_id),
|
||||
project_id: None,
|
||||
branch_id: None,
|
||||
endpoint_id: Some(self.endpoint_id.clone()),
|
||||
mode: self.mode,
|
||||
pageserver_connstring: Some(pageserver_connstring),
|
||||
safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
|
||||
|
||||
@@ -13,7 +13,9 @@ use pageserver_api::controller_api::{
|
||||
NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
|
||||
TenantCreateResponse, TenantLocateResponse,
|
||||
};
|
||||
use pageserver_api::models::{TenantConfigRequest, TimelineCreateRequest, TimelineInfo};
|
||||
use pageserver_api::models::{
|
||||
TenantConfig, TenantConfigRequest, TimelineCreateRequest, TimelineInfo,
|
||||
};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
|
||||
use postgres_backend::AuthType;
|
||||
@@ -82,7 +84,8 @@ impl NeonStorageControllerStopArgs {
|
||||
pub struct AttachHookRequest {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
pub node_id: Option<NodeId>,
|
||||
pub generation_override: Option<i32>,
|
||||
pub generation_override: Option<i32>, // only new tenants
|
||||
pub config: Option<TenantConfig>, // only new tenants
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
@@ -805,6 +808,7 @@ impl StorageController {
|
||||
tenant_shard_id,
|
||||
node_id: Some(pageserver_id),
|
||||
generation_override: None,
|
||||
config: None,
|
||||
};
|
||||
|
||||
let response = self
|
||||
|
||||
@@ -104,6 +104,12 @@ pub struct ComputeSpec {
|
||||
pub timeline_id: Option<TimelineId>,
|
||||
pub pageserver_connstring: Option<String>,
|
||||
|
||||
// More neon ids that we expose to the compute_ctl
|
||||
// and to postgres as neon extension GUCs.
|
||||
pub project_id: Option<String>,
|
||||
pub branch_id: Option<String>,
|
||||
pub endpoint_id: Option<String>,
|
||||
|
||||
/// Safekeeper membership config generation. It is put in
|
||||
/// neon.safekeepers GUC and serves two purposes:
|
||||
/// 1) Non zero value forces walproposer to use membership configurations.
|
||||
@@ -159,13 +165,7 @@ pub struct ComputeSpec {
|
||||
#[serde(default)] // Default false
|
||||
pub drop_subscriptions_before_start: bool,
|
||||
|
||||
/// Log level for audit logging:
|
||||
///
|
||||
/// Disabled - no audit logging. This is the default.
|
||||
/// log - log masked statements to the postgres log using pgaudit extension
|
||||
/// hipaa - log unmasked statements to the file using pgaudit and pgauditlogtofile extension
|
||||
///
|
||||
/// Extensions should be present in shared_preload_libraries
|
||||
/// Log level for compute audit logging
|
||||
#[serde(default)]
|
||||
pub audit_log_level: ComputeAudit,
|
||||
|
||||
@@ -289,14 +289,25 @@ impl ComputeMode {
|
||||
}
|
||||
|
||||
/// Log level for audit logging
|
||||
/// Disabled, log, hipaa
|
||||
/// Default is Disabled
|
||||
#[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
|
||||
pub enum ComputeAudit {
|
||||
#[default]
|
||||
Disabled,
|
||||
// Deprecated, use Base instead
|
||||
Log,
|
||||
// (pgaudit.log = 'ddl', pgaudit.log_parameter='off')
|
||||
// logged to the standard postgresql log stream
|
||||
Base,
|
||||
// Deprecated, use Full or Extended instead
|
||||
Hipaa,
|
||||
// (pgaudit.log = 'all, -misc', pgaudit.log_parameter='off')
|
||||
// logged to separate files collected by rsyslog
|
||||
// into dedicated log storage with strict access
|
||||
Extended,
|
||||
// (pgaudit.log='all', pgaudit.log_parameter='on'),
|
||||
// logged to separate files collected by rsyslog
|
||||
// into dedicated log storage with strict access.
|
||||
Full,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)]
|
||||
|
||||
@@ -30,6 +30,7 @@ tokio.workspace = true
|
||||
tracing.workspace = true
|
||||
url.workspace = true
|
||||
uuid.workspace = true
|
||||
x509-cert.workspace = true
|
||||
|
||||
# to use tokio channels as streams, this is faster to compile than async_stream
|
||||
# why is it only here? no other crate should use it, streams are rarely needed.
|
||||
|
||||
@@ -4,6 +4,8 @@ use futures::StreamExt;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use hyper0::Body;
|
||||
use hyper0::server::conn::Http;
|
||||
use metrics::{IntCounterVec, register_int_counter_vec};
|
||||
use once_cell::sync::Lazy;
|
||||
use routerify::{RequestService, RequestServiceBuilder};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio_rustls::TlsAcceptor;
|
||||
@@ -26,6 +28,24 @@ pub struct Server {
|
||||
tls_acceptor: Option<TlsAcceptor>,
|
||||
}
|
||||
|
||||
static CONNECTION_STARTED_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"http_server_connection_started_total",
|
||||
"Number of established http/https connections",
|
||||
&["scheme"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static CONNECTION_ERROR_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"http_server_connection_errors_total",
|
||||
"Number of occured connection errors by type",
|
||||
&["type"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
impl Server {
|
||||
pub fn new(
|
||||
request_service: Arc<RequestServiceBuilder<Body, ApiError>>,
|
||||
@@ -60,6 +80,15 @@ impl Server {
|
||||
false
|
||||
}
|
||||
|
||||
let tcp_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tcp"]);
|
||||
let tls_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tls"]);
|
||||
let http_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["http"]);
|
||||
let https_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["https"]);
|
||||
let panic_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["panic"]);
|
||||
|
||||
let http_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["http"]);
|
||||
let https_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["https"]);
|
||||
|
||||
let mut connections = FuturesUnordered::new();
|
||||
loop {
|
||||
tokio::select! {
|
||||
@@ -67,6 +96,7 @@ impl Server {
|
||||
let (tcp_stream, remote_addr) = match stream {
|
||||
Ok(stream) => stream,
|
||||
Err(err) => {
|
||||
tcp_error_cnt.inc();
|
||||
if !suppress_io_error(&err) {
|
||||
info!("Failed to accept TCP connection: {err:#}");
|
||||
}
|
||||
@@ -78,11 +108,18 @@ impl Server {
|
||||
let tls_acceptor = self.tls_acceptor.clone();
|
||||
let cancel = cancel.clone();
|
||||
|
||||
let tls_error_cnt = tls_error_cnt.clone();
|
||||
let http_error_cnt = http_error_cnt.clone();
|
||||
let https_error_cnt = https_error_cnt.clone();
|
||||
let http_connection_cnt = http_connection_cnt.clone();
|
||||
let https_connection_cnt = https_connection_cnt.clone();
|
||||
|
||||
connections.push(tokio::spawn(
|
||||
async move {
|
||||
match tls_acceptor {
|
||||
Some(tls_acceptor) => {
|
||||
// Handle HTTPS connection.
|
||||
https_connection_cnt.inc();
|
||||
let tls_stream = tokio::select! {
|
||||
tls_stream = tls_acceptor.accept(tcp_stream) => tls_stream,
|
||||
_ = cancel.cancelled() => return,
|
||||
@@ -90,6 +127,7 @@ impl Server {
|
||||
let tls_stream = match tls_stream {
|
||||
Ok(tls_stream) => tls_stream,
|
||||
Err(err) => {
|
||||
tls_error_cnt.inc();
|
||||
if !suppress_io_error(&err) {
|
||||
info!(%remote_addr, "Failed to accept TLS connection: {err:#}");
|
||||
}
|
||||
@@ -97,6 +135,7 @@ impl Server {
|
||||
}
|
||||
};
|
||||
if let Err(err) = Self::serve_connection(tls_stream, service, cancel).await {
|
||||
https_error_cnt.inc();
|
||||
if !suppress_hyper_error(&err) {
|
||||
info!(%remote_addr, "Failed to serve HTTPS connection: {err:#}");
|
||||
}
|
||||
@@ -104,7 +143,9 @@ impl Server {
|
||||
}
|
||||
None => {
|
||||
// Handle HTTP connection.
|
||||
http_connection_cnt.inc();
|
||||
if let Err(err) = Self::serve_connection(tcp_stream, service, cancel).await {
|
||||
http_error_cnt.inc();
|
||||
if !suppress_hyper_error(&err) {
|
||||
info!(%remote_addr, "Failed to serve HTTP connection: {err:#}");
|
||||
}
|
||||
@@ -115,6 +156,7 @@ impl Server {
|
||||
}
|
||||
Some(conn) = connections.next() => {
|
||||
if let Err(err) = conn {
|
||||
panic_error_cnt.inc();
|
||||
error!("Connection panicked: {err:#}");
|
||||
}
|
||||
}
|
||||
@@ -122,6 +164,7 @@ impl Server {
|
||||
// Wait for graceful shutdown of all connections.
|
||||
while let Some(conn) = connections.next().await {
|
||||
if let Err(err) = conn {
|
||||
panic_error_cnt.inc();
|
||||
error!("Connection panicked: {err:#}");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,11 +3,14 @@ use std::{sync::Arc, time::Duration};
|
||||
use anyhow::Context;
|
||||
use arc_swap::ArcSwap;
|
||||
use camino::Utf8Path;
|
||||
use metrics::{IntCounterVec, UIntGaugeVec, register_int_counter_vec, register_uint_gauge_vec};
|
||||
use once_cell::sync::Lazy;
|
||||
use rustls::{
|
||||
pki_types::{CertificateDer, PrivateKeyDer},
|
||||
pki_types::{CertificateDer, PrivateKeyDer, UnixTime},
|
||||
server::{ClientHello, ResolvesServerCert},
|
||||
sign::CertifiedKey,
|
||||
};
|
||||
use x509_cert::der::Reader;
|
||||
|
||||
pub async fn load_cert_chain(filename: &Utf8Path) -> anyhow::Result<Vec<CertificateDer<'static>>> {
|
||||
let cert_data = tokio::fs::read(filename)
|
||||
@@ -53,6 +56,76 @@ pub async fn load_certified_key(
|
||||
Ok(certified_key)
|
||||
}
|
||||
|
||||
/// rustls's CertifiedKey with extra parsed fields used for metrics.
|
||||
struct ParsedCertifiedKey {
|
||||
certified_key: CertifiedKey,
|
||||
expiration_time: UnixTime,
|
||||
}
|
||||
|
||||
/// Parse expiration time from an X509 certificate.
|
||||
fn parse_expiration_time(cert: &CertificateDer<'_>) -> anyhow::Result<UnixTime> {
|
||||
let parsed_cert = x509_cert::der::SliceReader::new(cert)
|
||||
.context("Failed to parse cerficiate")?
|
||||
.decode::<x509_cert::Certificate>()
|
||||
.context("Failed to parse cerficiate")?;
|
||||
|
||||
Ok(UnixTime::since_unix_epoch(
|
||||
parsed_cert
|
||||
.tbs_certificate
|
||||
.validity
|
||||
.not_after
|
||||
.to_unix_duration(),
|
||||
))
|
||||
}
|
||||
|
||||
async fn load_and_parse_certified_key(
|
||||
key_filename: &Utf8Path,
|
||||
cert_filename: &Utf8Path,
|
||||
) -> anyhow::Result<ParsedCertifiedKey> {
|
||||
let certified_key = load_certified_key(key_filename, cert_filename).await?;
|
||||
let expiration_time = parse_expiration_time(certified_key.end_entity_cert()?)?;
|
||||
Ok(ParsedCertifiedKey {
|
||||
certified_key,
|
||||
expiration_time,
|
||||
})
|
||||
}
|
||||
|
||||
static CERT_EXPIRATION_TIME: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"tls_certs_expiration_time_seconds",
|
||||
"Expiration time of the loaded certificate since unix epoch in seconds",
|
||||
&["resolver_name"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static CERT_RELOAD_STARTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"tls_certs_reload_started_total",
|
||||
"Number of certificate reload loop iterations started",
|
||||
&["resolver_name"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static CERT_RELOAD_UPDATED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"tls_certs_reload_updated_total",
|
||||
"Number of times the certificate was updated to the new one",
|
||||
&["resolver_name"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static CERT_RELOAD_FAILED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"tls_certs_reload_failed_total",
|
||||
"Number of times the certificate reload failed",
|
||||
&["resolver_name"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
/// Implementation of [`rustls::server::ResolvesServerCert`] which reloads certificates from
|
||||
/// the disk periodically.
|
||||
#[derive(Debug)]
|
||||
@@ -63,16 +136,28 @@ pub struct ReloadingCertificateResolver {
|
||||
impl ReloadingCertificateResolver {
|
||||
/// Creates a new Resolver by loading certificate and private key from FS and
|
||||
/// creating tokio::task to reload them with provided reload_period.
|
||||
/// resolver_name is used as metric's label.
|
||||
pub async fn new(
|
||||
resolver_name: &str,
|
||||
key_filename: &Utf8Path,
|
||||
cert_filename: &Utf8Path,
|
||||
reload_period: Duration,
|
||||
) -> anyhow::Result<Arc<Self>> {
|
||||
// Create metrics for current resolver.
|
||||
let cert_expiration_time = CERT_EXPIRATION_TIME.with_label_values(&[resolver_name]);
|
||||
let cert_reload_started_counter =
|
||||
CERT_RELOAD_STARTED_COUNTER.with_label_values(&[resolver_name]);
|
||||
let cert_reload_updated_counter =
|
||||
CERT_RELOAD_UPDATED_COUNTER.with_label_values(&[resolver_name]);
|
||||
let cert_reload_failed_counter =
|
||||
CERT_RELOAD_FAILED_COUNTER.with_label_values(&[resolver_name]);
|
||||
|
||||
let parsed_key = load_and_parse_certified_key(key_filename, cert_filename).await?;
|
||||
|
||||
let this = Arc::new(Self {
|
||||
certified_key: ArcSwap::from_pointee(
|
||||
load_certified_key(key_filename, cert_filename).await?,
|
||||
),
|
||||
certified_key: ArcSwap::from_pointee(parsed_key.certified_key),
|
||||
});
|
||||
cert_expiration_time.set(parsed_key.expiration_time.as_secs());
|
||||
|
||||
tokio::spawn({
|
||||
let weak_this = Arc::downgrade(&this);
|
||||
@@ -88,17 +173,22 @@ impl ReloadingCertificateResolver {
|
||||
Some(this) => this,
|
||||
None => break, // Resolver has been destroyed, exit.
|
||||
};
|
||||
match load_certified_key(&key_filename, &cert_filename).await {
|
||||
Ok(new_certified_key) => {
|
||||
if new_certified_key.cert == this.certified_key.load().cert {
|
||||
cert_reload_started_counter.inc();
|
||||
|
||||
match load_and_parse_certified_key(&key_filename, &cert_filename).await {
|
||||
Ok(parsed_key) => {
|
||||
if parsed_key.certified_key.cert == this.certified_key.load().cert {
|
||||
tracing::debug!("Certificate has not changed since last reloading");
|
||||
} else {
|
||||
tracing::info!("Certificate has been reloaded");
|
||||
this.certified_key.store(Arc::new(new_certified_key));
|
||||
this.certified_key.store(Arc::new(parsed_key.certified_key));
|
||||
cert_expiration_time.set(parsed_key.expiration_time.as_secs());
|
||||
cert_reload_updated_counter.inc();
|
||||
}
|
||||
last_reload_failed = false;
|
||||
}
|
||||
Err(err) => {
|
||||
cert_reload_failed_counter.inc();
|
||||
// Note: Reloading certs may fail if it conflicts with the script updating
|
||||
// the files at the same time. Warn only if the error is persistent.
|
||||
if last_reload_failed {
|
||||
|
||||
@@ -180,6 +180,7 @@ pub struct ConfigToml {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub generate_unarchival_heatmap: Option<bool>,
|
||||
pub tracing: Option<Tracing>,
|
||||
pub enable_tls_page_service_api: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
@@ -631,6 +632,7 @@ impl Default for ConfigToml {
|
||||
load_previous_heatmap: None,
|
||||
generate_unarchival_heatmap: None,
|
||||
tracing: None,
|
||||
enable_tls_page_service_api: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,7 +7,8 @@ use std::time::{Duration, Instant};
|
||||
/// API (`/control/v1` prefix). Implemented by the server
|
||||
/// in [`storage_controller::http`]
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::{NodeId, TenantId};
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::models::{PageserverUtilization, ShardParameters, TenantConfig};
|
||||
use crate::shard::{ShardStripeSize, TenantShardId};
|
||||
@@ -499,6 +500,15 @@ pub struct SafekeeperSchedulingPolicyRequest {
|
||||
pub scheduling_policy: SkSchedulingPolicy,
|
||||
}
|
||||
|
||||
/// Import request for safekeeper timelines.
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct TimelineImportRequest {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub start_lsn: Lsn,
|
||||
pub sk_set: Vec<NodeId>,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use serde_json;
|
||||
|
||||
@@ -927,7 +927,7 @@ impl Key {
|
||||
|
||||
/// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`.
|
||||
#[inline(always)]
|
||||
pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> {
|
||||
pub fn to_rel_block(self) -> Result<(RelTag, BlockNumber), ToRelBlockError> {
|
||||
Ok(match self.field1 {
|
||||
0x00 => (
|
||||
RelTag {
|
||||
@@ -938,7 +938,7 @@ impl Key {
|
||||
},
|
||||
self.field6,
|
||||
),
|
||||
_ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1),
|
||||
_ => return Err(ToRelBlockError(self.field1)),
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -951,6 +951,17 @@ impl std::str::FromStr for Key {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct ToRelBlockError(u8);
|
||||
|
||||
impl fmt::Display for ToRelBlockError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "unexpected value kind 0x{:02x}", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::error::Error for ToRelBlockError {}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::str::FromStr;
|
||||
|
||||
@@ -452,6 +452,24 @@ fn start_pageserver(
|
||||
info!("Using auth for http API: {:#?}", conf.http_auth_type);
|
||||
info!("Using auth for pg connections: {:#?}", conf.pg_auth_type);
|
||||
|
||||
let tls_server_config = if conf.listen_https_addr.is_some() || conf.enable_tls_page_service_api
|
||||
{
|
||||
let resolver = BACKGROUND_RUNTIME.block_on(ReloadingCertificateResolver::new(
|
||||
"main",
|
||||
&conf.ssl_key_file,
|
||||
&conf.ssl_cert_file,
|
||||
conf.ssl_cert_reload_period,
|
||||
))?;
|
||||
|
||||
let server_config = rustls::ServerConfig::builder()
|
||||
.with_no_client_auth()
|
||||
.with_cert_resolver(resolver);
|
||||
|
||||
Some(Arc::new(server_config))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
match var("NEON_AUTH_TOKEN") {
|
||||
Ok(v) => {
|
||||
info!("Loaded JWT token for authentication with Safekeeper");
|
||||
@@ -670,17 +688,11 @@ fn start_pageserver(
|
||||
|
||||
let https_task = match https_listener {
|
||||
Some(https_listener) => {
|
||||
let resolver = MGMT_REQUEST_RUNTIME.block_on(ReloadingCertificateResolver::new(
|
||||
&conf.ssl_key_file,
|
||||
&conf.ssl_cert_file,
|
||||
conf.ssl_cert_reload_period,
|
||||
))?;
|
||||
let tls_server_config = tls_server_config
|
||||
.clone()
|
||||
.expect("tls_server_config is set earlier if https is enabled");
|
||||
|
||||
let server_config = rustls::ServerConfig::builder()
|
||||
.with_no_client_auth()
|
||||
.with_cert_resolver(resolver);
|
||||
|
||||
let tls_acceptor = tokio_rustls::TlsAcceptor::from(Arc::new(server_config));
|
||||
let tls_acceptor = tokio_rustls::TlsAcceptor::from(tls_server_config);
|
||||
|
||||
let server =
|
||||
http_utils::server::Server::new(service, https_listener, Some(tls_acceptor))?;
|
||||
@@ -736,6 +748,11 @@ fn start_pageserver(
|
||||
tokio::net::TcpListener::from_std(pageserver_listener)
|
||||
.context("create tokio listener")?
|
||||
},
|
||||
if conf.enable_tls_page_service_api {
|
||||
tls_server_config
|
||||
} else {
|
||||
None
|
||||
},
|
||||
);
|
||||
|
||||
// All started up! Now just sit and wait for shutdown signal.
|
||||
|
||||
@@ -219,6 +219,11 @@ pub struct PageServerConf {
|
||||
pub generate_unarchival_heatmap: bool,
|
||||
|
||||
pub tracing: Option<pageserver_api::config::Tracing>,
|
||||
|
||||
/// Enable TLS in page service API.
|
||||
/// Does not force TLS: the client negotiates TLS usage during the handshake.
|
||||
/// Uses key and certificate from ssl_key_file/ssl_cert_file.
|
||||
pub enable_tls_page_service_api: bool,
|
||||
}
|
||||
|
||||
/// Token for authentication to safekeepers
|
||||
@@ -391,6 +396,7 @@ impl PageServerConf {
|
||||
load_previous_heatmap,
|
||||
generate_unarchival_heatmap,
|
||||
tracing,
|
||||
enable_tls_page_service_api,
|
||||
} = config_toml;
|
||||
|
||||
let mut conf = PageServerConf {
|
||||
@@ -441,6 +447,7 @@ impl PageServerConf {
|
||||
page_service_pipelining,
|
||||
get_vectored_concurrent_io,
|
||||
tracing,
|
||||
enable_tls_page_service_api,
|
||||
|
||||
// ------------------------------------------------------------
|
||||
// fields that require additional validation or custom handling
|
||||
|
||||
@@ -3253,7 +3253,7 @@ async fn ingest_aux_files(
|
||||
modification
|
||||
.put_file(&fname, content.as_bytes(), &ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
}
|
||||
modification
|
||||
.commit(&ctx)
|
||||
|
||||
@@ -27,7 +27,7 @@ use crate::context::RequestContext;
|
||||
use crate::metrics::WAL_INGEST;
|
||||
use crate::pgdatadir_mapping::*;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::walingest::WalIngest;
|
||||
use crate::walingest::{WalIngest, WalIngestErrorKind};
|
||||
|
||||
// Returns checkpoint LSN from controlfile
|
||||
pub fn get_lsn_from_controlfile(path: &Utf8Path) -> Result<Lsn> {
|
||||
@@ -157,9 +157,9 @@ async fn import_rel(
|
||||
.put_rel_creation(rel, nblocks as u32, ctx)
|
||||
.await
|
||||
{
|
||||
match e {
|
||||
RelationError::AlreadyExists => {
|
||||
debug!("Relation {} already exist. We must be extending it.", rel)
|
||||
match e.kind {
|
||||
WalIngestErrorKind::RelationAlreadyExists(rel) => {
|
||||
debug!("Relation {rel} already exists. We must be extending it.")
|
||||
}
|
||||
_ => return Err(e.into()),
|
||||
}
|
||||
|
||||
@@ -105,6 +105,7 @@ pub fn spawn(
|
||||
pg_auth: Option<Arc<SwappableJwtAuth>>,
|
||||
perf_trace_dispatch: Option<Dispatch>,
|
||||
tcp_listener: tokio::net::TcpListener,
|
||||
tls_config: Option<Arc<rustls::ServerConfig>>,
|
||||
) -> Listener {
|
||||
let cancel = CancellationToken::new();
|
||||
let libpq_ctx = RequestContext::todo_child(
|
||||
@@ -124,6 +125,7 @@ pub fn spawn(
|
||||
perf_trace_dispatch,
|
||||
tcp_listener,
|
||||
conf.pg_auth_type,
|
||||
tls_config,
|
||||
conf.page_service_pipelining.clone(),
|
||||
libpq_ctx,
|
||||
cancel.clone(),
|
||||
@@ -181,6 +183,7 @@ pub async fn libpq_listener_main(
|
||||
perf_trace_dispatch: Option<Dispatch>,
|
||||
listener: tokio::net::TcpListener,
|
||||
auth_type: AuthType,
|
||||
tls_config: Option<Arc<rustls::ServerConfig>>,
|
||||
pipelining_config: PageServicePipeliningConfig,
|
||||
listener_ctx: RequestContext,
|
||||
listener_cancel: CancellationToken,
|
||||
@@ -223,6 +226,7 @@ pub async fn libpq_listener_main(
|
||||
local_auth,
|
||||
socket,
|
||||
auth_type,
|
||||
tls_config.clone(),
|
||||
pipelining_config.clone(),
|
||||
connection_ctx,
|
||||
connections_cancel.child_token(),
|
||||
@@ -264,6 +268,7 @@ async fn page_service_conn_main(
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
socket: tokio::net::TcpStream,
|
||||
auth_type: AuthType,
|
||||
tls_config: Option<Arc<rustls::ServerConfig>>,
|
||||
pipelining_config: PageServicePipeliningConfig,
|
||||
connection_ctx: RequestContext,
|
||||
cancel: CancellationToken,
|
||||
@@ -334,7 +339,8 @@ async fn page_service_conn_main(
|
||||
cancel.clone(),
|
||||
gate_guard,
|
||||
);
|
||||
let pgbackend = PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, None)?;
|
||||
let pgbackend =
|
||||
PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, tls_config)?;
|
||||
|
||||
match pgbackend.run(&mut conn_handler, &cancel).await {
|
||||
Ok(()) => {
|
||||
|
||||
@@ -9,8 +9,9 @@
|
||||
use std::collections::{BTreeMap, HashMap, HashSet, hash_map};
|
||||
use std::ops::{ControlFlow, Range};
|
||||
|
||||
use crate::PERF_TRACE_TARGET;
|
||||
use anyhow::{Context, ensure};
|
||||
use crate::walingest::{WalIngestError, WalIngestErrorKind};
|
||||
use crate::{PERF_TRACE_TARGET, ensure_walingest};
|
||||
use anyhow::Context;
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use enum_map::Enum;
|
||||
use itertools::Itertools;
|
||||
@@ -136,12 +137,8 @@ impl From<PageReconstructError> for CalculateLogicalSizeError {
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum RelationError {
|
||||
#[error("Relation Already Exists")]
|
||||
AlreadyExists,
|
||||
#[error("invalid relnode")]
|
||||
InvalidRelnode,
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
///
|
||||
@@ -1478,8 +1475,8 @@ impl DatadirModification<'_> {
|
||||
}
|
||||
|
||||
/// Set the current lsn
|
||||
pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
|
||||
ensure!(
|
||||
pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> Result<(), WalIngestError> {
|
||||
ensure_walingest!(
|
||||
lsn >= self.lsn,
|
||||
"setting an older lsn {} than {} is not allowed",
|
||||
lsn,
|
||||
@@ -1578,7 +1575,7 @@ impl DatadirModification<'_> {
|
||||
&mut self,
|
||||
rel: RelTag,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<u32, PageReconstructError> {
|
||||
) -> Result<u32, WalIngestError> {
|
||||
// Get current size and put rel creation if rel doesn't exist
|
||||
//
|
||||
// NOTE: we check the cache first even though get_rel_exists and get_rel_size would
|
||||
@@ -1593,14 +1590,13 @@ impl DatadirModification<'_> {
|
||||
.await?
|
||||
{
|
||||
// create it with 0 size initially, the logic below will extend it
|
||||
self.put_rel_creation(rel, 0, ctx)
|
||||
.await
|
||||
.context("Relation Error")?;
|
||||
self.put_rel_creation(rel, 0, ctx).await?;
|
||||
Ok(0)
|
||||
} else {
|
||||
self.tline
|
||||
Ok(self
|
||||
.tline
|
||||
.get_rel_size(rel, Version::Modified(self), ctx)
|
||||
.await
|
||||
.await?)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1637,11 +1633,14 @@ impl DatadirModification<'_> {
|
||||
// TODO(vlad): remove this argument and replace the shard check with is_key_local
|
||||
shard: &ShardIdentity,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let mut gaps_at_lsns = Vec::default();
|
||||
|
||||
for meta in batch.metadata.iter() {
|
||||
let (rel, blkno) = Key::from_compact(meta.key()).to_rel_block()?;
|
||||
let key = Key::from_compact(meta.key());
|
||||
let (rel, blkno) = key
|
||||
.to_rel_block()
|
||||
.map_err(|_| WalIngestErrorKind::InvalidKey(key, meta.lsn()))?;
|
||||
let new_nblocks = blkno + 1;
|
||||
|
||||
let old_nblocks = self.create_relation_if_required(rel, ctx).await?;
|
||||
@@ -1683,8 +1682,8 @@ impl DatadirModification<'_> {
|
||||
rel: RelTag,
|
||||
blknum: BlockNumber,
|
||||
rec: NeonWalRecord,
|
||||
) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
) -> Result<(), WalIngestError> {
|
||||
ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec));
|
||||
Ok(())
|
||||
}
|
||||
@@ -1696,7 +1695,7 @@ impl DatadirModification<'_> {
|
||||
segno: u32,
|
||||
blknum: BlockNumber,
|
||||
rec: NeonWalRecord,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
if !self.tline.tenant_shard_id.is_shard_zero() {
|
||||
return Ok(());
|
||||
}
|
||||
@@ -1714,14 +1713,11 @@ impl DatadirModification<'_> {
|
||||
rel: RelTag,
|
||||
blknum: BlockNumber,
|
||||
img: Bytes,
|
||||
) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
) -> Result<(), WalIngestError> {
|
||||
ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
let key = rel_block_to_key(rel, blknum);
|
||||
if !key.is_valid_key_on_write_path() {
|
||||
anyhow::bail!(
|
||||
"the request contains data not supported by pageserver at {}",
|
||||
key
|
||||
);
|
||||
Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
|
||||
}
|
||||
self.put(rel_block_to_key(rel, blknum), Value::Image(img));
|
||||
Ok(())
|
||||
@@ -1733,15 +1729,12 @@ impl DatadirModification<'_> {
|
||||
segno: u32,
|
||||
blknum: BlockNumber,
|
||||
img: Bytes,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
assert!(self.tline.tenant_shard_id.is_shard_zero());
|
||||
|
||||
let key = slru_block_to_key(kind, segno, blknum);
|
||||
if !key.is_valid_key_on_write_path() {
|
||||
anyhow::bail!(
|
||||
"the request contains data not supported by pageserver at {}",
|
||||
key
|
||||
);
|
||||
Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
|
||||
}
|
||||
self.put(key, Value::Image(img));
|
||||
Ok(())
|
||||
@@ -1751,15 +1744,11 @@ impl DatadirModification<'_> {
|
||||
&mut self,
|
||||
rel: RelTag,
|
||||
blknum: BlockNumber,
|
||||
) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
) -> Result<(), WalIngestError> {
|
||||
ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
let key = rel_block_to_key(rel, blknum);
|
||||
if !key.is_valid_key_on_write_path() {
|
||||
anyhow::bail!(
|
||||
"the request contains data not supported by pageserver: {} @ {}",
|
||||
key,
|
||||
self.lsn
|
||||
);
|
||||
Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
|
||||
}
|
||||
|
||||
let batch = self
|
||||
@@ -1776,15 +1765,11 @@ impl DatadirModification<'_> {
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
blknum: BlockNumber,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
assert!(self.tline.tenant_shard_id.is_shard_zero());
|
||||
let key = slru_block_to_key(kind, segno, blknum);
|
||||
if !key.is_valid_key_on_write_path() {
|
||||
anyhow::bail!(
|
||||
"the request contains data not supported by pageserver: {} @ {}",
|
||||
key,
|
||||
self.lsn
|
||||
);
|
||||
Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
|
||||
}
|
||||
|
||||
let batch = self
|
||||
@@ -1832,8 +1817,10 @@ impl DatadirModification<'_> {
|
||||
dbnode: Oid,
|
||||
img: Bytes,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let v2_enabled = self.maybe_enable_rel_size_v2()?;
|
||||
) -> Result<(), WalIngestError> {
|
||||
let v2_enabled = self
|
||||
.maybe_enable_rel_size_v2()
|
||||
.map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;
|
||||
|
||||
// Add it to the directory (if it doesn't exist already)
|
||||
let buf = self.get(DBDIR_KEY, ctx).await?;
|
||||
@@ -1874,13 +1861,13 @@ impl DatadirModification<'_> {
|
||||
xid: u64,
|
||||
img: Bytes,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
// Add it to the directory entry
|
||||
let dirbuf = self.get(TWOPHASEDIR_KEY, ctx).await?;
|
||||
let newdirbuf = if self.tline.pg_version >= 17 {
|
||||
let mut dir = TwoPhaseDirectoryV17::des(&dirbuf)?;
|
||||
if !dir.xids.insert(xid) {
|
||||
anyhow::bail!("twophase file for xid {} already exists", xid);
|
||||
Err(WalIngestErrorKind::FileAlreadyExists(xid))?;
|
||||
}
|
||||
self.pending_directory_entries.push((
|
||||
DirectoryKind::TwoPhase,
|
||||
@@ -1891,7 +1878,7 @@ impl DatadirModification<'_> {
|
||||
let xid = xid as u32;
|
||||
let mut dir = TwoPhaseDirectory::des(&dirbuf)?;
|
||||
if !dir.xids.insert(xid) {
|
||||
anyhow::bail!("twophase file for xid {} already exists", xid);
|
||||
Err(WalIngestErrorKind::FileAlreadyExists(xid.into()))?;
|
||||
}
|
||||
self.pending_directory_entries.push((
|
||||
DirectoryKind::TwoPhase,
|
||||
@@ -1909,22 +1896,22 @@ impl DatadirModification<'_> {
|
||||
&mut self,
|
||||
origin_id: RepOriginId,
|
||||
origin_lsn: Lsn,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let key = repl_origin_key(origin_id);
|
||||
self.put(key, Value::Image(origin_lsn.ser().unwrap().into()));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> anyhow::Result<()> {
|
||||
pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> Result<(), WalIngestError> {
|
||||
self.set_replorigin(origin_id, Lsn::INVALID).await
|
||||
}
|
||||
|
||||
pub fn put_control_file(&mut self, img: Bytes) -> anyhow::Result<()> {
|
||||
pub fn put_control_file(&mut self, img: Bytes) -> Result<(), WalIngestError> {
|
||||
self.put(CONTROLFILE_KEY, Value::Image(img));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn put_checkpoint(&mut self, img: Bytes) -> anyhow::Result<()> {
|
||||
pub fn put_checkpoint(&mut self, img: Bytes) -> Result<(), WalIngestError> {
|
||||
self.put(CHECKPOINT_KEY, Value::Image(img));
|
||||
Ok(())
|
||||
}
|
||||
@@ -1934,7 +1921,7 @@ impl DatadirModification<'_> {
|
||||
spcnode: Oid,
|
||||
dbnode: Oid,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let total_blocks = self
|
||||
.tline
|
||||
.get_db_size(spcnode, dbnode, Version::Modified(self), ctx)
|
||||
@@ -1973,20 +1960,21 @@ impl DatadirModification<'_> {
|
||||
rel: RelTag,
|
||||
nblocks: BlockNumber,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), RelationError> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
if rel.relnode == 0 {
|
||||
return Err(RelationError::InvalidRelnode);
|
||||
Err(WalIngestErrorKind::LogicalError(anyhow::anyhow!(
|
||||
"invalid relnode"
|
||||
)))?;
|
||||
}
|
||||
// It's possible that this is the first rel for this db in this
|
||||
// tablespace. Create the reldir entry for it if so.
|
||||
let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?)
|
||||
.context("deserialize db")?;
|
||||
let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?;
|
||||
|
||||
let dbdir_exists =
|
||||
if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) {
|
||||
// Didn't exist. Update dbdir
|
||||
e.insert(false);
|
||||
let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
|
||||
let buf = DbDirectory::ser(&dbdir)?;
|
||||
self.pending_directory_entries.push((
|
||||
DirectoryKind::Db,
|
||||
MetricsUpdate::Set(dbdir.dbdirs.len() as u64),
|
||||
@@ -2003,27 +1991,25 @@ impl DatadirModification<'_> {
|
||||
RelDirectory::default()
|
||||
} else {
|
||||
// reldir already exists, fetch it
|
||||
RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
|
||||
.context("deserialize db")?
|
||||
RelDirectory::des(&self.get(rel_dir_key, ctx).await?)?
|
||||
};
|
||||
|
||||
let v2_enabled = self.maybe_enable_rel_size_v2()?;
|
||||
let v2_enabled = self
|
||||
.maybe_enable_rel_size_v2()
|
||||
.map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;
|
||||
|
||||
if v2_enabled {
|
||||
if rel_dir.rels.contains(&(rel.relnode, rel.forknum)) {
|
||||
return Err(RelationError::AlreadyExists);
|
||||
Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
|
||||
}
|
||||
let sparse_rel_dir_key =
|
||||
rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum);
|
||||
// check if the rel_dir_key exists in v2
|
||||
let val = self
|
||||
.sparse_get(sparse_rel_dir_key, ctx)
|
||||
.await
|
||||
.map_err(|e| RelationError::Other(e.into()))?;
|
||||
let val = self.sparse_get(sparse_rel_dir_key, ctx).await?;
|
||||
let val = RelDirExists::decode_option(val)
|
||||
.map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?;
|
||||
.map_err(|_| WalIngestErrorKind::InvalidRelDirKey(sparse_rel_dir_key))?;
|
||||
if val == RelDirExists::Exists {
|
||||
return Err(RelationError::AlreadyExists);
|
||||
Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
|
||||
}
|
||||
self.put(
|
||||
sparse_rel_dir_key,
|
||||
@@ -2039,9 +2025,7 @@ impl DatadirModification<'_> {
|
||||
// will be key not found errors if we don't create an empty one for rel_size_v2.
|
||||
self.put(
|
||||
rel_dir_key,
|
||||
Value::Image(Bytes::from(
|
||||
RelDirectory::ser(&RelDirectory::default()).context("serialize")?,
|
||||
)),
|
||||
Value::Image(Bytes::from(RelDirectory::ser(&RelDirectory::default())?)),
|
||||
);
|
||||
}
|
||||
self.pending_directory_entries
|
||||
@@ -2049,7 +2033,7 @@ impl DatadirModification<'_> {
|
||||
} else {
|
||||
// Add the new relation to the rel directory entry, and write it back
|
||||
if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
|
||||
return Err(RelationError::AlreadyExists);
|
||||
Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
|
||||
}
|
||||
if !dbdir_exists {
|
||||
self.pending_directory_entries
|
||||
@@ -2059,9 +2043,7 @@ impl DatadirModification<'_> {
|
||||
.push((DirectoryKind::Rel, MetricsUpdate::Add(1)));
|
||||
self.put(
|
||||
rel_dir_key,
|
||||
Value::Image(Bytes::from(
|
||||
RelDirectory::ser(&rel_dir).context("serialize")?,
|
||||
)),
|
||||
Value::Image(Bytes::from(RelDirectory::ser(&rel_dir)?)),
|
||||
);
|
||||
}
|
||||
|
||||
@@ -2086,8 +2068,8 @@ impl DatadirModification<'_> {
|
||||
rel: RelTag,
|
||||
nblocks: BlockNumber,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
) -> Result<(), WalIngestError> {
|
||||
ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
if self
|
||||
.tline
|
||||
.get_rel_exists(rel, Version::Modified(self), ctx)
|
||||
@@ -2117,8 +2099,8 @@ impl DatadirModification<'_> {
|
||||
rel: RelTag,
|
||||
nblocks: BlockNumber,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
) -> Result<(), WalIngestError> {
|
||||
ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
|
||||
// Put size
|
||||
let size_key = rel_size_to_key(rel);
|
||||
@@ -2142,8 +2124,10 @@ impl DatadirModification<'_> {
|
||||
&mut self,
|
||||
drop_relations: HashMap<(u32, u32), Vec<RelTag>>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let v2_enabled = self.maybe_enable_rel_size_v2()?;
|
||||
) -> Result<(), WalIngestError> {
|
||||
let v2_enabled = self
|
||||
.maybe_enable_rel_size_v2()
|
||||
.map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;
|
||||
for ((spc_node, db_node), rel_tags) in drop_relations {
|
||||
let dir_key = rel_dir_to_key(spc_node, db_node);
|
||||
let buf = self.get(dir_key, ctx).await?;
|
||||
@@ -2163,7 +2147,7 @@ impl DatadirModification<'_> {
|
||||
let key =
|
||||
rel_tag_sparse_key(spc_node, db_node, rel_tag.relnode, rel_tag.forknum);
|
||||
let val = RelDirExists::decode_option(self.sparse_get(key, ctx).await?)
|
||||
.map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?;
|
||||
.map_err(|_| WalIngestErrorKind::InvalidKey(key, self.lsn))?;
|
||||
if val == RelDirExists::Exists {
|
||||
self.pending_directory_entries
|
||||
.push((DirectoryKind::RelV2, MetricsUpdate::Sub(1)));
|
||||
@@ -2206,7 +2190,7 @@ impl DatadirModification<'_> {
|
||||
segno: u32,
|
||||
nblocks: BlockNumber,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
assert!(self.tline.tenant_shard_id.is_shard_zero());
|
||||
|
||||
// Add it to the directory entry
|
||||
@@ -2215,7 +2199,7 @@ impl DatadirModification<'_> {
|
||||
let mut dir = SlruSegmentDirectory::des(&buf)?;
|
||||
|
||||
if !dir.segments.insert(segno) {
|
||||
anyhow::bail!("slru segment {kind:?}/{segno} already exists");
|
||||
Err(WalIngestErrorKind::SlruAlreadyExists(kind, segno))?;
|
||||
}
|
||||
self.pending_directory_entries.push((
|
||||
DirectoryKind::SlruSegment(kind),
|
||||
@@ -2242,7 +2226,7 @@ impl DatadirModification<'_> {
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
nblocks: BlockNumber,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
assert!(self.tline.tenant_shard_id.is_shard_zero());
|
||||
|
||||
// Put size
|
||||
@@ -2258,7 +2242,7 @@ impl DatadirModification<'_> {
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
// Remove it from the directory entry
|
||||
let dir_key = slru_dir_to_key(kind);
|
||||
let buf = self.get(dir_key, ctx).await?;
|
||||
@@ -2283,7 +2267,7 @@ impl DatadirModification<'_> {
|
||||
}
|
||||
|
||||
/// Drop a relmapper file (pg_filenode.map)
|
||||
pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> anyhow::Result<()> {
|
||||
pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> Result<(), WalIngestError> {
|
||||
// TODO
|
||||
Ok(())
|
||||
}
|
||||
@@ -2293,7 +2277,7 @@ impl DatadirModification<'_> {
|
||||
&mut self,
|
||||
xid: u64,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
// Remove it from the directory entry
|
||||
let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
|
||||
let newdirbuf = if self.tline.pg_version >= 17 {
|
||||
@@ -2308,7 +2292,8 @@ impl DatadirModification<'_> {
|
||||
));
|
||||
Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?)
|
||||
} else {
|
||||
let xid: u32 = u32::try_from(xid)?;
|
||||
let xid: u32 = u32::try_from(xid)
|
||||
.map_err(|e| WalIngestErrorKind::LogicalError(anyhow::Error::from(e)))?;
|
||||
let mut dir = TwoPhaseDirectory::des(&buf)?;
|
||||
|
||||
if !dir.xids.remove(&xid) {
|
||||
@@ -2333,7 +2318,7 @@ impl DatadirModification<'_> {
|
||||
path: &str,
|
||||
content: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let key = aux_file::encode_aux_file_key(path);
|
||||
// retrieve the key from the engine
|
||||
let old_val = match self.get(key, ctx).await {
|
||||
@@ -2342,7 +2327,7 @@ impl DatadirModification<'_> {
|
||||
Err(e) => return Err(e.into()),
|
||||
};
|
||||
let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val {
|
||||
aux_file::decode_file_value(old_val)?
|
||||
aux_file::decode_file_value(old_val).map_err(WalIngestErrorKind::EncodeAuxFileError)?
|
||||
} else {
|
||||
Vec::new()
|
||||
};
|
||||
@@ -2387,7 +2372,8 @@ impl DatadirModification<'_> {
|
||||
}
|
||||
(None, true) => warn!("removing non-existing aux file: {}", path),
|
||||
}
|
||||
let new_val = aux_file::encode_file_value(&new_files)?;
|
||||
let new_val = aux_file::encode_file_value(&new_files)
|
||||
.map_err(WalIngestErrorKind::EncodeAuxFileError)?;
|
||||
self.put(key, Value::Image(new_val.into()));
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -11571,6 +11571,99 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_bottom_most_compation_redo_failure() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_bottom_most_compation_redo_failure").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
// using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
|
||||
let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
|
||||
key.field6 = id;
|
||||
key
|
||||
}
|
||||
|
||||
let img_layer = (0..10)
|
||||
.map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
|
||||
.collect_vec();
|
||||
|
||||
let delta1 = vec![
|
||||
(
|
||||
get_key(1),
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
|
||||
),
|
||||
(
|
||||
get_key(1),
|
||||
Lsn(0x24),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x24")),
|
||||
),
|
||||
(
|
||||
get_key(1),
|
||||
Lsn(0x28),
|
||||
// This record will fail to redo
|
||||
Value::WalRecord(NeonWalRecord::wal_append_conditional("@0x28", "???")),
|
||||
),
|
||||
];
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline_with_layers(
|
||||
TIMELINE_ID,
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
vec![], // in-memory layers
|
||||
vec![DeltaLayerTestDesc::new_with_inferred_key_range(
|
||||
Lsn(0x20)..Lsn(0x30),
|
||||
delta1,
|
||||
)], // delta layers
|
||||
vec![(Lsn(0x10), img_layer)], // image layers
|
||||
Lsn(0x50),
|
||||
)
|
||||
.await?;
|
||||
{
|
||||
tline
|
||||
.applied_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x30))
|
||||
.wait()
|
||||
.await;
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
*guard = GcInfo {
|
||||
retain_lsns: vec![],
|
||||
cutoffs: GcCutoffs {
|
||||
time: Lsn(0x30),
|
||||
space: Lsn(0x30),
|
||||
},
|
||||
leases: Default::default(),
|
||||
within_ancestor_pitr: false,
|
||||
};
|
||||
}
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
// Compaction will fail, but should not fire any critical error.
|
||||
// Gc-compaction currently cannot figure out what keys are not in the keyspace during the compaction
|
||||
// process. It will always try to redo the logs it reads and if it doesn't work, fail the entire
|
||||
// compaction job. Tracked in <https://github.com/neondatabase/neon/issues/10395>.
|
||||
let res = tline
|
||||
.compact_with_gc(
|
||||
&cancel,
|
||||
CompactOptions {
|
||||
compact_key_range: None,
|
||||
compact_lsn_range: None,
|
||||
..Default::default()
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
.await;
|
||||
assert!(res.is_err());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_synthetic_size_calculation_with_invisible_branches() -> anyhow::Result<()> {
|
||||
|
||||
@@ -1269,7 +1269,7 @@ mod test {
|
||||
|
||||
// This key range contains several 0x8000 page stripes, only one of which belongs to shard zero
|
||||
let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
|
||||
let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
|
||||
let input_end = Key::from_hex("000000067f00000001000000ae0000002000").unwrap();
|
||||
let range = input_start..input_end;
|
||||
|
||||
// Build an image layer to filter
|
||||
@@ -1314,7 +1314,7 @@ mod test {
|
||||
let shard_identity = ShardIdentity::new(
|
||||
ShardNumber(shard_number),
|
||||
shard_count,
|
||||
ShardStripeSize(0x8000),
|
||||
ShardStripeSize(0x800),
|
||||
)
|
||||
.unwrap();
|
||||
let harness = TenantHarness::create_custom(
|
||||
@@ -1368,12 +1368,12 @@ mod test {
|
||||
|
||||
// This exact size and those below will need updating as/when the layer encoding changes, but
|
||||
// should be deterministic for a given version of the format, as we used no randomness generating the input.
|
||||
assert_eq!(original_size, 1597440);
|
||||
assert_eq!(original_size, 122880);
|
||||
|
||||
match shard_number {
|
||||
0 => {
|
||||
// We should have written out just one stripe for our shard identity
|
||||
assert_eq!(wrote_keys, 0x8000);
|
||||
assert_eq!(wrote_keys, 0x800);
|
||||
let replacement = replacement.unwrap();
|
||||
|
||||
// We should have dropped some of the data
|
||||
@@ -1381,7 +1381,7 @@ mod test {
|
||||
assert!(replacement.metadata().file_size > 0);
|
||||
|
||||
// Assert that we dropped ~3/4 of the data.
|
||||
assert_eq!(replacement.metadata().file_size, 417792);
|
||||
assert_eq!(replacement.metadata().file_size, 49152);
|
||||
}
|
||||
1 => {
|
||||
// Shard 1 has no keys in our input range
|
||||
@@ -1390,19 +1390,19 @@ mod test {
|
||||
}
|
||||
2 => {
|
||||
// Shard 2 has one stripes in the input range
|
||||
assert_eq!(wrote_keys, 0x8000);
|
||||
assert_eq!(wrote_keys, 0x800);
|
||||
let replacement = replacement.unwrap();
|
||||
assert!(replacement.metadata().file_size < original_size);
|
||||
assert!(replacement.metadata().file_size > 0);
|
||||
assert_eq!(replacement.metadata().file_size, 417792);
|
||||
assert_eq!(replacement.metadata().file_size, 49152);
|
||||
}
|
||||
3 => {
|
||||
// Shard 3 has two stripes in the input range
|
||||
assert_eq!(wrote_keys, 0x10000);
|
||||
assert_eq!(wrote_keys, 0x1000);
|
||||
let replacement = replacement.unwrap();
|
||||
assert!(replacement.metadata().file_size < original_size);
|
||||
assert!(replacement.metadata().file_size > 0);
|
||||
assert_eq!(replacement.metadata().file_size, 811008);
|
||||
assert_eq!(replacement.metadata().file_size, 73728);
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
|
||||
@@ -1141,6 +1141,7 @@ impl Timeline {
|
||||
) -> Result<(), CompactionError> {
|
||||
let mut drop_layers = Vec::new();
|
||||
let mut layers_to_rewrite: Vec<Layer> = Vec::new();
|
||||
let mut rewrite_max_exceeded: bool = false;
|
||||
|
||||
// We will use the Lsn cutoff of the last GC as a threshold for rewriting layers: if a
|
||||
// layer is behind this Lsn, it indicates that the layer is being retained beyond the
|
||||
@@ -1149,12 +1150,7 @@ impl Timeline {
|
||||
// Holding this read guard also blocks [`Self::gc_timeline`] from entering while we
|
||||
// are rewriting layers.
|
||||
let latest_gc_cutoff = self.get_applied_gc_cutoff_lsn();
|
||||
|
||||
tracing::info!(
|
||||
"starting shard ancestor compaction, latest_gc_cutoff: {}, pitr cutoff {}",
|
||||
*latest_gc_cutoff,
|
||||
self.gc_info.read().unwrap().cutoffs.time
|
||||
);
|
||||
let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.time;
|
||||
|
||||
let layers = self.layers.read().await;
|
||||
for layer_desc in layers.layer_map()?.iter_historic_layers() {
|
||||
@@ -1172,8 +1168,8 @@ impl Timeline {
|
||||
// This ancestral layer only covers keys that belong to other shards.
|
||||
// We include the full metadata in the log: if we had some critical bug that caused
|
||||
// us to incorrectly drop layers, this would simplify manually debugging + reinstating those layers.
|
||||
info!(%layer, old_metadata=?layer.metadata(),
|
||||
"dropping layer after shard split, contains no keys for this shard.",
|
||||
debug!(%layer, old_metadata=?layer.metadata(),
|
||||
"dropping layer after shard split, contains no keys for this shard",
|
||||
);
|
||||
|
||||
if cfg!(debug_assertions) {
|
||||
@@ -1235,9 +1231,10 @@ impl Timeline {
|
||||
}
|
||||
|
||||
if layers_to_rewrite.len() >= rewrite_max {
|
||||
tracing::info!(%layer, "Will rewrite layer on a future compaction, already rewrote {}",
|
||||
debug!(%layer, "Will rewrite layer on a future compaction, already rewrote {}",
|
||||
layers_to_rewrite.len()
|
||||
);
|
||||
rewrite_max_exceeded = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -1245,9 +1242,24 @@ impl Timeline {
|
||||
layers_to_rewrite.push(layer);
|
||||
}
|
||||
|
||||
// Drop read lock on layer map before we start doing time-consuming I/O
|
||||
// Drop read lock on layer map before we start doing time-consuming I/O.
|
||||
drop(layers);
|
||||
|
||||
// Drop out early if there's nothing to do.
|
||||
if layers_to_rewrite.is_empty() && drop_layers.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
info!(
|
||||
"starting shard ancestor compaction, rewriting {} layers and dropping {} layers \
|
||||
(latest_gc_cutoff={} pitr_cutoff={})",
|
||||
layers_to_rewrite.len(),
|
||||
drop_layers.len(),
|
||||
*latest_gc_cutoff,
|
||||
pitr_cutoff,
|
||||
);
|
||||
let started = Instant::now();
|
||||
|
||||
let mut replace_image_layers = Vec::new();
|
||||
|
||||
for layer in layers_to_rewrite {
|
||||
@@ -1255,7 +1267,7 @@ impl Timeline {
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
}
|
||||
|
||||
tracing::info!(layer=%layer, "Rewriting layer after shard split...");
|
||||
info!(layer=%layer, "rewriting layer after shard split");
|
||||
let mut image_layer_writer = ImageLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
@@ -1295,7 +1307,7 @@ impl Timeline {
|
||||
.map_err(CompactionError::Other)?;
|
||||
let new_layer = Layer::finish_creating(self.conf, self, desc, &path)
|
||||
.map_err(CompactionError::Other)?;
|
||||
tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes",
|
||||
info!(layer=%new_layer, "rewrote layer, {} -> {} bytes",
|
||||
layer.metadata().file_size,
|
||||
new_layer.metadata().file_size);
|
||||
|
||||
@@ -1307,6 +1319,12 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
for layer in &drop_layers {
|
||||
info!(%layer, old_metadata=?layer.metadata(),
|
||||
"dropping layer after shard split (no keys for this shard)",
|
||||
);
|
||||
}
|
||||
|
||||
// At this point, we have replaced local layer files with their rewritten form, but not yet uploaded
|
||||
// metadata to reflect that. If we restart here, the replaced layer files will look invalid (size mismatch
|
||||
// to remote index) and be removed. This is inefficient but safe.
|
||||
@@ -1322,6 +1340,7 @@ impl Timeline {
|
||||
// necessary for correctness, but it simplifies testing, and avoids proceeding with another
|
||||
// Timeline's compaction while this timeline's uploads may be generating lots of disk I/O
|
||||
// load.
|
||||
info!("shard ancestor compaction waiting for uploads");
|
||||
match self.remote_client.wait_completion().await {
|
||||
Ok(()) => (),
|
||||
Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)),
|
||||
@@ -1330,6 +1349,15 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
info!(
|
||||
"shard ancestor compaction done in {:.3}s{}",
|
||||
started.elapsed().as_secs_f64(),
|
||||
match rewrite_max_exceeded {
|
||||
true => format!(", more work pending due to rewrite_max={rewrite_max}"),
|
||||
false => String::new(),
|
||||
}
|
||||
);
|
||||
|
||||
fail::fail_point!("compact-shard-ancestors-persistent");
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -580,6 +580,7 @@ impl ConnectionManagerState {
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
WalReceiverError::Cancelled => Ok(()),
|
||||
WalReceiverError::Other(e) => {
|
||||
// give out an error to have task_mgr give it a really verbose logging
|
||||
if cancellation.is_cancelled() {
|
||||
|
||||
@@ -73,6 +73,7 @@ pub(super) enum WalReceiverError {
|
||||
/// Generic error
|
||||
Other(anyhow::Error),
|
||||
ClosedGate,
|
||||
Cancelled,
|
||||
}
|
||||
|
||||
impl From<tokio_postgres::Error> for WalReceiverError {
|
||||
@@ -200,6 +201,9 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
// with a similar error.
|
||||
},
|
||||
WalReceiverError::SuccessfulCompletion(_) => {}
|
||||
WalReceiverError::Cancelled => {
|
||||
debug!("Connection cancelled")
|
||||
}
|
||||
WalReceiverError::ClosedGate => {
|
||||
// doesn't happen at runtime
|
||||
}
|
||||
@@ -273,7 +277,12 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
|
||||
let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version);
|
||||
|
||||
let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx).await?;
|
||||
let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx)
|
||||
.await
|
||||
.map_err(|e| match e.kind {
|
||||
crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled,
|
||||
_ => WalReceiverError::Other(e.into()),
|
||||
})?;
|
||||
|
||||
let shard = vec![*timeline.get_shard_identity()];
|
||||
|
||||
|
||||
@@ -21,13 +21,13 @@
|
||||
//! redo Postgres process, but some records it can handle directly with
|
||||
//! bespoken Rust code.
|
||||
|
||||
use std::backtrace::Backtrace;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::{Arc, OnceLock};
|
||||
use std::time::{Duration, Instant, SystemTime};
|
||||
|
||||
use anyhow::{Result, bail};
|
||||
use bytes::{Buf, Bytes};
|
||||
use pageserver_api::key::rel_block_to_key;
|
||||
use pageserver_api::key::{Key, rel_block_to_key};
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
@@ -38,7 +38,7 @@ use postgres_ffi::{
|
||||
fsm_logical_to_physical, pg_constants,
|
||||
};
|
||||
use tracing::*;
|
||||
use utils::bin_ser::SerializeError;
|
||||
use utils::bin_ser::{DeserializeError, SerializeError};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::rate_limit::RateLimit;
|
||||
use utils::{critical, failpoint_support};
|
||||
@@ -104,12 +104,101 @@ struct WarnIngestLag {
|
||||
timestamp_invalid_msg_ratelimit: RateLimit,
|
||||
}
|
||||
|
||||
pub struct WalIngestError {
|
||||
pub backtrace: std::backtrace::Backtrace,
|
||||
pub kind: WalIngestErrorKind,
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum WalIngestErrorKind {
|
||||
#[error(transparent)]
|
||||
#[allow(private_interfaces)]
|
||||
PageReconstructError(#[from] PageReconstructError),
|
||||
#[error(transparent)]
|
||||
DeserializationFailure(#[from] DeserializeError),
|
||||
#[error(transparent)]
|
||||
SerializationFailure(#[from] SerializeError),
|
||||
#[error("the request contains data not supported by pageserver: {0} @ {1}")]
|
||||
InvalidKey(Key, Lsn),
|
||||
#[error("twophase file for xid {0} already exists")]
|
||||
FileAlreadyExists(u64),
|
||||
#[error("slru segment {0:?}/{1} already exists")]
|
||||
SlruAlreadyExists(SlruKind, u32),
|
||||
#[error("relation already exists")]
|
||||
RelationAlreadyExists(RelTag),
|
||||
#[error("invalid reldir key {0}")]
|
||||
InvalidRelDirKey(Key),
|
||||
|
||||
#[error(transparent)]
|
||||
LogicalError(anyhow::Error),
|
||||
#[error(transparent)]
|
||||
EncodeAuxFileError(anyhow::Error),
|
||||
#[error(transparent)]
|
||||
MaybeRelSizeV2Error(anyhow::Error),
|
||||
|
||||
#[error("timeline shutting down")]
|
||||
Cancelled,
|
||||
}
|
||||
|
||||
impl<T> From<T> for WalIngestError
|
||||
where
|
||||
WalIngestErrorKind: From<T>,
|
||||
{
|
||||
fn from(value: T) -> Self {
|
||||
WalIngestError {
|
||||
backtrace: Backtrace::capture(),
|
||||
kind: WalIngestErrorKind::from(value),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::error::Error for WalIngestError {
|
||||
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
|
||||
self.kind.source()
|
||||
}
|
||||
}
|
||||
|
||||
impl core::fmt::Display for WalIngestError {
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
|
||||
self.kind.fmt(f)
|
||||
}
|
||||
}
|
||||
|
||||
impl core::fmt::Debug for WalIngestError {
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
|
||||
if f.alternate() {
|
||||
f.debug_map()
|
||||
.key(&"backtrace")
|
||||
.value(&self.backtrace)
|
||||
.key(&"kind")
|
||||
.value(&self.kind)
|
||||
.finish()
|
||||
} else {
|
||||
writeln!(f, "Error: {:?}", self.kind)?;
|
||||
if self.backtrace.status() == std::backtrace::BacktraceStatus::Captured {
|
||||
writeln!(f, "Stack backtrace: {:?}", self.backtrace)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[macro_export]
|
||||
macro_rules! ensure_walingest {
|
||||
($($t:tt)*) => {
|
||||
_ = || -> Result<(), anyhow::Error> {
|
||||
anyhow::ensure!($($t)*);
|
||||
Ok(())
|
||||
}().map_err(WalIngestErrorKind::LogicalError)?;
|
||||
};
|
||||
}
|
||||
|
||||
impl WalIngest {
|
||||
pub async fn new(
|
||||
timeline: &Timeline,
|
||||
startpoint: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<WalIngest> {
|
||||
) -> Result<WalIngest, WalIngestError> {
|
||||
// Fetch the latest checkpoint into memory, so that we can compare with it
|
||||
// quickly in `ingest_record` and update it when it changes.
|
||||
let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?;
|
||||
@@ -145,7 +234,7 @@ impl WalIngest {
|
||||
interpreted: InterpretedWalRecord,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<bool> {
|
||||
) -> Result<bool, WalIngestError> {
|
||||
WAL_INGEST.records_received.inc();
|
||||
let prev_len = modification.len();
|
||||
|
||||
@@ -288,7 +377,7 @@ impl WalIngest {
|
||||
}
|
||||
|
||||
/// This is the same as AdjustToFullTransactionId(xid) in PostgreSQL
|
||||
fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result<u64> {
|
||||
fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result<u64, WalIngestError> {
|
||||
let next_full_xid =
|
||||
enum_pgversion_dispatch!(&self.checkpoint, CheckPoint, cp, { cp.nextXid.value });
|
||||
|
||||
@@ -298,9 +387,9 @@ impl WalIngest {
|
||||
if xid > next_xid {
|
||||
// Wraparound occurred, must be from a prev epoch.
|
||||
if epoch == 0 {
|
||||
bail!(
|
||||
Err(WalIngestErrorKind::LogicalError(anyhow::anyhow!(
|
||||
"apparent XID wraparound with prepared transaction XID {xid}, nextXid is {next_full_xid}"
|
||||
);
|
||||
)))?;
|
||||
}
|
||||
epoch -= 1;
|
||||
}
|
||||
@@ -313,7 +402,7 @@ impl WalIngest {
|
||||
clear_vm_bits: ClearVmBits,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let ClearVmBits {
|
||||
new_heap_blkno,
|
||||
old_heap_blkno,
|
||||
@@ -402,7 +491,7 @@ impl WalIngest {
|
||||
create: DbaseCreate,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let DbaseCreate {
|
||||
db_id,
|
||||
tablespace_id,
|
||||
@@ -505,7 +594,7 @@ impl WalIngest {
|
||||
dbase_drop: DbaseDrop,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let DbaseDrop {
|
||||
db_id,
|
||||
tablespace_ids,
|
||||
@@ -523,7 +612,7 @@ impl WalIngest {
|
||||
create: SmgrCreate,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let SmgrCreate { rel } = create;
|
||||
self.put_rel_creation(modification, rel, ctx).await?;
|
||||
Ok(())
|
||||
@@ -537,7 +626,7 @@ impl WalIngest {
|
||||
truncate: XlSmgrTruncate,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let XlSmgrTruncate {
|
||||
blkno,
|
||||
rnode,
|
||||
@@ -689,7 +778,7 @@ impl WalIngest {
|
||||
record: XactRecord,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let (xact_common, is_commit, is_prepared) = match record {
|
||||
XactRecord::Prepare(XactPrepare { xl_xid, data }) => {
|
||||
let xid: u64 = if modification.tline.pg_version >= 17 {
|
||||
@@ -813,7 +902,7 @@ impl WalIngest {
|
||||
truncate: ClogTruncate,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let ClogTruncate {
|
||||
pageno,
|
||||
oldest_xid,
|
||||
@@ -889,7 +978,7 @@ impl WalIngest {
|
||||
zero_page: ClogZeroPage,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let ClogZeroPage { segno, rpageno } = zero_page;
|
||||
|
||||
self.put_slru_page_image(
|
||||
@@ -907,7 +996,7 @@ impl WalIngest {
|
||||
&mut self,
|
||||
modification: &mut DatadirModification,
|
||||
xlrec: &XlMultiXactCreate,
|
||||
) -> Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
// Create WAL record for updating the multixact-offsets page
|
||||
let pageno = xlrec.mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
@@ -1010,7 +1099,7 @@ impl WalIngest {
|
||||
modification: &mut DatadirModification<'_>,
|
||||
xlrec: &XlMultiXactTruncate,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let (maxsegment, startsegment, endsegment) =
|
||||
enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
|
||||
cp.oldestMulti = xlrec.end_trunc_off;
|
||||
@@ -1058,7 +1147,7 @@ impl WalIngest {
|
||||
zero_page: MultiXactZeroPage,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let MultiXactZeroPage {
|
||||
slru_kind,
|
||||
segno,
|
||||
@@ -1080,7 +1169,7 @@ impl WalIngest {
|
||||
update: RelmapUpdate,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let RelmapUpdate { update, buf } = update;
|
||||
|
||||
modification
|
||||
@@ -1093,7 +1182,7 @@ impl WalIngest {
|
||||
raw_record: RawXlogRecord,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let RawXlogRecord { info, lsn, mut buf } = raw_record;
|
||||
let pg_version = modification.tline.pg_version;
|
||||
|
||||
@@ -1235,12 +1324,12 @@ impl WalIngest {
|
||||
put: PutLogicalMessage,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let PutLogicalMessage { path, buf } = put;
|
||||
modification.put_file(path.as_str(), &buf, ctx).await
|
||||
}
|
||||
|
||||
fn ingest_standby_record(&mut self, record: StandbyRecord) -> Result<()> {
|
||||
fn ingest_standby_record(&mut self, record: StandbyRecord) -> Result<(), WalIngestError> {
|
||||
match record {
|
||||
StandbyRecord::RunningXacts(running_xacts) => {
|
||||
enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
|
||||
@@ -1258,7 +1347,7 @@ impl WalIngest {
|
||||
&mut self,
|
||||
record: ReploriginRecord,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
) -> Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
match record {
|
||||
ReploriginRecord::Set(set) => {
|
||||
modification
|
||||
@@ -1278,7 +1367,7 @@ impl WalIngest {
|
||||
modification: &mut DatadirModification<'_>,
|
||||
rel: RelTag,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
modification.put_rel_creation(rel, 0, ctx).await?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -1291,7 +1380,7 @@ impl WalIngest {
|
||||
blknum: BlockNumber,
|
||||
img: Bytes,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), PageReconstructError> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
self.handle_rel_extend(modification, rel, blknum, ctx)
|
||||
.await?;
|
||||
modification.put_rel_page_image(rel, blknum, img)?;
|
||||
@@ -1305,7 +1394,7 @@ impl WalIngest {
|
||||
blknum: BlockNumber,
|
||||
rec: NeonWalRecord,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
self.handle_rel_extend(modification, rel, blknum, ctx)
|
||||
.await?;
|
||||
modification.put_rel_wal_record(rel, blknum, rec)?;
|
||||
@@ -1318,7 +1407,7 @@ impl WalIngest {
|
||||
rel: RelTag,
|
||||
nblocks: BlockNumber,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
modification.put_rel_truncation(rel, nblocks, ctx).await?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -1329,7 +1418,7 @@ impl WalIngest {
|
||||
rel: RelTag,
|
||||
blknum: BlockNumber,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), PageReconstructError> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
let new_nblocks = blknum + 1;
|
||||
// Check if the relation exists. We implicitly create relations on first
|
||||
// record.
|
||||
@@ -1423,7 +1512,7 @@ impl WalIngest {
|
||||
blknum: BlockNumber,
|
||||
img: Bytes,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
if !self.shard.is_shard_zero() {
|
||||
return Ok(());
|
||||
}
|
||||
@@ -1441,7 +1530,7 @@ impl WalIngest {
|
||||
segno: u32,
|
||||
blknum: BlockNumber,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> Result<(), WalIngestError> {
|
||||
// we don't use a cache for this like we do for relations. SLRUS are explcitly
|
||||
// extended with ZEROPAGE records, not with commit records, so it happens
|
||||
// a lot less frequently.
|
||||
@@ -1509,6 +1598,7 @@ async fn get_relsize(
|
||||
#[allow(clippy::bool_assert_comparison)]
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use anyhow::Result;
|
||||
use postgres_ffi::RELSEG_SIZE;
|
||||
|
||||
use super::*;
|
||||
@@ -1530,7 +1620,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_zeroed_checkpoint_decodes_correctly() -> Result<()> {
|
||||
async fn test_zeroed_checkpoint_decodes_correctly() -> Result<(), anyhow::Error> {
|
||||
for i in 14..=16 {
|
||||
dispatch_pgversion!(i, {
|
||||
pgv::CheckPoint::decode(&pgv::ZERO_CHECKPOINT)?;
|
||||
|
||||
@@ -65,6 +65,9 @@ static const struct config_enum_entry neon_compute_modes[] = {
|
||||
/* GUCs */
|
||||
char *neon_timeline;
|
||||
char *neon_tenant;
|
||||
char *neon_project_id;
|
||||
char *neon_branch_id;
|
||||
char *neon_endpoint_id;
|
||||
int32 max_cluster_size;
|
||||
char *page_server_connstring;
|
||||
char *neon_auth_token;
|
||||
@@ -1352,6 +1355,31 @@ pg_init_libpagestore(void)
|
||||
0, /* no flags required */
|
||||
check_neon_id, NULL, NULL);
|
||||
|
||||
DefineCustomStringVariable("neon.project_id",
|
||||
"Neon project_id the server is running on",
|
||||
NULL,
|
||||
&neon_project_id,
|
||||
"",
|
||||
PGC_POSTMASTER,
|
||||
0, /* no flags required */
|
||||
check_neon_id, NULL, NULL);
|
||||
DefineCustomStringVariable("neon.branch_id",
|
||||
"Neon branch_id the server is running on",
|
||||
NULL,
|
||||
&neon_branch_id,
|
||||
"",
|
||||
PGC_POSTMASTER,
|
||||
0, /* no flags required */
|
||||
check_neon_id, NULL, NULL);
|
||||
DefineCustomStringVariable("neon.endpoint_id",
|
||||
"Neon endpoint_id the server is running on",
|
||||
NULL,
|
||||
&neon_endpoint_id,
|
||||
"",
|
||||
PGC_POSTMASTER,
|
||||
0, /* no flags required */
|
||||
check_neon_id, NULL, NULL);
|
||||
|
||||
DefineCustomIntVariable("neon.stripe_size",
|
||||
"sharding stripe size",
|
||||
NULL,
|
||||
|
||||
@@ -99,6 +99,9 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
|
||||
wp->config = config;
|
||||
wp->api = api;
|
||||
wp->state = WPS_COLLECTING_TERMS;
|
||||
wp->mconf.generation = INVALID_GENERATION;
|
||||
wp->mconf.members.len = 0;
|
||||
wp->mconf.new_members.len = 0;
|
||||
|
||||
wp_log(LOG, "neon.safekeepers=%s", wp->config->safekeepers_list);
|
||||
|
||||
@@ -170,6 +173,8 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
|
||||
|
||||
if (wp->config->proto_version != 2 && wp->config->proto_version != 3)
|
||||
wp_log(FATAL, "unsupported safekeeper protocol version %d", wp->config->proto_version);
|
||||
if (wp->safekeepers_generation > INVALID_GENERATION && wp->config->proto_version < 3)
|
||||
wp_log(FATAL, "enabling generations requires protocol version 3");
|
||||
wp_log(LOG, "using safekeeper protocol version %d", wp->config->proto_version);
|
||||
|
||||
/* Fill the greeting package */
|
||||
@@ -214,7 +219,7 @@ WalProposerFree(WalProposer *wp)
|
||||
static bool
|
||||
WalProposerGenerationsEnabled(WalProposer *wp)
|
||||
{
|
||||
return wp->safekeepers_generation != 0;
|
||||
return wp->safekeepers_generation != INVALID_GENERATION;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -723,13 +728,176 @@ SendProposerGreeting(Safekeeper *sk)
|
||||
BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_HANDSHAKE_RECV);
|
||||
}
|
||||
|
||||
/*
|
||||
* Assuming `sk` sent its node id, find such member(s) in wp->mconf and set ptr in
|
||||
* members_safekeepers & new_members_safekeepers to sk.
|
||||
*/
|
||||
static void
|
||||
UpdateMemberSafekeeperPtr(WalProposer *wp, Safekeeper *sk)
|
||||
{
|
||||
/* members_safekeepers etc are fixed size, sanity check mconf size */
|
||||
if (wp->mconf.members.len > MAX_SAFEKEEPERS)
|
||||
wp_log(FATAL, "too many members %d in mconf", wp->mconf.members.len);
|
||||
if (wp->mconf.new_members.len > MAX_SAFEKEEPERS)
|
||||
wp_log(FATAL, "too many new_members %d in mconf", wp->mconf.new_members.len);
|
||||
|
||||
/* node id is not known until greeting is received */
|
||||
if (sk->state < SS_WAIT_VOTING)
|
||||
return;
|
||||
|
||||
/* 0 is assumed to be invalid node id, should never happen */
|
||||
if (sk->greetResponse.nodeId == 0)
|
||||
{
|
||||
wp_log(WARNING, "safekeeper %s:%s sent zero node id", sk->host, sk->port);
|
||||
return;
|
||||
}
|
||||
|
||||
for (uint32 i = 0; i < wp->mconf.members.len; i++)
|
||||
{
|
||||
SafekeeperId *sk_id = &wp->mconf.members.m[i];
|
||||
|
||||
if (wp->mconf.members.m[i].node_id == sk->greetResponse.nodeId)
|
||||
{
|
||||
/*
|
||||
* If mconf or list of safekeepers to connect to changed (the
|
||||
* latter always currently goes through restart though),
|
||||
* ResetMemberSafekeeperPtrs is expected to be called before
|
||||
* UpdateMemberSafekeeperPtr. So, other value suggests that we are
|
||||
* connected to the same sk under different host name, complain
|
||||
* about that.
|
||||
*/
|
||||
if (wp->members_safekeepers[i] != NULL && wp->members_safekeepers[i] != sk)
|
||||
{
|
||||
wp_log(WARNING, "safekeeper {id = %lu, ep = %s:%u } in members[%u] is already mapped to connection slot %lu",
|
||||
sk_id->node_id, sk_id->host, sk_id->port, i, wp->members_safekeepers[i] - wp->safekeeper);
|
||||
}
|
||||
wp_log(LOG, "safekeeper {id = %lu, ep = %s:%u } in members[%u] mapped to connection slot %lu",
|
||||
sk_id->node_id, sk_id->host, sk_id->port, i, sk - wp->safekeeper);
|
||||
wp->members_safekeepers[i] = sk;
|
||||
}
|
||||
}
|
||||
/* repeat for new_members */
|
||||
for (uint32 i = 0; i < wp->mconf.new_members.len; i++)
|
||||
{
|
||||
SafekeeperId *sk_id = &wp->mconf.new_members.m[i];
|
||||
|
||||
if (wp->mconf.new_members.m[i].node_id == sk->greetResponse.nodeId)
|
||||
{
|
||||
if (wp->new_members_safekeepers[i] != NULL && wp->new_members_safekeepers[i] != sk)
|
||||
{
|
||||
wp_log(WARNING, "safekeeper {id = %lu, ep = %s:%u } in new_members[%u] is already mapped to connection slot %lu",
|
||||
sk_id->node_id, sk_id->host, sk_id->port, i, wp->new_members_safekeepers[i] - wp->safekeeper);
|
||||
}
|
||||
wp_log(LOG, "safekeeper {id = %lu, ep = %s:%u } in new_members[%u] mapped to connection slot %lu",
|
||||
sk_id->node_id, sk_id->host, sk_id->port, i, sk - wp->safekeeper);
|
||||
wp->new_members_safekeepers[i] = sk;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Reset wp->members_safekeepers & new_members_safekeepers and refill them.
|
||||
* Called after wp changes mconf.
|
||||
*/
|
||||
static void
|
||||
ResetMemberSafekeeperPtrs(WalProposer *wp)
|
||||
{
|
||||
memset(&wp->members_safekeepers, 0, sizeof(Safekeeper *) * MAX_SAFEKEEPERS);
|
||||
memset(&wp->new_members_safekeepers, 0, sizeof(Safekeeper *) * MAX_SAFEKEEPERS);
|
||||
for (int i = 0; i < wp->n_safekeepers; i++)
|
||||
{
|
||||
if (wp->safekeeper[i].state >= SS_WAIT_VOTING)
|
||||
UpdateMemberSafekeeperPtr(wp, &wp->safekeeper[i]);
|
||||
}
|
||||
}
|
||||
|
||||
static uint32
|
||||
MsetQuorum(MemberSet *mset)
|
||||
{
|
||||
Assert(mset->len > 0);
|
||||
return mset->len / 2 + 1;
|
||||
}
|
||||
|
||||
/* Does n forms quorum in mset? */
|
||||
static bool
|
||||
MsetHasQuorum(MemberSet *mset, uint32 n)
|
||||
{
|
||||
return n >= MsetQuorum(mset);
|
||||
}
|
||||
|
||||
/*
|
||||
* TermsCollected helper for a single member set `mset`.
|
||||
*
|
||||
* `msk` is the member -> safekeeper mapping for mset, i.e. members_safekeepers
|
||||
* or new_members_safekeepers.
|
||||
*/
|
||||
static bool
|
||||
TermsCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInfo s)
|
||||
{
|
||||
uint32 n_greeted = 0;
|
||||
|
||||
for (uint32 i = 0; i < wp->mconf.members.len; i++)
|
||||
{
|
||||
Safekeeper *sk = msk[i];
|
||||
|
||||
if (sk != NULL && sk->state == SS_WAIT_VOTING)
|
||||
{
|
||||
if (n_greeted > 0)
|
||||
appendStringInfoString(s, ", ");
|
||||
appendStringInfo(s, "{id = %lu, ep = %s:%s}", sk->greetResponse.nodeId, sk->host, sk->port);
|
||||
n_greeted++;
|
||||
}
|
||||
}
|
||||
appendStringInfo(s, ", %u/%u total", n_greeted, mset->len);
|
||||
return MsetHasQuorum(mset, n_greeted);
|
||||
}
|
||||
|
||||
/*
|
||||
* Have we received greeting from enough (quorum) safekeepers to start voting?
|
||||
*/
|
||||
static bool
|
||||
TermsCollected(WalProposer *wp)
|
||||
{
|
||||
return wp->n_connected >= wp->quorum;
|
||||
StringInfoData s; /* str for logging */
|
||||
bool collected = false;
|
||||
|
||||
/* legacy: generations disabled */
|
||||
if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION)
|
||||
{
|
||||
collected = wp->n_connected >= wp->quorum;
|
||||
if (collected)
|
||||
{
|
||||
wp->propTerm++;
|
||||
wp_log(LOG, "walproposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT ", starting voting", wp->quorum, wp->propTerm);
|
||||
}
|
||||
return collected;
|
||||
}
|
||||
|
||||
/*
|
||||
* With generations enabled, we start campaign only when 1) some mconf is
|
||||
* actually received 2) we have greetings from majority of members as well
|
||||
* as from majority of new_members if it exists.
|
||||
*/
|
||||
if (wp->mconf.generation == INVALID_GENERATION)
|
||||
return false;
|
||||
|
||||
initStringInfo(&s);
|
||||
appendStringInfoString(&s, "mset greeters: ");
|
||||
if (!TermsCollectedMset(wp, &wp->mconf.members, wp->members_safekeepers, &s))
|
||||
goto res;
|
||||
if (wp->mconf.new_members.len > 0)
|
||||
{
|
||||
appendStringInfoString(&s, ", new_mset greeters: ");
|
||||
if (!TermsCollectedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers, &s))
|
||||
goto res;
|
||||
}
|
||||
wp->propTerm++;
|
||||
wp_log(LOG, "walproposer connected to quorum of safekeepers: %s, propTerm=" INT64_FORMAT ", starting voting", s.data, wp->propTerm);
|
||||
collected = true;
|
||||
|
||||
res:
|
||||
pfree(s.data);
|
||||
return collected;
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -753,13 +921,41 @@ RecvAcceptorGreeting(Safekeeper *sk)
|
||||
pfree(mconf_toml);
|
||||
|
||||
/*
|
||||
* Adopt mconf of safekeepers if it is higher. TODO: mconf change should
|
||||
* restart wp if it started voting.
|
||||
* Adopt mconf of safekeepers if it is higher.
|
||||
*/
|
||||
if (sk->greetResponse.mconf.generation > wp->mconf.generation)
|
||||
{
|
||||
/* sanity check before adopting, should never happen */
|
||||
if (sk->greetResponse.mconf.members.len == 0)
|
||||
{
|
||||
wp_log(FATAL, "mconf %u has zero members", sk->greetResponse.mconf.generation);
|
||||
}
|
||||
|
||||
/*
|
||||
* If we at least started campaign, restart wp to get elected in the
|
||||
* new mconf. Note: in principle once wp is already elected
|
||||
* re-election is not required, but being conservative here is not
|
||||
* bad.
|
||||
*
|
||||
* TODO: put mconf to shmem to immediately pick it up on start,
|
||||
* otherwise if some safekeeper(s) misses latest mconf and gets
|
||||
* connected the first, it may cause redundant restarts here.
|
||||
*
|
||||
* More generally, it would be nice to restart walproposer (wiping
|
||||
* election state) without restarting the process. In particular, that
|
||||
* would allow sync-safekeepers not to die here if it intersected with
|
||||
* sk migration (as well as remove 1s delay).
|
||||
*
|
||||
* Note that assign_neon_safekeepers also currently restarts the
|
||||
* process, so during normal migration walproposer may restart twice.
|
||||
*/
|
||||
if (wp->state >= WPS_CAMPAIGN)
|
||||
{
|
||||
wp_log(FATAL, "restarting to adopt mconf generation %d", sk->greetResponse.mconf.generation);
|
||||
}
|
||||
MembershipConfigurationFree(&wp->mconf);
|
||||
MembershipConfigurationCopy(&sk->greetResponse.mconf, &wp->mconf);
|
||||
ResetMemberSafekeeperPtrs(wp);
|
||||
/* full conf was just logged above */
|
||||
wp_log(LOG, "changed mconf to generation %u", wp->mconf.generation);
|
||||
}
|
||||
@@ -767,6 +963,9 @@ RecvAcceptorGreeting(Safekeeper *sk)
|
||||
/* Protocol is all good, move to voting. */
|
||||
sk->state = SS_WAIT_VOTING;
|
||||
|
||||
/* In greeting safekeeper sent its id; update mappings accordingly. */
|
||||
UpdateMemberSafekeeperPtr(wp, sk);
|
||||
|
||||
/*
|
||||
* Note: it would be better to track the counter on per safekeeper basis,
|
||||
* but at worst walproposer would restart with 'term rejected', so leave
|
||||
@@ -778,12 +977,9 @@ RecvAcceptorGreeting(Safekeeper *sk)
|
||||
/* We're still collecting terms from the majority. */
|
||||
wp->propTerm = Max(sk->greetResponse.term, wp->propTerm);
|
||||
|
||||
/* Quorum is acquried, prepare the vote request. */
|
||||
/* Quorum is acquired, prepare the vote request. */
|
||||
if (TermsCollected(wp))
|
||||
{
|
||||
wp->propTerm++;
|
||||
wp_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm);
|
||||
|
||||
wp->state = WPS_CAMPAIGN;
|
||||
wp->voteRequest.pam.tag = 'v';
|
||||
wp->voteRequest.generation = wp->mconf.generation;
|
||||
@@ -832,8 +1028,8 @@ SendVoteRequest(Safekeeper *sk)
|
||||
&sk->outbuf, wp->config->proto_version);
|
||||
|
||||
/* We have quorum for voting, send our vote request */
|
||||
wp_log(LOG, "requesting vote from %s:%s for generation %u term " UINT64_FORMAT, sk->host, sk->port,
|
||||
wp->voteRequest.generation, wp->voteRequest.term);
|
||||
wp_log(LOG, "requesting vote from sk {id = %lu, ep = %s:%s} for generation %u term " UINT64_FORMAT,
|
||||
sk->greetResponse.nodeId, sk->host, sk->port, wp->voteRequest.generation, wp->voteRequest.term);
|
||||
/* On failure, logging & resetting is handled */
|
||||
BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_WAIT_VERDICT);
|
||||
/* If successful, wait for read-ready with SS_WAIT_VERDICT */
|
||||
@@ -851,8 +1047,8 @@ RecvVoteResponse(Safekeeper *sk)
|
||||
return;
|
||||
|
||||
wp_log(LOG,
|
||||
"got VoteResponse from acceptor %s:%s, generation=%u, term=%lu, voteGiven=%u, last_log_term=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X",
|
||||
sk->host, sk->port, sk->voteResponse.generation, sk->voteResponse.term,
|
||||
"got VoteResponse from sk {id = %lu, ep = %s:%s}, generation=%u, term=%lu, voteGiven=%u, last_log_term=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X",
|
||||
sk->greetResponse.nodeId, sk->host, sk->port, sk->voteResponse.generation, sk->voteResponse.term,
|
||||
sk->voteResponse.voteGiven,
|
||||
GetHighestTerm(&sk->voteResponse.termHistory),
|
||||
LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
|
||||
@@ -899,6 +1095,53 @@ RecvVoteResponse(Safekeeper *sk)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* VotesCollected helper for a single member set `mset`.
|
||||
*
|
||||
* `msk` is the member -> safekeeper mapping for mset, i.e. members_safekeepers
|
||||
* or new_members_safekeepers.
|
||||
*/
|
||||
static bool
|
||||
VotesCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInfo s)
|
||||
{
|
||||
uint32 n_votes = 0;
|
||||
|
||||
for (uint32 i = 0; i < wp->mconf.members.len; i++)
|
||||
{
|
||||
Safekeeper *sk = msk[i];
|
||||
|
||||
if (sk != NULL && sk->state == SS_WAIT_ELECTED)
|
||||
{
|
||||
Assert(sk->voteResponse.voteGiven);
|
||||
|
||||
/*
|
||||
* Find the highest vote. NULL check is for the legacy case where
|
||||
* safekeeper might be not initialized with LSN at all and return
|
||||
* 0 LSN in the vote response; we still want to set donor to
|
||||
* something in this case.
|
||||
*/
|
||||
if (GetLastLogTerm(sk) > wp->donorLastLogTerm ||
|
||||
(GetLastLogTerm(sk) == wp->donorLastLogTerm &&
|
||||
sk->voteResponse.flushLsn > wp->propTermStartLsn) ||
|
||||
wp->donor == NULL)
|
||||
{
|
||||
wp->donorLastLogTerm = GetLastLogTerm(sk);
|
||||
wp->propTermStartLsn = sk->voteResponse.flushLsn;
|
||||
wp->donor = sk;
|
||||
}
|
||||
wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn);
|
||||
|
||||
if (n_votes > 0)
|
||||
appendStringInfoString(s, ", ");
|
||||
appendStringInfo(s, "{id = %lu, ep = %s:%s}", sk->greetResponse.nodeId, sk->host, sk->port);
|
||||
n_votes++;
|
||||
}
|
||||
}
|
||||
appendStringInfo(s, ", %u/%u total", n_votes, mset->len);
|
||||
return MsetHasQuorum(mset, n_votes);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Checks if enough votes has been collected to get elected and if that's the
|
||||
* case finds the highest vote, setting donor, donorLastLogTerm,
|
||||
@@ -907,7 +1150,8 @@ RecvVoteResponse(Safekeeper *sk)
|
||||
static bool
|
||||
VotesCollected(WalProposer *wp)
|
||||
{
|
||||
int n_ready = 0;
|
||||
StringInfoData s; /* str for logging */
|
||||
bool collected = false;
|
||||
|
||||
/* assumed to be called only when not elected yet */
|
||||
Assert(wp->state == WPS_CAMPAIGN);
|
||||
@@ -916,25 +1160,62 @@ VotesCollected(WalProposer *wp)
|
||||
wp->donorLastLogTerm = 0;
|
||||
wp->truncateLsn = InvalidXLogRecPtr;
|
||||
|
||||
for (int i = 0; i < wp->n_safekeepers; i++)
|
||||
/* legacy: generations disabled */
|
||||
if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION)
|
||||
{
|
||||
if (wp->safekeeper[i].state == SS_WAIT_ELECTED)
|
||||
{
|
||||
n_ready++;
|
||||
int n_ready = 0;
|
||||
|
||||
if (GetLastLogTerm(&wp->safekeeper[i]) > wp->donorLastLogTerm ||
|
||||
(GetLastLogTerm(&wp->safekeeper[i]) == wp->donorLastLogTerm &&
|
||||
wp->safekeeper[i].voteResponse.flushLsn > wp->propTermStartLsn))
|
||||
for (int i = 0; i < wp->n_safekeepers; i++)
|
||||
{
|
||||
if (wp->safekeeper[i].state == SS_WAIT_ELECTED)
|
||||
{
|
||||
wp->donorLastLogTerm = GetLastLogTerm(&wp->safekeeper[i]);
|
||||
wp->propTermStartLsn = wp->safekeeper[i].voteResponse.flushLsn;
|
||||
wp->donor = i;
|
||||
n_ready++;
|
||||
|
||||
if (GetLastLogTerm(&wp->safekeeper[i]) > wp->donorLastLogTerm ||
|
||||
(GetLastLogTerm(&wp->safekeeper[i]) == wp->donorLastLogTerm &&
|
||||
wp->safekeeper[i].voteResponse.flushLsn > wp->propTermStartLsn) ||
|
||||
wp->donor == NULL)
|
||||
{
|
||||
wp->donorLastLogTerm = GetLastLogTerm(&wp->safekeeper[i]);
|
||||
wp->propTermStartLsn = wp->safekeeper[i].voteResponse.flushLsn;
|
||||
wp->donor = &wp->safekeeper[i];
|
||||
}
|
||||
wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn);
|
||||
}
|
||||
wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn);
|
||||
}
|
||||
collected = n_ready >= wp->quorum;
|
||||
if (collected)
|
||||
{
|
||||
wp_log(LOG, "walproposer elected with %d/%d votes", n_ready, wp->n_safekeepers);
|
||||
}
|
||||
return collected;
|
||||
}
|
||||
|
||||
return n_ready >= wp->quorum;
|
||||
/*
|
||||
* if generations are enabled we're expected to get to voting only when
|
||||
* mconf is established.
|
||||
*/
|
||||
Assert(wp->mconf.generation != INVALID_GENERATION);
|
||||
|
||||
/*
|
||||
* We must get votes from both msets if both are present.
|
||||
*/
|
||||
initStringInfo(&s);
|
||||
appendStringInfoString(&s, "mset voters: ");
|
||||
if (!VotesCollectedMset(wp, &wp->mconf.members, wp->members_safekeepers, &s))
|
||||
goto res;
|
||||
if (wp->mconf.new_members.len > 0)
|
||||
{
|
||||
appendStringInfoString(&s, ", new_mset voters: ");
|
||||
if (!VotesCollectedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers, &s))
|
||||
goto res;
|
||||
}
|
||||
wp_log(LOG, "walproposer elected, %s", s.data);
|
||||
collected = true;
|
||||
|
||||
res:
|
||||
pfree(s.data);
|
||||
return collected;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -955,7 +1236,7 @@ HandleElectedProposer(WalProposer *wp)
|
||||
* that only for logical replication (and switching logical walsenders to
|
||||
* neon_walreader is a todo.)
|
||||
*/
|
||||
if (!wp->api.recovery_download(wp, &wp->safekeeper[wp->donor]))
|
||||
if (!wp->api.recovery_download(wp, wp->donor))
|
||||
{
|
||||
wp_log(FATAL, "failed to download WAL for logical replicaiton");
|
||||
}
|
||||
@@ -1078,7 +1359,7 @@ ProcessPropStartPos(WalProposer *wp)
|
||||
/*
|
||||
* Proposer's term history is the donor's + its own entry.
|
||||
*/
|
||||
dth = &wp->safekeeper[wp->donor].voteResponse.termHistory;
|
||||
dth = &wp->donor->voteResponse.termHistory;
|
||||
wp->propTermHistory.n_entries = dth->n_entries + 1;
|
||||
wp->propTermHistory.entries = palloc(sizeof(TermSwitchEntry) * wp->propTermHistory.n_entries);
|
||||
if (dth->n_entries > 0)
|
||||
@@ -1086,11 +1367,10 @@ ProcessPropStartPos(WalProposer *wp)
|
||||
wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].term = wp->propTerm;
|
||||
wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propTermStartLsn;
|
||||
|
||||
wp_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
|
||||
wp->quorum,
|
||||
wp_log(LOG, "walproposer elected in term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
|
||||
wp->propTerm,
|
||||
LSN_FORMAT_ARGS(wp->propTermStartLsn),
|
||||
wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port,
|
||||
wp->donor->host, wp->donor->port,
|
||||
LSN_FORMAT_ARGS(wp->truncateLsn));
|
||||
|
||||
/*
|
||||
@@ -1508,6 +1788,14 @@ RecvAppendResponses(Safekeeper *sk)
|
||||
|
||||
readAnything = true;
|
||||
|
||||
/* should never happen: sk is expected to send ERROR instead */
|
||||
if (sk->appendResponse.generation != wp->mconf.generation)
|
||||
{
|
||||
wp_log(FATAL, "safekeeper {id = %lu, ep = %s:%s} sent response with generation %u, expected %u",
|
||||
sk->greetResponse.nodeId, sk->host, sk->port,
|
||||
sk->appendResponse.generation, wp->mconf.generation);
|
||||
}
|
||||
|
||||
if (sk->appendResponse.term > wp->propTerm)
|
||||
{
|
||||
/*
|
||||
@@ -1624,30 +1912,101 @@ CalculateMinFlushLsn(WalProposer *wp)
|
||||
}
|
||||
|
||||
/*
|
||||
* Calculate WAL position acknowledged by quorum
|
||||
* GetAcknowledgedByQuorumWALPosition for a single member set `mset`.
|
||||
*
|
||||
* `msk` is the member -> safekeeper mapping for mset, i.e. members_safekeepers
|
||||
* or new_members_safekeepers.
|
||||
*/
|
||||
static XLogRecPtr
|
||||
GetAcknowledgedByQuorumWALPosition(WalProposer *wp)
|
||||
GetCommittedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk)
|
||||
{
|
||||
XLogRecPtr responses[MAX_SAFEKEEPERS];
|
||||
|
||||
/*
|
||||
* Sort acknowledged LSNs
|
||||
* Ascending sort acknowledged LSNs.
|
||||
*/
|
||||
for (int i = 0; i < wp->n_safekeepers; i++)
|
||||
Assert(mset->len <= MAX_SAFEKEEPERS);
|
||||
for (uint32 i = 0; i < mset->len; i++)
|
||||
{
|
||||
Safekeeper *sk = msk[i];
|
||||
|
||||
/*
|
||||
* Like in Raft, we aren't allowed to commit entries from previous
|
||||
* terms, so ignore reported LSN until it gets to epochStartLsn.
|
||||
* terms, so ignore reported LSN until it gets to propTermStartLsn.
|
||||
*
|
||||
* Note: we ignore sk state, which is ok: before first ack flushLsn is
|
||||
* 0, and later we just preserve value across reconnections. It would
|
||||
* be ok to check for SS_ACTIVE as well.
|
||||
*/
|
||||
responses[i] = wp->safekeeper[i].appendResponse.flushLsn >= wp->propTermStartLsn ? wp->safekeeper[i].appendResponse.flushLsn : 0;
|
||||
if (sk != NULL && sk->appendResponse.flushLsn >= wp->propTermStartLsn)
|
||||
{
|
||||
responses[i] = sk->appendResponse.flushLsn;
|
||||
}
|
||||
else
|
||||
{
|
||||
responses[i] = 0;
|
||||
}
|
||||
}
|
||||
qsort(responses, wp->n_safekeepers, sizeof(XLogRecPtr), CompareLsn);
|
||||
qsort(responses, mset->len, sizeof(XLogRecPtr), CompareLsn);
|
||||
|
||||
/*
|
||||
* Get the smallest LSN committed by quorum
|
||||
* And get value committed by the quorum. A way to view this: to get the
|
||||
* highest value committed on the quorum, in the ordered array we skip n -
|
||||
* n_quorum elements to get to the first (lowest) value present on all sks
|
||||
* of the highest quorum.
|
||||
*/
|
||||
return responses[wp->n_safekeepers - wp->quorum];
|
||||
return responses[mset->len - MsetQuorum(mset)];
|
||||
}
|
||||
|
||||
/*
|
||||
* Calculate WAL position acknowledged by quorum, i.e. which may be regarded
|
||||
* committed.
|
||||
*
|
||||
* Zero may be returned when there is no quorum of nodes recovered to term start
|
||||
* lsn which sent feedback yet.
|
||||
*/
|
||||
static XLogRecPtr
|
||||
GetAcknowledgedByQuorumWALPosition(WalProposer *wp)
|
||||
{
|
||||
XLogRecPtr committed;
|
||||
|
||||
/* legacy: generations disabled */
|
||||
if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION)
|
||||
{
|
||||
XLogRecPtr responses[MAX_SAFEKEEPERS];
|
||||
|
||||
/*
|
||||
* Sort acknowledged LSNs
|
||||
*/
|
||||
for (int i = 0; i < wp->n_safekeepers; i++)
|
||||
{
|
||||
/*
|
||||
* Like in Raft, we aren't allowed to commit entries from previous
|
||||
* terms, so ignore reported LSN until it gets to
|
||||
* propTermStartLsn.
|
||||
*
|
||||
* Note: we ignore sk state, which is ok: before first ack
|
||||
* flushLsn is 0, and later we just preserve value across
|
||||
* reconnections. It would be ok to check for SS_ACTIVE as well.
|
||||
*/
|
||||
responses[i] = wp->safekeeper[i].appendResponse.flushLsn >= wp->propTermStartLsn ? wp->safekeeper[i].appendResponse.flushLsn : 0;
|
||||
}
|
||||
qsort(responses, wp->n_safekeepers, sizeof(XLogRecPtr), CompareLsn);
|
||||
|
||||
/*
|
||||
* Get the smallest LSN committed by quorum
|
||||
*/
|
||||
return responses[wp->n_safekeepers - wp->quorum];
|
||||
}
|
||||
|
||||
committed = GetCommittedMset(wp, &wp->mconf.members, wp->members_safekeepers);
|
||||
if (wp->mconf.new_members.len > 0)
|
||||
{
|
||||
XLogRecPtr new_mset_committed = GetCommittedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers);
|
||||
|
||||
committed = Min(committed, new_mset_committed);
|
||||
}
|
||||
return committed;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1662,7 +2021,7 @@ UpdateDonorShmem(WalProposer *wp)
|
||||
int i;
|
||||
XLogRecPtr donor_lsn = InvalidXLogRecPtr;
|
||||
|
||||
if (wp->n_votes < wp->quorum)
|
||||
if (wp->state < WPS_ELECTED)
|
||||
{
|
||||
wp_log(WARNING, "UpdateDonorShmem called before elections are won");
|
||||
return;
|
||||
@@ -1673,9 +2032,9 @@ UpdateDonorShmem(WalProposer *wp)
|
||||
* about its position immediately after election before any feedbacks are
|
||||
* sent.
|
||||
*/
|
||||
if (wp->safekeeper[wp->donor].state >= SS_WAIT_ELECTED)
|
||||
if (wp->donor->state >= SS_WAIT_ELECTED)
|
||||
{
|
||||
donor = &wp->safekeeper[wp->donor];
|
||||
donor = wp->donor;
|
||||
donor_lsn = wp->propTermStartLsn;
|
||||
}
|
||||
|
||||
@@ -1746,13 +2105,13 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk)
|
||||
}
|
||||
|
||||
/*
|
||||
* Generally sync is done when majority switched the epoch so we committed
|
||||
* epochStartLsn and made the majority aware of it, ensuring they are
|
||||
* ready to give all WAL to pageserver. It would mean whichever majority
|
||||
* is alive, there will be at least one safekeeper who is able to stream
|
||||
* WAL to pageserver to make basebackup possible. However, since at the
|
||||
* moment we don't have any good mechanism of defining the healthy and
|
||||
* most advanced safekeeper who should push the wal into pageserver and
|
||||
* Generally sync is done when majority reached propTermStartLsn so we
|
||||
* committed it and made the majority aware of it, ensuring they are ready
|
||||
* to give all WAL to pageserver. It would mean whichever majority is
|
||||
* alive, there will be at least one safekeeper who is able to stream WAL
|
||||
* to pageserver to make basebackup possible. However, since at the moment
|
||||
* we don't have any good mechanism of defining the healthy and most
|
||||
* advanced safekeeper who should push the wal into pageserver and
|
||||
* basically the random one gets connected, to prevent hanging basebackup
|
||||
* (due to pageserver connecting to not-synced-safekeeper) we currently
|
||||
* wait for all seemingly alive safekeepers to get synced.
|
||||
@@ -1774,7 +2133,7 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk)
|
||||
n_synced++;
|
||||
}
|
||||
|
||||
if (n_synced >= wp->quorum)
|
||||
if (newCommitLsn >= wp->propTermStartLsn)
|
||||
{
|
||||
/* A quorum of safekeepers has been synced! */
|
||||
|
||||
|
||||
@@ -145,6 +145,7 @@ typedef uint64 NNodeId;
|
||||
* This and following structs pair ones in membership.rs.
|
||||
*/
|
||||
typedef uint32 Generation;
|
||||
#define INVALID_GENERATION 0
|
||||
|
||||
typedef struct SafekeeperId
|
||||
{
|
||||
@@ -771,7 +772,17 @@ typedef struct WalProposer
|
||||
/* Current walproposer membership configuration */
|
||||
MembershipConfiguration mconf;
|
||||
|
||||
/* (n_safekeepers / 2) + 1 */
|
||||
/*
|
||||
* Parallels mconf.members with pointers to the member's slot in
|
||||
* safekeepers array of connections, or NULL if such member is not
|
||||
* connected. Helps to avoid looking slot per id through all
|
||||
* .safekeepers[] when doing quorum checks.
|
||||
*/
|
||||
Safekeeper *members_safekeepers[MAX_SAFEKEEPERS];
|
||||
/* As above, but for new_members. */
|
||||
Safekeeper *new_members_safekeepers[MAX_SAFEKEEPERS];
|
||||
|
||||
/* (n_safekeepers / 2) + 1. Used for static pre-generations quorum checks. */
|
||||
int quorum;
|
||||
|
||||
/*
|
||||
@@ -829,7 +840,7 @@ typedef struct WalProposer
|
||||
term_t donorLastLogTerm;
|
||||
|
||||
/* Most advanced acceptor */
|
||||
int donor;
|
||||
Safekeeper *donor;
|
||||
|
||||
/* timeline globally starts at this LSN */
|
||||
XLogRecPtr timelineStartLsn;
|
||||
|
||||
@@ -31,6 +31,7 @@ pub async fn task_main_https(
|
||||
global_timelines: Arc<GlobalTimelines>,
|
||||
) -> anyhow::Result<()> {
|
||||
let cert_resolver = ReloadingCertificateResolver::new(
|
||||
"main",
|
||||
&conf.ssl_key_file,
|
||||
&conf.ssl_cert_file,
|
||||
conf.ssl_cert_reload_period,
|
||||
|
||||
@@ -22,6 +22,7 @@ use pageserver_api::controller_api::{
|
||||
MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse,
|
||||
NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, SafekeeperSchedulingPolicyRequest,
|
||||
ShardsPreferredAzsRequest, TenantCreateRequest, TenantPolicyRequest, TenantShardMigrateRequest,
|
||||
TimelineImportRequest,
|
||||
};
|
||||
use pageserver_api::models::{
|
||||
DetachBehavior, LsnLeaseRequest, TenantConfigPatchRequest, TenantConfigRequest,
|
||||
@@ -1235,8 +1236,18 @@ async fn handle_step_down(req: Request<Body>) -> Result<Response<Body>, ApiError
|
||||
ForwardOutcome::NotForwarded(req) => req,
|
||||
};
|
||||
|
||||
let state = get_state(&req);
|
||||
json_response(StatusCode::OK, state.service.step_down().await)
|
||||
// Spawn a background task: once we start stepping down, we must finish: if the client drops
|
||||
// their request we should avoid stopping in some part-stepped-down state.
|
||||
let handle = tokio::spawn(async move {
|
||||
let state = get_state(&req);
|
||||
state.service.step_down().await
|
||||
});
|
||||
|
||||
let result = handle
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
|
||||
json_response(StatusCode::OK, result)
|
||||
}
|
||||
|
||||
async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
@@ -1276,6 +1287,37 @@ async fn handle_tenant_import(req: Request<Body>) -> Result<Response<Body>, ApiE
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_timeline_import(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
|
||||
check_permissions(&req, Scope::PageServerApi)?;
|
||||
maybe_rate_limit(&req, tenant_id).await;
|
||||
|
||||
let mut req = match maybe_forward(req).await {
|
||||
ForwardOutcome::Forwarded(res) => {
|
||||
return res;
|
||||
}
|
||||
ForwardOutcome::NotForwarded(req) => req,
|
||||
};
|
||||
|
||||
let import_req = json_request::<TimelineImportRequest>(&mut req).await?;
|
||||
|
||||
let state = get_state(&req);
|
||||
|
||||
if import_req.tenant_id != tenant_id || import_req.timeline_id != timeline_id {
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"tenant id or timeline id mismatch: url={tenant_id}/{timeline_id}, body={}/{}",
|
||||
import_req.tenant_id,
|
||||
import_req.timeline_id
|
||||
)));
|
||||
}
|
||||
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
state.service.timeline_import(import_req).await?,
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permissions(&req, Scope::Admin)?;
|
||||
|
||||
@@ -1949,6 +1991,16 @@ pub fn make_router(
|
||||
RequestName("debug_v1_tenant_locate"),
|
||||
)
|
||||
})
|
||||
.post(
|
||||
"/debug/v1/tenant/:tenant_id/timeline/:timeline_id/import",
|
||||
|r| {
|
||||
named_request_span(
|
||||
r,
|
||||
handle_timeline_import,
|
||||
RequestName("debug_v1_timeline_import"),
|
||||
)
|
||||
},
|
||||
)
|
||||
.get("/debug/v1/scheduler", |r| {
|
||||
named_request_span(r, handle_scheduler_dump, RequestName("debug_v1_scheduler"))
|
||||
})
|
||||
|
||||
@@ -472,6 +472,7 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
let https_listener = tcp_listener::bind(https_addr)?;
|
||||
|
||||
let resolver = ReloadingCertificateResolver::new(
|
||||
"main",
|
||||
&args.ssl_key_file,
|
||||
&args.ssl_cert_file,
|
||||
*args.ssl_cert_reload_period,
|
||||
|
||||
@@ -61,7 +61,7 @@ use utils::completion::Barrier;
|
||||
use utils::generation::Generation;
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::sync::gate::Gate;
|
||||
use utils::sync::gate::{Gate, GateGuard};
|
||||
use utils::{failpoint_support, pausable_failpoint};
|
||||
|
||||
use crate::background_node_operations::{
|
||||
@@ -594,6 +594,8 @@ struct TenantShardSplitAbort {
|
||||
new_stripe_size: Option<ShardStripeSize>,
|
||||
/// Until this abort op is complete, no other operations may be done on the tenant
|
||||
_tenant_lock: TracingExclusiveGuard<TenantOperations>,
|
||||
/// The reconciler gate for the duration of the split operation, and any included abort.
|
||||
_gate: GateGuard,
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
@@ -1460,7 +1462,7 @@ impl Service {
|
||||
// Retry until shutdown: we must keep this request object alive until it is properly
|
||||
// processed, as it holds a lock guard that prevents other operations trying to do things
|
||||
// to the tenant while it is in a weird part-split state.
|
||||
while !self.cancel.is_cancelled() {
|
||||
while !self.reconcilers_cancel.is_cancelled() {
|
||||
match self.abort_tenant_shard_split(&op).await {
|
||||
Ok(_) => break,
|
||||
Err(e) => {
|
||||
@@ -1473,9 +1475,12 @@ impl Service {
|
||||
// when we retry, so that the abort op will succeed. If the abort op is failing
|
||||
// for some other reason, we will keep retrying forever, or until a human notices
|
||||
// and does something about it (either fixing a pageserver or restarting the controller).
|
||||
tokio::time::timeout(Duration::from_secs(5), self.cancel.cancelled())
|
||||
.await
|
||||
.ok();
|
||||
tokio::time::timeout(
|
||||
Duration::from_secs(5),
|
||||
self.reconcilers_cancel.cancelled(),
|
||||
)
|
||||
.await
|
||||
.ok();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1847,6 +1852,7 @@ impl Service {
|
||||
};
|
||||
|
||||
if insert {
|
||||
let config = attach_req.config.clone().unwrap_or_default();
|
||||
let tsp = TenantShardPersistence {
|
||||
tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(),
|
||||
shard_number: attach_req.tenant_shard_id.shard_number.0 as i32,
|
||||
@@ -1855,7 +1861,7 @@ impl Service {
|
||||
generation: attach_req.generation_override.or(Some(0)),
|
||||
generation_pageserver: None,
|
||||
placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(),
|
||||
config: serde_json::to_string(&TenantConfig::default()).unwrap(),
|
||||
config: serde_json::to_string(&config).unwrap(),
|
||||
splitting: SplitState::default(),
|
||||
scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
|
||||
.unwrap(),
|
||||
@@ -1878,16 +1884,16 @@ impl Service {
|
||||
Ok(()) => {
|
||||
tracing::info!("Inserted shard {} in database", attach_req.tenant_shard_id);
|
||||
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
locked.tenants.insert(
|
||||
let mut shard = TenantShard::new(
|
||||
attach_req.tenant_shard_id,
|
||||
TenantShard::new(
|
||||
attach_req.tenant_shard_id,
|
||||
ShardIdentity::unsharded(),
|
||||
PlacementPolicy::Attached(0),
|
||||
None,
|
||||
),
|
||||
ShardIdentity::unsharded(),
|
||||
PlacementPolicy::Attached(0),
|
||||
None,
|
||||
);
|
||||
shard.config = config;
|
||||
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
locked.tenants.insert(attach_req.tenant_shard_id, shard);
|
||||
tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id);
|
||||
}
|
||||
}
|
||||
@@ -1972,11 +1978,12 @@ impl Service {
|
||||
.set_attached(scheduler, attach_req.node_id);
|
||||
|
||||
tracing::info!(
|
||||
"attach_hook: tenant {} set generation {:?}, pageserver {}",
|
||||
"attach_hook: tenant {} set generation {:?}, pageserver {}, config {:?}",
|
||||
attach_req.tenant_shard_id,
|
||||
tenant_shard.generation,
|
||||
// TODO: this is an odd number of 0xf's
|
||||
attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
|
||||
attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff)),
|
||||
attach_req.config,
|
||||
);
|
||||
|
||||
// Trick the reconciler into not doing anything for this tenant: this helps
|
||||
@@ -4910,7 +4917,7 @@ impl Service {
|
||||
1,
|
||||
10,
|
||||
Duration::from_secs(5),
|
||||
&self.cancel,
|
||||
&self.reconcilers_cancel,
|
||||
)
|
||||
.await
|
||||
{
|
||||
@@ -5161,6 +5168,11 @@ impl Service {
|
||||
)
|
||||
.await;
|
||||
|
||||
let _gate = self
|
||||
.reconcilers_gate
|
||||
.enter()
|
||||
.map_err(|_| ApiError::ShuttingDown)?;
|
||||
|
||||
let new_shard_count = ShardCount::new(split_req.new_shard_count);
|
||||
let new_stripe_size = split_req.new_stripe_size;
|
||||
|
||||
@@ -5188,6 +5200,7 @@ impl Service {
|
||||
new_shard_count,
|
||||
new_stripe_size,
|
||||
_tenant_lock,
|
||||
_gate,
|
||||
})
|
||||
// Ignore error sending: that just means we're shutting down: aborts are ephemeral so it's fine to drop it.
|
||||
.ok();
|
||||
@@ -5527,7 +5540,10 @@ impl Service {
|
||||
"failpoint".to_string()
|
||||
)));
|
||||
|
||||
failpoint_support::sleep_millis_async!("shard-split-post-remote-sleep", &self.cancel);
|
||||
failpoint_support::sleep_millis_async!(
|
||||
"shard-split-post-remote-sleep",
|
||||
&self.reconcilers_cancel
|
||||
);
|
||||
|
||||
tracing::info!(
|
||||
"Split {} into {}",
|
||||
@@ -5585,7 +5601,7 @@ impl Service {
|
||||
stripe_size,
|
||||
preferred_az: preferred_az_id.as_ref().map(Cow::Borrowed),
|
||||
},
|
||||
&self.cancel,
|
||||
&self.reconcilers_cancel,
|
||||
)
|
||||
.await
|
||||
{
|
||||
@@ -8670,9 +8686,24 @@ impl Service {
|
||||
failpoint_support::sleep_millis_async!("sleep-on-step-down-handling");
|
||||
|
||||
self.inner.write().unwrap().step_down();
|
||||
// TODO: would it make sense to have a time-out for this?
|
||||
self.stop_reconciliations(StopReconciliationsReason::SteppingDown)
|
||||
.await;
|
||||
|
||||
// Wait for reconciliations to stop, or terminate this process if they
|
||||
// fail to stop in time (this indicates a bug in shutdown)
|
||||
tokio::select! {
|
||||
_ = self.stop_reconciliations(StopReconciliationsReason::SteppingDown) => {
|
||||
tracing::info!("Reconciliations stopped, proceeding with step down");
|
||||
}
|
||||
_ = async {
|
||||
failpoint_support::sleep_millis_async!("step-down-delay-timeout");
|
||||
tokio::time::sleep(Duration::from_secs(10)).await
|
||||
} => {
|
||||
tracing::warn!("Step down timed out while waiting for reconciliation gate, terminating process");
|
||||
|
||||
// The caller may proceed to act as leader when it sees this request fail: reduce the chance
|
||||
// of a split-brain situation by terminating this controller instead of leaving it up in a partially-shut-down state.
|
||||
std::process::exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
let mut global_observed = GlobalObservedState::default();
|
||||
let locked = self.inner.read().unwrap();
|
||||
|
||||
@@ -12,13 +12,16 @@ use crate::persistence::{
|
||||
use crate::safekeeper::Safekeeper;
|
||||
use anyhow::Context;
|
||||
use http_utils::error::ApiError;
|
||||
use pageserver_api::controller_api::{SafekeeperDescribeResponse, SkSchedulingPolicy};
|
||||
use pageserver_api::controller_api::{
|
||||
SafekeeperDescribeResponse, SkSchedulingPolicy, TimelineImportRequest,
|
||||
};
|
||||
use pageserver_api::models::{self, SafekeeperInfo, SafekeepersInfo, TimelineInfo};
|
||||
use safekeeper_api::membership::{MemberSet, SafekeeperId};
|
||||
use tokio::task::JoinSet;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
use utils::logging::SecretString;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use super::Service;
|
||||
|
||||
@@ -298,6 +301,31 @@ impl Service {
|
||||
timeline_id,
|
||||
})
|
||||
}
|
||||
|
||||
/// Directly insert the timeline into the database without reconciling it with safekeepers.
|
||||
///
|
||||
/// Useful if the timeline already exists on the specified safekeepers,
|
||||
/// but we want to make it storage controller managed.
|
||||
pub(crate) async fn timeline_import(&self, req: TimelineImportRequest) -> Result<(), ApiError> {
|
||||
let persistence = TimelinePersistence {
|
||||
tenant_id: req.tenant_id.to_string(),
|
||||
timeline_id: req.timeline_id.to_string(),
|
||||
start_lsn: Lsn::INVALID.into(),
|
||||
generation: 1,
|
||||
sk_set: req.sk_set.iter().map(|sk_id| sk_id.0 as i64).collect(),
|
||||
new_sk_set: None,
|
||||
cplane_notified_generation: 1,
|
||||
deleted_at: None,
|
||||
};
|
||||
let inserted = self.persistence.insert_timeline(persistence).await?;
|
||||
if inserted {
|
||||
tracing::info!("imported timeline into db");
|
||||
} else {
|
||||
tracing::info!("didn't import timeline into db, as it is already present in db");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Perform timeline deletion on safekeepers. Will return success: we persist the deletion into the reconciler.
|
||||
pub(super) async fn tenant_timeline_delete_safekeepers(
|
||||
self: &Arc<Self>,
|
||||
|
||||
@@ -14,6 +14,7 @@ import threading
|
||||
import time
|
||||
import uuid
|
||||
from collections import defaultdict
|
||||
from collections.abc import Mapping
|
||||
from contextlib import closing, contextmanager
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
@@ -79,7 +80,12 @@ from fixtures.remote_storage import (
|
||||
default_remote_storage,
|
||||
remote_storage_to_toml_dict,
|
||||
)
|
||||
from fixtures.safekeeper.http import SafekeeperHttpClient
|
||||
from fixtures.safekeeper.http import (
|
||||
MembershipConfiguration,
|
||||
SafekeeperHttpClient,
|
||||
SafekeeperId,
|
||||
TimelineCreateRequest,
|
||||
)
|
||||
from fixtures.safekeeper.utils import wait_walreceivers_absent
|
||||
from fixtures.utils import (
|
||||
ATTACHMENT_NAME_REGEX,
|
||||
@@ -1980,10 +1986,13 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
tenant_shard_id: TenantId | TenantShardId,
|
||||
pageserver_id: int,
|
||||
generation_override: int | None = None,
|
||||
config: None | dict[str, Any] = None,
|
||||
) -> int:
|
||||
body = {"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id}
|
||||
if generation_override is not None:
|
||||
body["generation_override"] = generation_override
|
||||
if config is not None:
|
||||
body["config"] = config
|
||||
|
||||
response = self.request(
|
||||
"POST",
|
||||
@@ -2878,13 +2887,14 @@ class NeonPageserver(PgProtocol, LogUtils):
|
||||
self,
|
||||
immediate: bool = False,
|
||||
timeout_in_seconds: int | None = None,
|
||||
extra_env_vars: dict[str, str] | None = None,
|
||||
):
|
||||
"""
|
||||
High level wrapper for restart: restarts the process, and waits for
|
||||
tenant state to stabilize.
|
||||
"""
|
||||
self.stop(immediate=immediate)
|
||||
self.start(timeout_in_seconds=timeout_in_seconds)
|
||||
self.start(timeout_in_seconds=timeout_in_seconds, extra_env_vars=extra_env_vars)
|
||||
self.quiesce_tenants()
|
||||
|
||||
def quiesce_tenants(self):
|
||||
@@ -2973,11 +2983,12 @@ class NeonPageserver(PgProtocol, LogUtils):
|
||||
to call into the pageserver HTTP client.
|
||||
"""
|
||||
client = self.http_client()
|
||||
if generation is None:
|
||||
generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id)
|
||||
elif override_storage_controller_generation:
|
||||
if generation is None or override_storage_controller_generation:
|
||||
generation = self.env.storage_controller.attach_hook_issue(
|
||||
tenant_id, self.id, generation
|
||||
tenant_id,
|
||||
self.id,
|
||||
generation_override=generation if override_storage_controller_generation else None,
|
||||
config=config,
|
||||
)
|
||||
return client.tenant_attach(
|
||||
tenant_id,
|
||||
@@ -4291,28 +4302,29 @@ class Endpoint(PgProtocol, LogUtils):
|
||||
|
||||
def respec_deep(self, **kwargs: Any) -> None:
|
||||
"""
|
||||
Update the endpoint.json file taking into account nested keys.
|
||||
It does one level deep update. Should enough for most cases.
|
||||
Distinct method from respec() to do not break existing functionality.
|
||||
Update the spec.json file taking into account nested keys.
|
||||
Distinct method from respec() to not break existing functionality.
|
||||
NOTE: This method also updates the spec.json file, not endpoint.json.
|
||||
We need it because neon_local also writes to spec.json, so intended
|
||||
use-case is i) start endpoint with some config, ii) respec_deep(),
|
||||
iii) call reconfigure() to apply the changes.
|
||||
"""
|
||||
|
||||
def update(curr, patch):
|
||||
for k, v in patch.items():
|
||||
if isinstance(v, Mapping):
|
||||
curr[k] = update(curr.get(k, {}), v)
|
||||
else:
|
||||
curr[k] = v
|
||||
return curr
|
||||
|
||||
config_path = os.path.join(self.endpoint_path(), "spec.json")
|
||||
with open(config_path) as f:
|
||||
data_dict: dict[str, Any] = json.load(f)
|
||||
|
||||
log.debug("Current compute spec: %s", json.dumps(data_dict, indent=4))
|
||||
|
||||
for key, value in kwargs.items():
|
||||
if isinstance(value, dict):
|
||||
if key not in data_dict:
|
||||
data_dict[key] = value
|
||||
else:
|
||||
data_dict[key] = {**data_dict[key], **value}
|
||||
else:
|
||||
data_dict[key] = value
|
||||
update(data_dict, kwargs)
|
||||
|
||||
with open(config_path, "w") as file:
|
||||
log.debug("Updating compute spec to: %s", json.dumps(data_dict, indent=4))
|
||||
@@ -4839,6 +4851,50 @@ class Safekeeper(LogUtils):
|
||||
|
||||
wait_until(paused)
|
||||
|
||||
@staticmethod
|
||||
def sks_to_safekeeper_ids(sks: list[Safekeeper]) -> list[SafekeeperId]:
|
||||
return [SafekeeperId(sk.id, "localhost", sk.port.pg_tenant_only) for sk in sks]
|
||||
|
||||
@staticmethod
|
||||
def mconf_sks(env: NeonEnv, mconf: MembershipConfiguration) -> list[Safekeeper]:
|
||||
"""
|
||||
List of Safekeepers which are members in `mconf`.
|
||||
"""
|
||||
members_ids = [m.id for m in mconf.members]
|
||||
new_members_ids = [m.id for m in mconf.new_members] if mconf.new_members is not None else []
|
||||
return [sk for sk in env.safekeepers if sk.id in members_ids or sk.id in new_members_ids]
|
||||
|
||||
@staticmethod
|
||||
def create_timeline(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
ps: NeonPageserver,
|
||||
mconf: MembershipConfiguration,
|
||||
members_sks: list[Safekeeper],
|
||||
):
|
||||
"""
|
||||
Manually create timeline on safekeepers with given (presumably inital)
|
||||
mconf: figure out LSN from pageserver, bake request and execute it on
|
||||
given safekeepers.
|
||||
|
||||
Normally done by storcon, but some tests want to do it manually so far.
|
||||
"""
|
||||
ps_http_cli = ps.http_client()
|
||||
# figure out initial LSN.
|
||||
ps_timeline_detail = ps_http_cli.timeline_detail(tenant_id, timeline_id)
|
||||
init_lsn = ps_timeline_detail["last_record_lsn"]
|
||||
log.info(f"initial LSN: {init_lsn}")
|
||||
# sk timeline creation request expects minor version
|
||||
pg_version = ps_timeline_detail["pg_version"] * 10000
|
||||
# create inital mconf
|
||||
create_r = TimelineCreateRequest(
|
||||
tenant_id, timeline_id, mconf, pg_version, Lsn(init_lsn), commit_lsn=None
|
||||
)
|
||||
log.info(f"sending timeline create: {create_r.to_json()}")
|
||||
|
||||
for sk in members_sks:
|
||||
sk.http_client().timeline_create(create_r)
|
||||
|
||||
|
||||
class NeonBroker(LogUtils):
|
||||
"""An object managing storage_broker instance"""
|
||||
|
||||
@@ -65,13 +65,11 @@ def single_timeline(
|
||||
assert ps_http.tenant_list() == []
|
||||
|
||||
def attach(tenant):
|
||||
# NB: create the new tenant in the storage controller with the correct tenant config. This
|
||||
# will pick up the existing tenant data from remote storage. If we just attach it to the
|
||||
# Pageserver, the storage controller will reset the tenant config to the default.
|
||||
env.create_tenant(
|
||||
tenant_id=tenant,
|
||||
timeline_id=template_timeline,
|
||||
conf=template_config,
|
||||
env.pageserver.tenant_attach(
|
||||
tenant,
|
||||
config=template_config,
|
||||
generation=100,
|
||||
override_storage_controller_generation=True,
|
||||
)
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=22) as executor:
|
||||
|
||||
@@ -25,7 +25,7 @@ class Walreceiver:
|
||||
|
||||
@dataclass
|
||||
class SafekeeperTimelineStatus:
|
||||
mconf: Configuration | None
|
||||
mconf: MembershipConfiguration | None
|
||||
term: int
|
||||
last_log_term: int
|
||||
pg_version: int # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2
|
||||
@@ -78,17 +78,17 @@ class SafekeeperId:
|
||||
|
||||
|
||||
@dataclass
|
||||
class Configuration:
|
||||
class MembershipConfiguration:
|
||||
generation: int
|
||||
members: list[SafekeeperId]
|
||||
new_members: list[SafekeeperId] | None
|
||||
|
||||
@classmethod
|
||||
def from_json(cls, d: dict[str, Any]) -> Configuration:
|
||||
def from_json(cls, d: dict[str, Any]) -> MembershipConfiguration:
|
||||
generation = d["generation"]
|
||||
members = d["members"]
|
||||
new_members = d.get("new_members")
|
||||
return Configuration(generation, members, new_members)
|
||||
return MembershipConfiguration(generation, members, new_members)
|
||||
|
||||
def to_json(self) -> str:
|
||||
return json.dumps(self, cls=EnhancedJSONEncoder)
|
||||
@@ -98,7 +98,7 @@ class Configuration:
|
||||
class TimelineCreateRequest:
|
||||
tenant_id: TenantId
|
||||
timeline_id: TimelineId
|
||||
mconf: Configuration
|
||||
mconf: MembershipConfiguration
|
||||
# not exactly PgVersion, for example 150002 for 15.2
|
||||
pg_version: int
|
||||
start_lsn: Lsn
|
||||
@@ -110,13 +110,13 @@ class TimelineCreateRequest:
|
||||
|
||||
@dataclass
|
||||
class TimelineMembershipSwitchResponse:
|
||||
previous_conf: Configuration
|
||||
current_conf: Configuration
|
||||
previous_conf: MembershipConfiguration
|
||||
current_conf: MembershipConfiguration
|
||||
|
||||
@classmethod
|
||||
def from_json(cls, d: dict[str, Any]) -> TimelineMembershipSwitchResponse:
|
||||
previous_conf = Configuration.from_json(d["previous_conf"])
|
||||
current_conf = Configuration.from_json(d["current_conf"])
|
||||
previous_conf = MembershipConfiguration.from_json(d["previous_conf"])
|
||||
current_conf = MembershipConfiguration.from_json(d["current_conf"])
|
||||
return TimelineMembershipSwitchResponse(previous_conf, current_conf)
|
||||
|
||||
|
||||
@@ -194,7 +194,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
|
||||
resj = res.json()
|
||||
walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]]
|
||||
# It is always normally not None, it is allowed only to make forward compat tests happy.
|
||||
mconf = Configuration.from_json(resj["mconf"]) if "mconf" in resj else None
|
||||
mconf = MembershipConfiguration.from_json(resj["mconf"]) if "mconf" in resj else None
|
||||
return SafekeeperTimelineStatus(
|
||||
mconf=mconf,
|
||||
term=resj["acceptor_state"]["term"],
|
||||
@@ -223,7 +223,9 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
|
||||
return self.timeline_status(tenant_id, timeline_id).commit_lsn
|
||||
|
||||
# Get timeline membership configuration.
|
||||
def get_membership(self, tenant_id: TenantId, timeline_id: TimelineId) -> Configuration:
|
||||
def get_membership(
|
||||
self, tenant_id: TenantId, timeline_id: TimelineId
|
||||
) -> MembershipConfiguration:
|
||||
# make mypy happy
|
||||
return self.timeline_status(tenant_id, timeline_id).mconf # type: ignore
|
||||
|
||||
@@ -275,7 +277,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
|
||||
return res_json
|
||||
|
||||
def timeline_exclude(
|
||||
self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration
|
||||
self, tenant_id: TenantId, timeline_id: TimelineId, to: MembershipConfiguration
|
||||
) -> dict[str, Any]:
|
||||
res = self.put(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/exclude",
|
||||
@@ -287,7 +289,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
|
||||
return res_json
|
||||
|
||||
def membership_switch(
|
||||
self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration
|
||||
self, tenant_id: TenantId, timeline_id: TimelineId, to: MembershipConfiguration
|
||||
) -> TimelineMembershipSwitchResponse:
|
||||
res = self.put(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/membership",
|
||||
|
||||
@@ -66,11 +66,11 @@ def test_basebackup_with_high_slru_count(
|
||||
|
||||
n_txns = 500000
|
||||
|
||||
def setup_wrapper(env: NeonEnv):
|
||||
return setup_tenant_template(env, n_txns)
|
||||
|
||||
env = setup_pageserver_with_tenants(
|
||||
neon_env_builder, f"large_slru_count-{n_tenants}-{n_txns}", n_tenants, setup_wrapper
|
||||
neon_env_builder,
|
||||
f"large_slru_count-{n_tenants}-{n_txns}",
|
||||
n_tenants,
|
||||
lambda env: setup_tenant_template(env, n_txns),
|
||||
)
|
||||
run_benchmark(env, pg_bin, record, duration)
|
||||
|
||||
@@ -80,10 +80,6 @@ def setup_tenant_template(env: NeonEnv, n_txns: int):
|
||||
"gc_period": "0s", # disable periodic gc
|
||||
"checkpoint_timeout": "10 years",
|
||||
"compaction_period": "0s", # disable periodic compaction
|
||||
"compaction_threshold": 10,
|
||||
"compaction_target_size": 134217728,
|
||||
"checkpoint_distance": 268435456,
|
||||
"image_creation_threshold": 3,
|
||||
}
|
||||
|
||||
template_tenant, template_timeline = env.create_tenant(set_default=True)
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
import concurrent.futures
|
||||
import dataclasses
|
||||
import json
|
||||
import re
|
||||
import threading
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
@@ -31,15 +34,15 @@ class PageServicePipeliningConfigPipelined(PageServicePipeliningConfig):
|
||||
mode: str = "pipelined"
|
||||
|
||||
|
||||
EXECUTION = ["concurrent-futures", "tasks"]
|
||||
EXECUTION = ["concurrent-futures"]
|
||||
|
||||
NON_BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()]
|
||||
for max_batch_size in [1, 32]:
|
||||
for execution in EXECUTION:
|
||||
NON_BATCHABLE.append(PageServicePipeliningConfigPipelined(max_batch_size, execution))
|
||||
|
||||
BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()]
|
||||
for max_batch_size in [1, 2, 4, 8, 16, 32]:
|
||||
BATCHABLE: list[PageServicePipeliningConfig] = []
|
||||
for max_batch_size in [32]:
|
||||
for execution in EXECUTION:
|
||||
BATCHABLE.append(PageServicePipeliningConfigPipelined(max_batch_size, execution))
|
||||
|
||||
@@ -47,19 +50,6 @@ for max_batch_size in [1, 2, 4, 8, 16, 32]:
|
||||
@pytest.mark.parametrize(
|
||||
"tablesize_mib, pipelining_config, target_runtime, effective_io_concurrency, readhead_buffer_size, name",
|
||||
[
|
||||
# non-batchable workloads
|
||||
# (A separate benchmark will consider latency).
|
||||
*[
|
||||
(
|
||||
50,
|
||||
config,
|
||||
TARGET_RUNTIME,
|
||||
1,
|
||||
128,
|
||||
f"not batchable {dataclasses.asdict(config)}",
|
||||
)
|
||||
for config in NON_BATCHABLE
|
||||
],
|
||||
# batchable workloads should show throughput and CPU efficiency improvements
|
||||
*[
|
||||
(
|
||||
@@ -137,7 +127,14 @@ def test_throughput(
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
ps_http = env.pageserver.http_client()
|
||||
endpoint = env.endpoints.create_start("main")
|
||||
endpoint = env.endpoints.create_start(
|
||||
"main",
|
||||
config_lines=[
|
||||
# minimal lfc & small shared buffers to force requests to pageserver
|
||||
"neon.max_file_cache_size=1MB",
|
||||
"shared_buffers=10MB",
|
||||
],
|
||||
)
|
||||
conn = endpoint.connect()
|
||||
cur = conn.cursor()
|
||||
|
||||
@@ -155,7 +152,6 @@ def test_throughput(
|
||||
tablesize = tablesize_mib * 1024 * 1024
|
||||
npages = tablesize // (8 * 1024)
|
||||
cur.execute("INSERT INTO t SELECT generate_series(1, %s)", (npages,))
|
||||
# TODO: can we force postgres to do sequential scans?
|
||||
|
||||
#
|
||||
# Run the workload, collect `Metrics` before and after, calculate difference, normalize.
|
||||
@@ -211,31 +207,73 @@ def test_throughput(
|
||||
).value,
|
||||
)
|
||||
|
||||
def workload() -> Metrics:
|
||||
def workload(disruptor_started: threading.Event) -> Metrics:
|
||||
disruptor_started.wait()
|
||||
start = time.time()
|
||||
iters = 0
|
||||
while time.time() - start < target_runtime or iters < 2:
|
||||
log.info("Seqscan %d", iters)
|
||||
if iters == 1:
|
||||
# round zero for warming up
|
||||
before = get_metrics()
|
||||
cur.execute(
|
||||
"select clear_buffer_cache()"
|
||||
) # TODO: what about LFC? doesn't matter right now because LFC isn't enabled by default in tests
|
||||
cur.execute("select sum(data::bigint) from t")
|
||||
assert cur.fetchall()[0][0] == npages * (npages + 1) // 2
|
||||
iters += 1
|
||||
after = get_metrics()
|
||||
return (after - before).normalize(iters - 1)
|
||||
|
||||
def disruptor(disruptor_started: threading.Event, stop_disruptor: threading.Event):
|
||||
conn = endpoint.connect()
|
||||
cur = conn.cursor()
|
||||
iters = 0
|
||||
while True:
|
||||
cur.execute("SELECT pg_logical_emit_message(true, 'test', 'advancelsn')")
|
||||
if stop_disruptor.is_set():
|
||||
break
|
||||
disruptor_started.set()
|
||||
iters += 1
|
||||
time.sleep(0.001)
|
||||
return iters
|
||||
|
||||
env.pageserver.patch_config_toml_nonrecursive(
|
||||
{"page_service_pipelining": dataclasses.asdict(pipelining_config)}
|
||||
)
|
||||
env.pageserver.restart()
|
||||
metrics = workload()
|
||||
|
||||
# set trace for log analysis below
|
||||
env.pageserver.restart(extra_env_vars={"RUST_LOG": "info,pageserver::page_service=trace"})
|
||||
|
||||
log.info("Starting workload")
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
disruptor_started = threading.Event()
|
||||
stop_disruptor = threading.Event()
|
||||
disruptor_fut = executor.submit(disruptor, disruptor_started, stop_disruptor)
|
||||
workload_fut = executor.submit(workload, disruptor_started)
|
||||
metrics = workload_fut.result()
|
||||
stop_disruptor.set()
|
||||
ndisruptions = disruptor_fut.result()
|
||||
log.info("Disruptor issued %d disrupting requests", ndisruptions)
|
||||
|
||||
log.info("Results: %s", metrics)
|
||||
|
||||
since_last_start: list[str] = []
|
||||
for line in env.pageserver.logfile.read_text().splitlines():
|
||||
if "git:" in line:
|
||||
since_last_start = []
|
||||
since_last_start.append(line)
|
||||
|
||||
stopping_batching_because_re = re.compile(
|
||||
r"stopping batching because (LSN changed|of batch size|timeline object mismatch|batch key changed|same page was requested at different LSNs|.*)"
|
||||
)
|
||||
reasons_for_stopping_batching = {}
|
||||
for line in since_last_start:
|
||||
match = stopping_batching_because_re.search(line)
|
||||
if match:
|
||||
if match.group(1) not in reasons_for_stopping_batching:
|
||||
reasons_for_stopping_batching[match.group(1)] = 0
|
||||
reasons_for_stopping_batching[match.group(1)] += 1
|
||||
|
||||
log.info("Reasons for stopping batching: %s", reasons_for_stopping_batching)
|
||||
|
||||
#
|
||||
# Sanity-checks on the collected data
|
||||
#
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import copy
|
||||
import json
|
||||
import uuid
|
||||
from typing import TYPE_CHECKING
|
||||
@@ -16,7 +15,6 @@ if TYPE_CHECKING:
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
|
||||
|
||||
|
||||
@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/11395")
|
||||
def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
@@ -44,7 +42,6 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
|
||||
"refill_interval": "100ms",
|
||||
"refill_amount": int(rate_limit_rps / 10),
|
||||
"max": int(rate_limit_rps / 10),
|
||||
"fair": True,
|
||||
},
|
||||
},
|
||||
)
|
||||
@@ -98,17 +95,12 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
|
||||
_, marker_offset = wait_until(lambda: env.pageserver.assert_log_contains(marker, offset=None))
|
||||
|
||||
log.info("run pagebench")
|
||||
duration_secs = 10
|
||||
duration_secs = 20
|
||||
actual_ncompleted = run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs)
|
||||
|
||||
log.info("validate the client is capped at the configured rps limit")
|
||||
expect_ncompleted = duration_secs * rate_limit_rps
|
||||
delta_abs = abs(expect_ncompleted - actual_ncompleted)
|
||||
threshold = 0.05 * expect_ncompleted
|
||||
assert threshold / rate_limit_rps < 0.1 * duration_secs, (
|
||||
"test self-test: unrealistic expecations regarding precision in this test"
|
||||
)
|
||||
assert delta_abs < 0.05 * expect_ncompleted, (
|
||||
assert pytest.approx(expect_ncompleted, 0.05) == actual_ncompleted, (
|
||||
"the throttling deviates more than 5percent from the expectation"
|
||||
)
|
||||
|
||||
@@ -122,6 +114,7 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
|
||||
timeout=compaction_period,
|
||||
)
|
||||
|
||||
log.info("validate the metrics")
|
||||
smgr_query_seconds_post = ps_http.get_metric_value(smgr_metric_name, smgr_metrics_query)
|
||||
assert smgr_query_seconds_post is not None
|
||||
throttled_usecs_post = ps_http.get_metric_value(throttle_metric_name, throttle_metrics_query)
|
||||
@@ -130,72 +123,13 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
|
||||
actual_throttled_usecs = throttled_usecs_post - throttled_usecs_pre
|
||||
actual_throttled_secs = actual_throttled_usecs / 1_000_000
|
||||
|
||||
log.info("validate that the metric doesn't include throttle wait time")
|
||||
assert duration_secs >= 10 * actual_smgr_query_seconds, (
|
||||
"smgr metrics should not include throttle wait time"
|
||||
)
|
||||
|
||||
log.info("validate that the throttling wait time metrics is correct")
|
||||
assert pytest.approx(actual_throttled_secs + actual_smgr_query_seconds, 0.1) == duration_secs, (
|
||||
"most of the time in this test is spent throttled because the rate-limit's contribution to latency dominates"
|
||||
"throttling and processing latency = total request time; this assert validates thi holds on average"
|
||||
)
|
||||
|
||||
|
||||
throttle_config_with_field_fair_set = {
|
||||
"task_kinds": ["PageRequestHandler"],
|
||||
"fair": True,
|
||||
"initial": 27,
|
||||
"refill_interval": "43s",
|
||||
"refill_amount": 23,
|
||||
"max": 42,
|
||||
}
|
||||
|
||||
|
||||
def assert_throttle_config_with_field_fair_set(conf):
|
||||
"""
|
||||
Field `fair` is ignored, so, responses don't contain it
|
||||
"""
|
||||
without_fair = copy.deepcopy(throttle_config_with_field_fair_set)
|
||||
without_fair.pop("fair")
|
||||
|
||||
assert conf == without_fair
|
||||
|
||||
|
||||
def test_throttle_fair_config_is_settable_but_ignored_in_mgmt_api(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out.
|
||||
"""
|
||||
env = neon_env_builder.init_start()
|
||||
vps_http = env.storage_controller.pageserver_api()
|
||||
# with_fair config should still be settable
|
||||
vps_http.set_tenant_config(
|
||||
env.initial_tenant,
|
||||
{"timeline_get_throttle": throttle_config_with_field_fair_set},
|
||||
)
|
||||
conf = vps_http.tenant_config(env.initial_tenant)
|
||||
assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"])
|
||||
assert_throttle_config_with_field_fair_set(
|
||||
conf.tenant_specific_overrides["timeline_get_throttle"]
|
||||
)
|
||||
|
||||
|
||||
def test_throttle_fair_config_is_settable_but_ignored_in_config_toml(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
):
|
||||
"""
|
||||
To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out.
|
||||
"""
|
||||
|
||||
def set_tenant_config(ps_cfg):
|
||||
tenant_config = ps_cfg.setdefault("tenant_config", {})
|
||||
tenant_config["timeline_get_throttle"] = throttle_config_with_field_fair_set
|
||||
|
||||
neon_env_builder.pageserver_config_override = set_tenant_config
|
||||
env = neon_env_builder.init_start()
|
||||
ps_http = env.pageserver.http_client()
|
||||
conf = ps_http.tenant_config(env.initial_tenant)
|
||||
assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"])
|
||||
|
||||
env.pageserver.allowed_errors.append(
|
||||
r'.*ignoring unknown configuration item path="tenant_config\.timeline_get_throttle\.fair"*'
|
||||
# without this assertion, the test would pass even if the throttling was completely broken
|
||||
# but the request processing is so slow that it makes up for the latency that a correct throttling
|
||||
# implementation would add
|
||||
assert actual_smgr_query_seconds < 0.66 * duration_secs, (
|
||||
"test self-test: request processing is consuming most of the wall clock time; this risks that we're not actually testing throttling"
|
||||
)
|
||||
|
||||
@@ -242,7 +242,13 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver,
|
||||
pageserver.tenant_location_configure(tenant_id, location_conf)
|
||||
last_state[pageserver.id] = (mode, generation)
|
||||
|
||||
if mode.startswith("Attached"):
|
||||
# It's only valid to connect to the last generation. Newer generations may yank layer
|
||||
# files used in older generations.
|
||||
last_generation = max(
|
||||
[s[1] for s in last_state.values() if s[1] is not None], default=None
|
||||
)
|
||||
|
||||
if mode.startswith("Attached") and generation == last_generation:
|
||||
# This is a basic test: we are validating that he endpoint works properly _between_
|
||||
# configuration changes. A stronger test would be to validate that clients see
|
||||
# no errors while we are making the changes.
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import os
|
||||
import ssl
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
@@ -151,3 +152,63 @@ def test_certificate_rotation(neon_env_builder: NeonEnvBuilder):
|
||||
requests.get(addr, verify=str(env.ssl_ca_file)).raise_for_status()
|
||||
cur_cert = ssl.get_server_certificate(("localhost", port))
|
||||
assert cur_cert == sk_cert
|
||||
|
||||
|
||||
def test_server_and_cert_metrics(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Test metrics exported from http/https server and tls cert reloader.
|
||||
"""
|
||||
neon_env_builder.use_https_pageserver_api = True
|
||||
neon_env_builder.pageserver_config_override = "ssl_cert_reload_period='100 ms'"
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
env.pageserver.allowed_errors.append(".*Error reloading certificate.*")
|
||||
|
||||
ps_client = env.pageserver.http_client()
|
||||
|
||||
# 1. Test connection started metric.
|
||||
filter_https = {"scheme": "https"}
|
||||
old_https_conn_count = (
|
||||
ps_client.get_metric_value("http_server_connection_started_total", filter_https) or 0
|
||||
)
|
||||
|
||||
addr = f"https://localhost:{env.pageserver.service_port.https}/v1/status"
|
||||
requests.get(addr, verify=str(env.ssl_ca_file)).raise_for_status()
|
||||
|
||||
new_https_conn_count = (
|
||||
ps_client.get_metric_value("http_server_connection_started_total", filter_https) or 0
|
||||
)
|
||||
# The counter should increase after the request,
|
||||
# but it may increase by more than one because of storcon requests.
|
||||
assert new_https_conn_count > old_https_conn_count
|
||||
|
||||
# 2. Test tls connection error.
|
||||
# Request without specified CA cert file should fail.
|
||||
with pytest.raises(requests.exceptions.SSLError):
|
||||
requests.get(addr)
|
||||
|
||||
tls_error_cnt = (
|
||||
ps_client.get_metric_value("http_server_connection_errors_total", {"type": "tls"}) or 0
|
||||
)
|
||||
assert tls_error_cnt == 1
|
||||
|
||||
# 3. Test expiration time metric.
|
||||
expiration_time = datetime.fromtimestamp(
|
||||
ps_client.get_metric_value("tls_certs_expiration_time_seconds") or 0
|
||||
)
|
||||
now = datetime.now()
|
||||
# neon_local generates certs valid for 100 years.
|
||||
# Compare with +-1 year to not care about leap years.
|
||||
assert now + timedelta(days=365 * 99) < expiration_time < now + timedelta(days=365 * 101)
|
||||
|
||||
# 4. Test cert reload failed metric.
|
||||
reload_error_cnt = ps_client.get_metric_value("tls_certs_reload_failed_total")
|
||||
assert reload_error_cnt == 0
|
||||
|
||||
os.remove(env.pageserver.workdir / "server.crt")
|
||||
|
||||
def reload_failed():
|
||||
reload_error_cnt = ps_client.get_metric_value("tls_certs_reload_failed_total") or 0
|
||||
assert reload_error_cnt > 0
|
||||
|
||||
wait_until(reload_failed)
|
||||
|
||||
@@ -2892,10 +2892,12 @@ def test_storage_controller_leadership_transfer(
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("step_down_times_out", [False, True])
|
||||
def test_storage_controller_leadership_transfer_during_split(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
storage_controller_proxy: StorageControllerProxy,
|
||||
port_distributor: PortDistributor,
|
||||
step_down_times_out: bool,
|
||||
):
|
||||
"""
|
||||
Exercise a race between shard splitting and graceful leadership transfer. This is
|
||||
@@ -2936,6 +2938,18 @@ def test_storage_controller_leadership_transfer_during_split(
|
||||
)
|
||||
env.storage_controller.reconcile_until_idle()
|
||||
|
||||
# We are testing scenarios where the step down API does not complete: either because it is stuck
|
||||
# doing a shard split, or because it totally times out on some other failpoint.
|
||||
env.storage_controller.allowed_errors.extend(
|
||||
[
|
||||
".*step_down.*request was dropped before completing.*",
|
||||
".*step_down.*operation timed out.*",
|
||||
".*Send step down request failed, will retry.*",
|
||||
".*Send step down request still failed after.*retries.*",
|
||||
".*Leader .+ did not respond to step-down request.*",
|
||||
]
|
||||
)
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
|
||||
# Start a shard split
|
||||
env.storage_controller.allowed_errors.extend(
|
||||
@@ -2943,6 +2957,14 @@ def test_storage_controller_leadership_transfer_during_split(
|
||||
)
|
||||
pause_failpoint = "shard-split-pre-complete"
|
||||
env.storage_controller.configure_failpoints((pause_failpoint, "pause"))
|
||||
|
||||
if not step_down_times_out:
|
||||
# Prevent the timeout self-terminate code from executing: we will block step down on the
|
||||
# shard split itself
|
||||
env.storage_controller.configure_failpoints(
|
||||
("step-down-delay-timeout", "return(3600000)")
|
||||
)
|
||||
|
||||
split_fut = executor.submit(
|
||||
env.storage_controller.tenant_shard_split, list(tenants)[0], shard_count * 2
|
||||
)
|
||||
@@ -2961,12 +2983,20 @@ def test_storage_controller_leadership_transfer_during_split(
|
||||
timeout_in_seconds=30, instance_id=2, base_port=storage_controller_2_port
|
||||
)
|
||||
|
||||
if step_down_times_out:
|
||||
# Step down will time out, original controller will terminate itself
|
||||
env.storage_controller.allowed_errors.extend([".*terminating process.*"])
|
||||
else:
|
||||
# Step down does not time out: original controller hits its shard split completion
|
||||
# code path and realises that it must not purge the parent shards from the database.
|
||||
env.storage_controller.allowed_errors.extend([".*Enqueuing background abort.*"])
|
||||
|
||||
def passed_split_abort():
|
||||
try:
|
||||
log.info("Checking log for pattern...")
|
||||
assert env.storage_controller.log_contains(
|
||||
".*Using observed state received from leader.*"
|
||||
)
|
||||
# This log is indicative of entering startup_reconcile, which happens
|
||||
# after the point we would abort shard splits
|
||||
assert env.storage_controller.log_contains(".*Populating tenant shards.*")
|
||||
except Exception:
|
||||
log.exception("Failed to find pattern in log")
|
||||
raise
|
||||
@@ -2975,34 +3005,42 @@ def test_storage_controller_leadership_transfer_during_split(
|
||||
wait_until(passed_split_abort, interval=0.1, status_interval=1.0)
|
||||
assert env.storage_controller.log_contains(".*Aborting shard split.*")
|
||||
|
||||
# Proxy is still talking to original controller here: disable its pause failpoint so
|
||||
# that its shard split can run to completion.
|
||||
log.info("Disabling failpoint")
|
||||
# Bypass the proxy: the python test HTTPServer is single threaded and still blocked
|
||||
# on handling the shard split request.
|
||||
env.storage_controller.request(
|
||||
"PUT",
|
||||
f"http://127.0.0.1:{storage_controller_1_port}/debug/v1/failpoints",
|
||||
json=[{"name": "shard-split-pre-complete", "actions": "off"}],
|
||||
headers=env.storage_controller.headers(TokenScope.ADMIN),
|
||||
)
|
||||
if step_down_times_out:
|
||||
# We will let the old controller hit a timeout path where it terminates itself, rather than
|
||||
# completing step_down and trying to complete a shard split
|
||||
def old_controller_terminated():
|
||||
assert env.storage_controller.log_contains(".*terminating process.*")
|
||||
|
||||
def previous_stepped_down():
|
||||
assert (
|
||||
env.storage_controller.get_leadership_status()
|
||||
== StorageControllerLeadershipStatus.STEPPED_DOWN
|
||||
wait_until(old_controller_terminated)
|
||||
else:
|
||||
# Proxy is still talking to original controller here: disable its pause failpoint so
|
||||
# that its shard split can run to completion.
|
||||
log.info("Disabling failpoint")
|
||||
# Bypass the proxy: the python test HTTPServer is single threaded and still blocked
|
||||
# on handling the shard split request.
|
||||
env.storage_controller.request(
|
||||
"PUT",
|
||||
f"http://127.0.0.1:{storage_controller_1_port}/debug/v1/failpoints",
|
||||
json=[{"name": "shard-split-pre-complete", "actions": "off"}],
|
||||
headers=env.storage_controller.headers(TokenScope.ADMIN),
|
||||
)
|
||||
|
||||
log.info("Awaiting step down")
|
||||
wait_until(previous_stepped_down)
|
||||
def previous_stepped_down():
|
||||
assert (
|
||||
env.storage_controller.get_leadership_status()
|
||||
== StorageControllerLeadershipStatus.STEPPED_DOWN
|
||||
)
|
||||
|
||||
# Let the shard split complete: this may happen _after_ the replacement has come up
|
||||
# and tried to clean up the databases
|
||||
log.info("Unblocking & awaiting shard split")
|
||||
with pytest.raises(Exception, match="Unexpected child shard count"):
|
||||
# This split fails when it tries to persist results, because it encounters
|
||||
# changes already made by the new controller's abort-on-startup
|
||||
split_fut.result()
|
||||
log.info("Awaiting step down")
|
||||
wait_until(previous_stepped_down)
|
||||
|
||||
# Let the shard split complete: this may happen _after_ the replacement has come up
|
||||
# and tried to clean up the databases
|
||||
log.info("Unblocking & awaiting shard split")
|
||||
with pytest.raises(Exception, match="Unexpected child shard count"):
|
||||
# This split fails when it tries to persist results, because it encounters
|
||||
# changes already made by the new controller's abort-on-startup
|
||||
split_fut.result()
|
||||
|
||||
log.info("Routing to new leader")
|
||||
storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_2_port}")
|
||||
@@ -3020,13 +3058,14 @@ def test_storage_controller_leadership_transfer_during_split(
|
||||
env.storage_controller.wait_until_ready()
|
||||
env.storage_controller.consistency_check()
|
||||
|
||||
# Check that the stepped down instance forwards requests
|
||||
# to the new leader while it's still running.
|
||||
storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}")
|
||||
env.storage_controller.tenant_shard_dump()
|
||||
env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"})
|
||||
status = env.storage_controller.node_status(env.pageservers[0].id)
|
||||
assert status["scheduling"] == "Pause"
|
||||
if not step_down_times_out:
|
||||
# Check that the stepped down instance forwards requests
|
||||
# to the new leader while it's still running.
|
||||
storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}")
|
||||
env.storage_controller.tenant_shard_dump()
|
||||
env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"})
|
||||
status = env.storage_controller.node_status(env.pageservers[0].id)
|
||||
assert status["scheduling"] == "Pause"
|
||||
|
||||
|
||||
def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
@@ -45,7 +45,7 @@ from fixtures.remote_storage import (
|
||||
s3_storage,
|
||||
)
|
||||
from fixtures.safekeeper.http import (
|
||||
Configuration,
|
||||
MembershipConfiguration,
|
||||
SafekeeperHttpClient,
|
||||
SafekeeperId,
|
||||
TimelineCreateRequest,
|
||||
@@ -589,7 +589,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
|
||||
for sk in env.safekeepers:
|
||||
sk.start()
|
||||
cli = sk.http_client()
|
||||
mconf = Configuration(generation=0, members=[], new_members=None)
|
||||
mconf = MembershipConfiguration(generation=0, members=[], new_members=None)
|
||||
# set start_lsn to the beginning of the first segment to allow reading
|
||||
# WAL from there (could you intidb LSN as well).
|
||||
r = TimelineCreateRequest(
|
||||
@@ -1948,7 +1948,7 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder):
|
||||
sk_id_2 = SafekeeperId(11, "localhost", 5434) # just a mock
|
||||
|
||||
# Request to switch before timeline creation should fail.
|
||||
init_conf = Configuration(generation=1, members=[sk_id_1], new_members=None)
|
||||
init_conf = MembershipConfiguration(generation=1, members=[sk_id_1], new_members=None)
|
||||
with pytest.raises(requests.exceptions.HTTPError):
|
||||
http_cli.membership_switch(tenant_id, timeline_id, init_conf)
|
||||
|
||||
@@ -1960,7 +1960,7 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder):
|
||||
http_cli.timeline_create(create_r)
|
||||
|
||||
# Switch into some conf.
|
||||
joint_conf = Configuration(generation=4, members=[sk_id_1], new_members=[sk_id_2])
|
||||
joint_conf = MembershipConfiguration(generation=4, members=[sk_id_1], new_members=[sk_id_2])
|
||||
resp = http_cli.membership_switch(tenant_id, timeline_id, joint_conf)
|
||||
log.info(f"joint switch resp: {resp}")
|
||||
assert resp.previous_conf.generation == 1
|
||||
@@ -1973,24 +1973,26 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder):
|
||||
assert after_restart.generation == 4
|
||||
|
||||
# Switch into non joint conf of which sk is not a member, must fail.
|
||||
non_joint_not_member = Configuration(generation=5, members=[sk_id_2], new_members=None)
|
||||
non_joint_not_member = MembershipConfiguration(
|
||||
generation=5, members=[sk_id_2], new_members=None
|
||||
)
|
||||
with pytest.raises(requests.exceptions.HTTPError):
|
||||
resp = http_cli.membership_switch(tenant_id, timeline_id, non_joint_not_member)
|
||||
|
||||
# Switch into good non joint conf.
|
||||
non_joint = Configuration(generation=6, members=[sk_id_1], new_members=None)
|
||||
non_joint = MembershipConfiguration(generation=6, members=[sk_id_1], new_members=None)
|
||||
resp = http_cli.membership_switch(tenant_id, timeline_id, non_joint)
|
||||
log.info(f"non joint switch resp: {resp}")
|
||||
assert resp.previous_conf.generation == 4
|
||||
assert resp.current_conf.generation == 6
|
||||
|
||||
# Switch request to lower conf should be rejected.
|
||||
lower_conf = Configuration(generation=3, members=[sk_id_1], new_members=None)
|
||||
lower_conf = MembershipConfiguration(generation=3, members=[sk_id_1], new_members=None)
|
||||
with pytest.raises(requests.exceptions.HTTPError):
|
||||
http_cli.membership_switch(tenant_id, timeline_id, lower_conf)
|
||||
|
||||
# Now, exclude sk from the membership, timeline should be deleted.
|
||||
excluded_conf = Configuration(generation=7, members=[sk_id_2], new_members=None)
|
||||
excluded_conf = MembershipConfiguration(generation=7, members=[sk_id_2], new_members=None)
|
||||
http_cli.timeline_exclude(tenant_id, timeline_id, excluded_conf)
|
||||
with pytest.raises(requests.exceptions.HTTPError):
|
||||
http_cli.timeline_status(tenant_id, timeline_id)
|
||||
@@ -2010,11 +2012,6 @@ def test_explicit_timeline_creation(neon_env_builder: NeonEnvBuilder):
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
|
||||
ps = env.pageservers[0]
|
||||
ps_http_cli = ps.http_client()
|
||||
|
||||
http_clis = [sk.http_client() for sk in env.safekeepers]
|
||||
|
||||
config_lines = [
|
||||
"neon.safekeeper_proto_version = 3",
|
||||
]
|
||||
@@ -2023,22 +2020,11 @@ def test_explicit_timeline_creation(neon_env_builder: NeonEnvBuilder):
|
||||
# expected to fail because timeline is not created on safekeepers
|
||||
with pytest.raises(Exception, match=r".*timed out.*"):
|
||||
ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3], timeout="2s")
|
||||
# figure out initial LSN.
|
||||
ps_timeline_detail = ps_http_cli.timeline_detail(tenant_id, timeline_id)
|
||||
init_lsn = ps_timeline_detail["last_record_lsn"]
|
||||
log.info(f"initial LSN: {init_lsn}")
|
||||
# sk timeline creation request expects minor version
|
||||
pg_version = ps_timeline_detail["pg_version"] * 10000
|
||||
# create inital mconf
|
||||
sk_ids = [SafekeeperId(sk.id, "localhost", sk.port.pg_tenant_only) for sk in env.safekeepers]
|
||||
mconf = Configuration(generation=1, members=sk_ids, new_members=None)
|
||||
create_r = TimelineCreateRequest(
|
||||
tenant_id, timeline_id, mconf, pg_version, Lsn(init_lsn), commit_lsn=None
|
||||
mconf = MembershipConfiguration(
|
||||
generation=1, members=Safekeeper.sks_to_safekeeper_ids(env.safekeepers), new_members=None
|
||||
)
|
||||
log.info(f"sending timeline create: {create_r.to_json()}")
|
||||
|
||||
for sk_http_cli in http_clis:
|
||||
sk_http_cli.timeline_create(create_r)
|
||||
Safekeeper.create_timeline(tenant_id, timeline_id, env.pageservers[0], mconf, env.safekeepers)
|
||||
# Once timeline created endpoint should start.
|
||||
ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3])
|
||||
ep.safe_psql("CREATE TABLE IF NOT EXISTS t(key int, value text)")
|
||||
|
||||
@@ -18,6 +18,7 @@ from fixtures.neon_fixtures import (
|
||||
Safekeeper,
|
||||
)
|
||||
from fixtures.remote_storage import RemoteStorageKind
|
||||
from fixtures.safekeeper.http import MembershipConfiguration
|
||||
from fixtures.utils import skip_in_debug_build
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -452,20 +453,24 @@ def test_concurrent_computes(neon_env_builder: NeonEnvBuilder):
|
||||
asyncio.run(run_concurrent_computes(env))
|
||||
|
||||
|
||||
async def assert_query_hangs(endpoint: Endpoint, query: str):
|
||||
"""
|
||||
Start on endpoint query which is expected to hang and check that it does.
|
||||
"""
|
||||
conn = await endpoint.connect_async()
|
||||
bg_query = asyncio.create_task(conn.execute(query))
|
||||
await asyncio.sleep(2)
|
||||
assert not bg_query.done()
|
||||
return bg_query
|
||||
|
||||
|
||||
# Stop safekeeper and check that query cannot be executed while safekeeper is down.
|
||||
# Query will insert a single row into a table.
|
||||
async def check_unavailability(
|
||||
sk: Safekeeper, conn: asyncpg.Connection, key: int, start_delay_sec: int = 2
|
||||
):
|
||||
async def check_unavailability(sk: Safekeeper, ep: Endpoint, key: int, start_delay_sec: int = 2):
|
||||
# shutdown one of two acceptors, that is, majority
|
||||
sk.stop()
|
||||
|
||||
bg_query = asyncio.create_task(conn.execute(f"INSERT INTO t values ({key}, 'payload')"))
|
||||
|
||||
await asyncio.sleep(start_delay_sec)
|
||||
# ensure that the query has not been executed yet
|
||||
assert not bg_query.done()
|
||||
|
||||
bg_query = await assert_query_hangs(ep, f"INSERT INTO t values ({key}, 'payload')")
|
||||
# start safekeeper and await the query
|
||||
sk.start()
|
||||
await bg_query
|
||||
@@ -480,10 +485,10 @@ async def run_unavailability(env: NeonEnv, endpoint: Endpoint):
|
||||
await conn.execute("INSERT INTO t values (1, 'payload')")
|
||||
|
||||
# stop safekeeper and check that query cannot be executed while safekeeper is down
|
||||
await check_unavailability(env.safekeepers[0], conn, 2)
|
||||
await check_unavailability(env.safekeepers[0], endpoint, 2)
|
||||
|
||||
# for the world's balance, do the same with second safekeeper
|
||||
await check_unavailability(env.safekeepers[1], conn, 3)
|
||||
await check_unavailability(env.safekeepers[1], endpoint, 3)
|
||||
|
||||
# check that we can execute queries after restart
|
||||
await conn.execute("INSERT INTO t values (4, 'payload')")
|
||||
@@ -514,15 +519,7 @@ async def run_recovery_uncommitted(env: NeonEnv):
|
||||
# insert with only one safekeeper up to create tail of flushed but not committed WAL
|
||||
sk1.stop()
|
||||
sk2.stop()
|
||||
conn = await ep.connect_async()
|
||||
# query should hang, so execute in separate task
|
||||
bg_query = asyncio.create_task(
|
||||
conn.execute("insert into t select generate_series(1, 2000), 'payload'")
|
||||
)
|
||||
sleep_sec = 2
|
||||
await asyncio.sleep(sleep_sec)
|
||||
# it must still be not finished
|
||||
assert not bg_query.done()
|
||||
await assert_query_hangs(ep, "insert into t select generate_series(1, 2000), 'payload'")
|
||||
# note: destoy will kill compute_ctl, preventing it waiting for hanging sync-safekeepers.
|
||||
ep.stop_and_destroy()
|
||||
|
||||
@@ -559,15 +556,7 @@ async def run_wal_truncation(env: NeonEnv, safekeeper_proto_version: int):
|
||||
# insert with only one sk3 up to create tail of flushed but not committed WAL on it
|
||||
sk1.stop()
|
||||
sk2.stop()
|
||||
conn = await ep.connect_async()
|
||||
# query should hang, so execute in separate task
|
||||
bg_query = asyncio.create_task(
|
||||
conn.execute("insert into t select generate_series(1, 180000), 'Papaya'")
|
||||
)
|
||||
sleep_sec = 2
|
||||
await asyncio.sleep(sleep_sec)
|
||||
# it must still be not finished
|
||||
assert not bg_query.done()
|
||||
await assert_query_hangs(ep, "insert into t select generate_series(1, 180000), 'Papaya'")
|
||||
# note: destoy will kill compute_ctl, preventing it waiting for hanging sync-safekeepers.
|
||||
ep.stop_and_destroy()
|
||||
|
||||
@@ -607,6 +596,132 @@ def test_wal_truncation(neon_env_builder: NeonEnvBuilder, safekeeper_proto_versi
|
||||
asyncio.run(run_wal_truncation(env, safekeeper_proto_version))
|
||||
|
||||
|
||||
async def quorum_sanity_single(
|
||||
env: NeonEnv,
|
||||
compute_sks_ids: list[int],
|
||||
members_sks_ids: list[int],
|
||||
new_members_sks_ids: list[int] | None,
|
||||
sks_to_stop_ids: list[int],
|
||||
should_work_when_stopped: bool,
|
||||
):
|
||||
"""
|
||||
*_ids params contain safekeeper node ids; it is assumed they are issued
|
||||
from 1 and sequentially assigned to env.safekeepers.
|
||||
"""
|
||||
members_sks = [env.safekeepers[i - 1] for i in members_sks_ids]
|
||||
new_members_sks = (
|
||||
[env.safekeepers[i - 1] for i in new_members_sks_ids] if new_members_sks_ids else None
|
||||
)
|
||||
sks_to_stop = [env.safekeepers[i - 1] for i in sks_to_stop_ids]
|
||||
|
||||
mconf = MembershipConfiguration(
|
||||
generation=1,
|
||||
members=Safekeeper.sks_to_safekeeper_ids(members_sks),
|
||||
new_members=Safekeeper.sks_to_safekeeper_ids(new_members_sks) if new_members_sks else None,
|
||||
)
|
||||
members_sks = Safekeeper.mconf_sks(env, mconf)
|
||||
|
||||
tenant_id = env.initial_tenant
|
||||
compute_sks_ids_str = "-".join([str(sk_id) for sk_id in compute_sks_ids])
|
||||
members_sks_ids_str = "-".join([str(sk.id) for sk in mconf.members])
|
||||
new_members_sks_ids_str = "-".join(
|
||||
[str(sk.id) for sk in mconf.new_members] if mconf.new_members is not None else []
|
||||
)
|
||||
sks_to_stop_ids_str = "-".join([str(sk.id) for sk in sks_to_stop])
|
||||
log.info(
|
||||
f"running quorum_sanity_single with compute_sks={compute_sks_ids_str}, members_sks={members_sks_ids_str}, new_members_sks={new_members_sks_ids_str}, sks_to_stop={sks_to_stop_ids_str}, should_work_when_stopped={should_work_when_stopped}"
|
||||
)
|
||||
branch_name = f"test_quorum_single_c{compute_sks_ids_str}_m{members_sks_ids_str}_{new_members_sks_ids_str}_s{sks_to_stop_ids_str}"
|
||||
timeline_id = env.create_branch(branch_name)
|
||||
|
||||
# create timeline on `members_sks`
|
||||
Safekeeper.create_timeline(tenant_id, timeline_id, env.pageservers[0], mconf, members_sks)
|
||||
|
||||
config_lines = [
|
||||
"neon.safekeeper_proto_version = 3",
|
||||
]
|
||||
ep = env.endpoints.create(branch_name, config_lines=config_lines)
|
||||
ep.start(safekeeper_generation=1, safekeepers=compute_sks_ids)
|
||||
ep.safe_psql("create table t(key int, value text)")
|
||||
|
||||
# stop specified sks and check whether writes work
|
||||
for sk in sks_to_stop:
|
||||
sk.stop()
|
||||
if should_work_when_stopped:
|
||||
log.info("checking that writes still work")
|
||||
ep.safe_psql("insert into t select generate_series(1, 100), 'Papaya'")
|
||||
# restarting ep should also be fine
|
||||
ep.stop()
|
||||
ep.start()
|
||||
ep.safe_psql("insert into t select generate_series(1, 100), 'plum'")
|
||||
bg_query = None
|
||||
else:
|
||||
log.info("checking that writes hang")
|
||||
bg_query = await assert_query_hangs(
|
||||
ep, "insert into t select generate_series(1, 100), 'Papaya'"
|
||||
)
|
||||
# start again; now they should work
|
||||
for sk in sks_to_stop:
|
||||
sk.start()
|
||||
if bg_query:
|
||||
log.info("awaiting query")
|
||||
await bg_query
|
||||
|
||||
|
||||
# It's a bit tempting to iterate over all possible combinations, but let's stick
|
||||
# with this for now.
|
||||
async def run_quorum_sanity(env: NeonEnv):
|
||||
# 3 members, all up, should work
|
||||
await quorum_sanity_single(env, [1, 2, 3], [1, 2, 3], None, [], True)
|
||||
# 3 members, 2/3 up, should work
|
||||
await quorum_sanity_single(env, [1, 2, 3], [1, 2, 3], None, [3], True)
|
||||
# 3 members, 1/3 up, should not work
|
||||
await quorum_sanity_single(env, [1, 2, 3], [1, 2, 3], None, [2, 3], False)
|
||||
|
||||
# 3 members, all up, should work; wp redundantly talks to 4th.
|
||||
await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], None, [], True)
|
||||
# 3 members, all up, should work with wp talking to 2 of these 3 + plus one redundant
|
||||
await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], None, [], True)
|
||||
# 3 members, 2/3 up, could work but wp talks to different 3s, so it shouldn't
|
||||
await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], None, [3], False)
|
||||
|
||||
# joint conf of 1-2-3 and 4, all up, should work
|
||||
await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [4], [], True)
|
||||
# joint conf of 1-2-3 and 4, 4 down, shouldn't work
|
||||
await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [4], [4], False)
|
||||
|
||||
# joint conf of 1-2-3 and 2-3-4, all up, should work
|
||||
await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [], True)
|
||||
# joint conf of 1-2-3 and 2-3-4, 1 and 4 down, should work
|
||||
await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [1, 4], True)
|
||||
# joint conf of 1-2-3 and 2-3-4, 2 down, should work
|
||||
await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [2], True)
|
||||
# joint conf of 1-2-3 and 2-3-4, 3 down, should work
|
||||
await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [3], True)
|
||||
# joint conf of 1-2-3 and 2-3-4, 1 and 2 down, shouldn't work
|
||||
await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [1, 2], False)
|
||||
# joint conf of 1-2-3 and 2-3-4, 2 and 4 down, shouldn't work
|
||||
await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [2, 4], False)
|
||||
|
||||
# joint conf of 1-2-3 and 2-3-4 with wp talking to 2-3-4 only.
|
||||
await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], [2, 3, 4], [], True)
|
||||
# with 1 down should still be ok
|
||||
await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], [2, 3, 4], [1], True)
|
||||
# but with 2 down not ok
|
||||
await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], [2, 3, 4], [2], False)
|
||||
|
||||
|
||||
# Test various combinations of membership configurations / neon.safekeepers
|
||||
# (list of safekeepers endpoint connects to) values / up & down safekeepers and
|
||||
# check that endpont can start and write data when we have quorum and can't when
|
||||
# we don't.
|
||||
def test_quorum_sanity(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.num_safekeepers = 4
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
asyncio.run(run_quorum_sanity(env))
|
||||
|
||||
|
||||
async def run_segment_init_failure(env: NeonEnv):
|
||||
env.create_branch("test_segment_init_failure")
|
||||
ep = env.endpoints.create_start("test_segment_init_failure")
|
||||
|
||||
Reference in New Issue
Block a user