From 8a72e6f8884c975c05457a280af2f02056360a6e Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Thu, 10 Apr 2025 12:45:17 +0400 Subject: [PATCH 01/17] pageserver: add enable_tls_page_service_api (#11508) ## Problem Page service doesn't use TLS for incoming requests. - Closes: https://github.com/neondatabase/cloud/issues/27236 ## Summary of changes - Add option `enable_tls_page_service_api` to pageserver config - Propagate `tls_server_config` to `page_service` if the option is enabled No integration tests for now because I didn't find out how to call page service API from python and AFAIK computes don't support TLS yet --- libs/pageserver_api/src/config.rs | 2 ++ pageserver/src/bin/pageserver.rs | 36 ++++++++++++++++++++++--------- pageserver/src/config.rs | 7 ++++++ pageserver/src/page_service.rs | 8 ++++++- 4 files changed, 42 insertions(+), 11 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 8f56d60a4a..bd9f7efb7f 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -180,6 +180,7 @@ pub struct ConfigToml { #[serde(skip_serializing_if = "Option::is_none")] pub generate_unarchival_heatmap: Option, pub tracing: Option, + pub enable_tls_page_service_api: bool, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -631,6 +632,7 @@ impl Default for ConfigToml { load_previous_heatmap: None, generate_unarchival_heatmap: None, tracing: None, + enable_tls_page_service_api: false, } } } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 54fecee588..2740f81758 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -452,6 +452,23 @@ fn start_pageserver( info!("Using auth for http API: {:#?}", conf.http_auth_type); info!("Using auth for pg connections: {:#?}", conf.pg_auth_type); + let tls_server_config = if conf.listen_https_addr.is_some() || conf.enable_tls_page_service_api + { + let resolver = BACKGROUND_RUNTIME.block_on(ReloadingCertificateResolver::new( + &conf.ssl_key_file, + &conf.ssl_cert_file, + conf.ssl_cert_reload_period, + ))?; + + let server_config = rustls::ServerConfig::builder() + .with_no_client_auth() + .with_cert_resolver(resolver); + + Some(Arc::new(server_config)) + } else { + None + }; + match var("NEON_AUTH_TOKEN") { Ok(v) => { info!("Loaded JWT token for authentication with Safekeeper"); @@ -670,17 +687,11 @@ fn start_pageserver( let https_task = match https_listener { Some(https_listener) => { - let resolver = MGMT_REQUEST_RUNTIME.block_on(ReloadingCertificateResolver::new( - &conf.ssl_key_file, - &conf.ssl_cert_file, - conf.ssl_cert_reload_period, - ))?; + let tls_server_config = tls_server_config + .clone() + .expect("tls_server_config is set earlier if https is enabled"); - let server_config = rustls::ServerConfig::builder() - .with_no_client_auth() - .with_cert_resolver(resolver); - - let tls_acceptor = tokio_rustls::TlsAcceptor::from(Arc::new(server_config)); + let tls_acceptor = tokio_rustls::TlsAcceptor::from(tls_server_config); let server = http_utils::server::Server::new(service, https_listener, Some(tls_acceptor))?; @@ -736,6 +747,11 @@ fn start_pageserver( tokio::net::TcpListener::from_std(pageserver_listener) .context("create tokio listener")? }, + if conf.enable_tls_page_service_api { + tls_server_config + } else { + None + }, ); // All started up! Now just sit and wait for shutdown signal. diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index ccc29e59d4..26ae6af70e 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -219,6 +219,11 @@ pub struct PageServerConf { pub generate_unarchival_heatmap: bool, pub tracing: Option, + + /// Enable TLS in page service API. + /// Does not force TLS: the client negotiates TLS usage during the handshake. + /// Uses key and certificate from ssl_key_file/ssl_cert_file. + pub enable_tls_page_service_api: bool, } /// Token for authentication to safekeepers @@ -391,6 +396,7 @@ impl PageServerConf { load_previous_heatmap, generate_unarchival_heatmap, tracing, + enable_tls_page_service_api, } = config_toml; let mut conf = PageServerConf { @@ -441,6 +447,7 @@ impl PageServerConf { page_service_pipelining, get_vectored_concurrent_io, tracing, + enable_tls_page_service_api, // ------------------------------------------------------------ // fields that require additional validation or custom handling diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 7e3991dbdc..61f524fc29 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -105,6 +105,7 @@ pub fn spawn( pg_auth: Option>, perf_trace_dispatch: Option, tcp_listener: tokio::net::TcpListener, + tls_config: Option>, ) -> Listener { let cancel = CancellationToken::new(); let libpq_ctx = RequestContext::todo_child( @@ -124,6 +125,7 @@ pub fn spawn( perf_trace_dispatch, tcp_listener, conf.pg_auth_type, + tls_config, conf.page_service_pipelining.clone(), libpq_ctx, cancel.clone(), @@ -181,6 +183,7 @@ pub async fn libpq_listener_main( perf_trace_dispatch: Option, listener: tokio::net::TcpListener, auth_type: AuthType, + tls_config: Option>, pipelining_config: PageServicePipeliningConfig, listener_ctx: RequestContext, listener_cancel: CancellationToken, @@ -223,6 +226,7 @@ pub async fn libpq_listener_main( local_auth, socket, auth_type, + tls_config.clone(), pipelining_config.clone(), connection_ctx, connections_cancel.child_token(), @@ -264,6 +268,7 @@ async fn page_service_conn_main( auth: Option>, socket: tokio::net::TcpStream, auth_type: AuthType, + tls_config: Option>, pipelining_config: PageServicePipeliningConfig, connection_ctx: RequestContext, cancel: CancellationToken, @@ -334,7 +339,8 @@ async fn page_service_conn_main( cancel.clone(), gate_guard, ); - let pgbackend = PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, None)?; + let pgbackend = + PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, tls_config)?; match pgbackend.run(&mut conn_handler, &cancel).await { Ok(()) => { From fae7528adb7a497d04a37f61aeccbbfb8e207f9e Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 10 Apr 2025 12:55:37 +0300 Subject: [PATCH 02/17] walproposer: make it aware of membership (#11407) ## Problem Walproposer should get elected and commit WAL on safekeepers specified by the membership configuration. ## Summary of changes - Add to wp `members_safekeepers` and `new_members_safekeepers` arrays mapping configuration members to connection slots. Establish this mapping (by node id) when safekeeper sends greeting, giving its id and when mconf becomes known / changes. - Add to TermsCollected, VotesCollected, GetAcknowledgedByQuorumWALPosition membership aware logic. Currently it partially duplicates existing one, but we'll drop the latter eventually. - In python, rename Configuration to MembershipConfiguration for clarity. - Add test_quorum_sanity testing new logic. ref https://github.com/neondatabase/neon/issues/10851 --- pgxn/neon/walproposer.c | 459 ++++++++++++++++-- pgxn/neon/walproposer.h | 15 +- test_runner/fixtures/neon_fixtures.py | 51 +- test_runner/fixtures/safekeeper/http.py | 28 +- test_runner/regress/test_wal_acceptor.py | 40 +- .../regress/test_wal_acceptor_async.py | 173 +++++-- 6 files changed, 644 insertions(+), 122 deletions(-) diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 0336d63e8d..6b133e4dc4 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -99,6 +99,9 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) wp->config = config; wp->api = api; wp->state = WPS_COLLECTING_TERMS; + wp->mconf.generation = INVALID_GENERATION; + wp->mconf.members.len = 0; + wp->mconf.new_members.len = 0; wp_log(LOG, "neon.safekeepers=%s", wp->config->safekeepers_list); @@ -170,6 +173,8 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) if (wp->config->proto_version != 2 && wp->config->proto_version != 3) wp_log(FATAL, "unsupported safekeeper protocol version %d", wp->config->proto_version); + if (wp->safekeepers_generation > INVALID_GENERATION && wp->config->proto_version < 3) + wp_log(FATAL, "enabling generations requires protocol version 3"); wp_log(LOG, "using safekeeper protocol version %d", wp->config->proto_version); /* Fill the greeting package */ @@ -214,7 +219,7 @@ WalProposerFree(WalProposer *wp) static bool WalProposerGenerationsEnabled(WalProposer *wp) { - return wp->safekeepers_generation != 0; + return wp->safekeepers_generation != INVALID_GENERATION; } /* @@ -723,13 +728,176 @@ SendProposerGreeting(Safekeeper *sk) BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_HANDSHAKE_RECV); } +/* + * Assuming `sk` sent its node id, find such member(s) in wp->mconf and set ptr in + * members_safekeepers & new_members_safekeepers to sk. + */ +static void +UpdateMemberSafekeeperPtr(WalProposer *wp, Safekeeper *sk) +{ + /* members_safekeepers etc are fixed size, sanity check mconf size */ + if (wp->mconf.members.len > MAX_SAFEKEEPERS) + wp_log(FATAL, "too many members %d in mconf", wp->mconf.members.len); + if (wp->mconf.new_members.len > MAX_SAFEKEEPERS) + wp_log(FATAL, "too many new_members %d in mconf", wp->mconf.new_members.len); + + /* node id is not known until greeting is received */ + if (sk->state < SS_WAIT_VOTING) + return; + + /* 0 is assumed to be invalid node id, should never happen */ + if (sk->greetResponse.nodeId == 0) + { + wp_log(WARNING, "safekeeper %s:%s sent zero node id", sk->host, sk->port); + return; + } + + for (uint32 i = 0; i < wp->mconf.members.len; i++) + { + SafekeeperId *sk_id = &wp->mconf.members.m[i]; + + if (wp->mconf.members.m[i].node_id == sk->greetResponse.nodeId) + { + /* + * If mconf or list of safekeepers to connect to changed (the + * latter always currently goes through restart though), + * ResetMemberSafekeeperPtrs is expected to be called before + * UpdateMemberSafekeeperPtr. So, other value suggests that we are + * connected to the same sk under different host name, complain + * about that. + */ + if (wp->members_safekeepers[i] != NULL && wp->members_safekeepers[i] != sk) + { + wp_log(WARNING, "safekeeper {id = %lu, ep = %s:%u } in members[%u] is already mapped to connection slot %lu", + sk_id->node_id, sk_id->host, sk_id->port, i, wp->members_safekeepers[i] - wp->safekeeper); + } + wp_log(LOG, "safekeeper {id = %lu, ep = %s:%u } in members[%u] mapped to connection slot %lu", + sk_id->node_id, sk_id->host, sk_id->port, i, sk - wp->safekeeper); + wp->members_safekeepers[i] = sk; + } + } + /* repeat for new_members */ + for (uint32 i = 0; i < wp->mconf.new_members.len; i++) + { + SafekeeperId *sk_id = &wp->mconf.new_members.m[i]; + + if (wp->mconf.new_members.m[i].node_id == sk->greetResponse.nodeId) + { + if (wp->new_members_safekeepers[i] != NULL && wp->new_members_safekeepers[i] != sk) + { + wp_log(WARNING, "safekeeper {id = %lu, ep = %s:%u } in new_members[%u] is already mapped to connection slot %lu", + sk_id->node_id, sk_id->host, sk_id->port, i, wp->new_members_safekeepers[i] - wp->safekeeper); + } + wp_log(LOG, "safekeeper {id = %lu, ep = %s:%u } in new_members[%u] mapped to connection slot %lu", + sk_id->node_id, sk_id->host, sk_id->port, i, sk - wp->safekeeper); + wp->new_members_safekeepers[i] = sk; + } + } +} + +/* + * Reset wp->members_safekeepers & new_members_safekeepers and refill them. + * Called after wp changes mconf. + */ +static void +ResetMemberSafekeeperPtrs(WalProposer *wp) +{ + memset(&wp->members_safekeepers, 0, sizeof(Safekeeper *) * MAX_SAFEKEEPERS); + memset(&wp->new_members_safekeepers, 0, sizeof(Safekeeper *) * MAX_SAFEKEEPERS); + for (int i = 0; i < wp->n_safekeepers; i++) + { + if (wp->safekeeper[i].state >= SS_WAIT_VOTING) + UpdateMemberSafekeeperPtr(wp, &wp->safekeeper[i]); + } +} + +static uint32 +MsetQuorum(MemberSet *mset) +{ + Assert(mset->len > 0); + return mset->len / 2 + 1; +} + +/* Does n forms quorum in mset? */ +static bool +MsetHasQuorum(MemberSet *mset, uint32 n) +{ + return n >= MsetQuorum(mset); +} + +/* + * TermsCollected helper for a single member set `mset`. + * + * `msk` is the member -> safekeeper mapping for mset, i.e. members_safekeepers + * or new_members_safekeepers. + */ +static bool +TermsCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInfo s) +{ + uint32 n_greeted = 0; + + for (uint32 i = 0; i < wp->mconf.members.len; i++) + { + Safekeeper *sk = msk[i]; + + if (sk != NULL && sk->state == SS_WAIT_VOTING) + { + if (n_greeted > 0) + appendStringInfoString(s, ", "); + appendStringInfo(s, "{id = %lu, ep = %s:%s}", sk->greetResponse.nodeId, sk->host, sk->port); + n_greeted++; + } + } + appendStringInfo(s, ", %u/%u total", n_greeted, mset->len); + return MsetHasQuorum(mset, n_greeted); +} + /* * Have we received greeting from enough (quorum) safekeepers to start voting? */ static bool TermsCollected(WalProposer *wp) { - return wp->n_connected >= wp->quorum; + StringInfoData s; /* str for logging */ + bool collected = false; + + /* legacy: generations disabled */ + if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION) + { + collected = wp->n_connected >= wp->quorum; + if (collected) + { + wp->propTerm++; + wp_log(LOG, "walproposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT ", starting voting", wp->quorum, wp->propTerm); + } + return collected; + } + + /* + * With generations enabled, we start campaign only when 1) some mconf is + * actually received 2) we have greetings from majority of members as well + * as from majority of new_members if it exists. + */ + if (wp->mconf.generation == INVALID_GENERATION) + return false; + + initStringInfo(&s); + appendStringInfoString(&s, "mset greeters: "); + if (!TermsCollectedMset(wp, &wp->mconf.members, wp->members_safekeepers, &s)) + goto res; + if (wp->mconf.new_members.len > 0) + { + appendStringInfoString(&s, ", new_mset greeters: "); + if (!TermsCollectedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers, &s)) + goto res; + } + wp->propTerm++; + wp_log(LOG, "walproposer connected to quorum of safekeepers: %s, propTerm=" INT64_FORMAT ", starting voting", s.data, wp->propTerm); + collected = true; + +res: + pfree(s.data); + return collected; } static void @@ -753,13 +921,41 @@ RecvAcceptorGreeting(Safekeeper *sk) pfree(mconf_toml); /* - * Adopt mconf of safekeepers if it is higher. TODO: mconf change should - * restart wp if it started voting. + * Adopt mconf of safekeepers if it is higher. */ if (sk->greetResponse.mconf.generation > wp->mconf.generation) { + /* sanity check before adopting, should never happen */ + if (sk->greetResponse.mconf.members.len == 0) + { + wp_log(FATAL, "mconf %u has zero members", sk->greetResponse.mconf.generation); + } + + /* + * If we at least started campaign, restart wp to get elected in the + * new mconf. Note: in principle once wp is already elected + * re-election is not required, but being conservative here is not + * bad. + * + * TODO: put mconf to shmem to immediately pick it up on start, + * otherwise if some safekeeper(s) misses latest mconf and gets + * connected the first, it may cause redundant restarts here. + * + * More generally, it would be nice to restart walproposer (wiping + * election state) without restarting the process. In particular, that + * would allow sync-safekeepers not to die here if it intersected with + * sk migration (as well as remove 1s delay). + * + * Note that assign_neon_safekeepers also currently restarts the + * process, so during normal migration walproposer may restart twice. + */ + if (wp->state >= WPS_CAMPAIGN) + { + wp_log(FATAL, "restarting to adopt mconf generation %d", sk->greetResponse.mconf.generation); + } MembershipConfigurationFree(&wp->mconf); MembershipConfigurationCopy(&sk->greetResponse.mconf, &wp->mconf); + ResetMemberSafekeeperPtrs(wp); /* full conf was just logged above */ wp_log(LOG, "changed mconf to generation %u", wp->mconf.generation); } @@ -767,6 +963,9 @@ RecvAcceptorGreeting(Safekeeper *sk) /* Protocol is all good, move to voting. */ sk->state = SS_WAIT_VOTING; + /* In greeting safekeeper sent its id; update mappings accordingly. */ + UpdateMemberSafekeeperPtr(wp, sk); + /* * Note: it would be better to track the counter on per safekeeper basis, * but at worst walproposer would restart with 'term rejected', so leave @@ -778,12 +977,9 @@ RecvAcceptorGreeting(Safekeeper *sk) /* We're still collecting terms from the majority. */ wp->propTerm = Max(sk->greetResponse.term, wp->propTerm); - /* Quorum is acquried, prepare the vote request. */ + /* Quorum is acquired, prepare the vote request. */ if (TermsCollected(wp)) { - wp->propTerm++; - wp_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm); - wp->state = WPS_CAMPAIGN; wp->voteRequest.pam.tag = 'v'; wp->voteRequest.generation = wp->mconf.generation; @@ -832,8 +1028,8 @@ SendVoteRequest(Safekeeper *sk) &sk->outbuf, wp->config->proto_version); /* We have quorum for voting, send our vote request */ - wp_log(LOG, "requesting vote from %s:%s for generation %u term " UINT64_FORMAT, sk->host, sk->port, - wp->voteRequest.generation, wp->voteRequest.term); + wp_log(LOG, "requesting vote from sk {id = %lu, ep = %s:%s} for generation %u term " UINT64_FORMAT, + sk->greetResponse.nodeId, sk->host, sk->port, wp->voteRequest.generation, wp->voteRequest.term); /* On failure, logging & resetting is handled */ BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_WAIT_VERDICT); /* If successful, wait for read-ready with SS_WAIT_VERDICT */ @@ -851,8 +1047,8 @@ RecvVoteResponse(Safekeeper *sk) return; wp_log(LOG, - "got VoteResponse from acceptor %s:%s, generation=%u, term=%lu, voteGiven=%u, last_log_term=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X", - sk->host, sk->port, sk->voteResponse.generation, sk->voteResponse.term, + "got VoteResponse from sk {id = %lu, ep = %s:%s}, generation=%u, term=%lu, voteGiven=%u, last_log_term=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X", + sk->greetResponse.nodeId, sk->host, sk->port, sk->voteResponse.generation, sk->voteResponse.term, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory), LSN_FORMAT_ARGS(sk->voteResponse.flushLsn), @@ -899,6 +1095,53 @@ RecvVoteResponse(Safekeeper *sk) } } +/* + * VotesCollected helper for a single member set `mset`. + * + * `msk` is the member -> safekeeper mapping for mset, i.e. members_safekeepers + * or new_members_safekeepers. + */ +static bool +VotesCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInfo s) +{ + uint32 n_votes = 0; + + for (uint32 i = 0; i < wp->mconf.members.len; i++) + { + Safekeeper *sk = msk[i]; + + if (sk != NULL && sk->state == SS_WAIT_ELECTED) + { + Assert(sk->voteResponse.voteGiven); + + /* + * Find the highest vote. NULL check is for the legacy case where + * safekeeper might be not initialized with LSN at all and return + * 0 LSN in the vote response; we still want to set donor to + * something in this case. + */ + if (GetLastLogTerm(sk) > wp->donorLastLogTerm || + (GetLastLogTerm(sk) == wp->donorLastLogTerm && + sk->voteResponse.flushLsn > wp->propTermStartLsn) || + wp->donor == NULL) + { + wp->donorLastLogTerm = GetLastLogTerm(sk); + wp->propTermStartLsn = sk->voteResponse.flushLsn; + wp->donor = sk; + } + wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn); + + if (n_votes > 0) + appendStringInfoString(s, ", "); + appendStringInfo(s, "{id = %lu, ep = %s:%s}", sk->greetResponse.nodeId, sk->host, sk->port); + n_votes++; + } + } + appendStringInfo(s, ", %u/%u total", n_votes, mset->len); + return MsetHasQuorum(mset, n_votes); +} + + /* * Checks if enough votes has been collected to get elected and if that's the * case finds the highest vote, setting donor, donorLastLogTerm, @@ -907,7 +1150,8 @@ RecvVoteResponse(Safekeeper *sk) static bool VotesCollected(WalProposer *wp) { - int n_ready = 0; + StringInfoData s; /* str for logging */ + bool collected = false; /* assumed to be called only when not elected yet */ Assert(wp->state == WPS_CAMPAIGN); @@ -916,25 +1160,62 @@ VotesCollected(WalProposer *wp) wp->donorLastLogTerm = 0; wp->truncateLsn = InvalidXLogRecPtr; - for (int i = 0; i < wp->n_safekeepers; i++) + /* legacy: generations disabled */ + if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION) { - if (wp->safekeeper[i].state == SS_WAIT_ELECTED) - { - n_ready++; + int n_ready = 0; - if (GetLastLogTerm(&wp->safekeeper[i]) > wp->donorLastLogTerm || - (GetLastLogTerm(&wp->safekeeper[i]) == wp->donorLastLogTerm && - wp->safekeeper[i].voteResponse.flushLsn > wp->propTermStartLsn)) + for (int i = 0; i < wp->n_safekeepers; i++) + { + if (wp->safekeeper[i].state == SS_WAIT_ELECTED) { - wp->donorLastLogTerm = GetLastLogTerm(&wp->safekeeper[i]); - wp->propTermStartLsn = wp->safekeeper[i].voteResponse.flushLsn; - wp->donor = i; + n_ready++; + + if (GetLastLogTerm(&wp->safekeeper[i]) > wp->donorLastLogTerm || + (GetLastLogTerm(&wp->safekeeper[i]) == wp->donorLastLogTerm && + wp->safekeeper[i].voteResponse.flushLsn > wp->propTermStartLsn) || + wp->donor == NULL) + { + wp->donorLastLogTerm = GetLastLogTerm(&wp->safekeeper[i]); + wp->propTermStartLsn = wp->safekeeper[i].voteResponse.flushLsn; + wp->donor = &wp->safekeeper[i]; + } + wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn); } - wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn); } + collected = n_ready >= wp->quorum; + if (collected) + { + wp_log(LOG, "walproposer elected with %d/%d votes", n_ready, wp->n_safekeepers); + } + return collected; } - return n_ready >= wp->quorum; + /* + * if generations are enabled we're expected to get to voting only when + * mconf is established. + */ + Assert(wp->mconf.generation != INVALID_GENERATION); + + /* + * We must get votes from both msets if both are present. + */ + initStringInfo(&s); + appendStringInfoString(&s, "mset voters: "); + if (!VotesCollectedMset(wp, &wp->mconf.members, wp->members_safekeepers, &s)) + goto res; + if (wp->mconf.new_members.len > 0) + { + appendStringInfoString(&s, ", new_mset voters: "); + if (!VotesCollectedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers, &s)) + goto res; + } + wp_log(LOG, "walproposer elected, %s", s.data); + collected = true; + +res: + pfree(s.data); + return collected; } /* @@ -955,7 +1236,7 @@ HandleElectedProposer(WalProposer *wp) * that only for logical replication (and switching logical walsenders to * neon_walreader is a todo.) */ - if (!wp->api.recovery_download(wp, &wp->safekeeper[wp->donor])) + if (!wp->api.recovery_download(wp, wp->donor)) { wp_log(FATAL, "failed to download WAL for logical replicaiton"); } @@ -1078,7 +1359,7 @@ ProcessPropStartPos(WalProposer *wp) /* * Proposer's term history is the donor's + its own entry. */ - dth = &wp->safekeeper[wp->donor].voteResponse.termHistory; + dth = &wp->donor->voteResponse.termHistory; wp->propTermHistory.n_entries = dth->n_entries + 1; wp->propTermHistory.entries = palloc(sizeof(TermSwitchEntry) * wp->propTermHistory.n_entries); if (dth->n_entries > 0) @@ -1086,11 +1367,10 @@ ProcessPropStartPos(WalProposer *wp) wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].term = wp->propTerm; wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propTermStartLsn; - wp_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X", - wp->quorum, + wp_log(LOG, "walproposer elected in term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X", wp->propTerm, LSN_FORMAT_ARGS(wp->propTermStartLsn), - wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port, + wp->donor->host, wp->donor->port, LSN_FORMAT_ARGS(wp->truncateLsn)); /* @@ -1508,6 +1788,14 @@ RecvAppendResponses(Safekeeper *sk) readAnything = true; + /* should never happen: sk is expected to send ERROR instead */ + if (sk->appendResponse.generation != wp->mconf.generation) + { + wp_log(FATAL, "safekeeper {id = %lu, ep = %s:%s} sent response with generation %u, expected %u", + sk->greetResponse.nodeId, sk->host, sk->port, + sk->appendResponse.generation, wp->mconf.generation); + } + if (sk->appendResponse.term > wp->propTerm) { /* @@ -1624,30 +1912,101 @@ CalculateMinFlushLsn(WalProposer *wp) } /* - * Calculate WAL position acknowledged by quorum + * GetAcknowledgedByQuorumWALPosition for a single member set `mset`. + * + * `msk` is the member -> safekeeper mapping for mset, i.e. members_safekeepers + * or new_members_safekeepers. */ static XLogRecPtr -GetAcknowledgedByQuorumWALPosition(WalProposer *wp) +GetCommittedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk) { XLogRecPtr responses[MAX_SAFEKEEPERS]; /* - * Sort acknowledged LSNs + * Ascending sort acknowledged LSNs. */ - for (int i = 0; i < wp->n_safekeepers; i++) + Assert(mset->len <= MAX_SAFEKEEPERS); + for (uint32 i = 0; i < mset->len; i++) { + Safekeeper *sk = msk[i]; + /* * Like in Raft, we aren't allowed to commit entries from previous - * terms, so ignore reported LSN until it gets to epochStartLsn. + * terms, so ignore reported LSN until it gets to propTermStartLsn. + * + * Note: we ignore sk state, which is ok: before first ack flushLsn is + * 0, and later we just preserve value across reconnections. It would + * be ok to check for SS_ACTIVE as well. */ - responses[i] = wp->safekeeper[i].appendResponse.flushLsn >= wp->propTermStartLsn ? wp->safekeeper[i].appendResponse.flushLsn : 0; + if (sk != NULL && sk->appendResponse.flushLsn >= wp->propTermStartLsn) + { + responses[i] = sk->appendResponse.flushLsn; + } + else + { + responses[i] = 0; + } } - qsort(responses, wp->n_safekeepers, sizeof(XLogRecPtr), CompareLsn); + qsort(responses, mset->len, sizeof(XLogRecPtr), CompareLsn); /* - * Get the smallest LSN committed by quorum + * And get value committed by the quorum. A way to view this: to get the + * highest value committed on the quorum, in the ordered array we skip n - + * n_quorum elements to get to the first (lowest) value present on all sks + * of the highest quorum. */ - return responses[wp->n_safekeepers - wp->quorum]; + return responses[mset->len - MsetQuorum(mset)]; +} + +/* + * Calculate WAL position acknowledged by quorum, i.e. which may be regarded + * committed. + * + * Zero may be returned when there is no quorum of nodes recovered to term start + * lsn which sent feedback yet. + */ +static XLogRecPtr +GetAcknowledgedByQuorumWALPosition(WalProposer *wp) +{ + XLogRecPtr committed; + + /* legacy: generations disabled */ + if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION) + { + XLogRecPtr responses[MAX_SAFEKEEPERS]; + + /* + * Sort acknowledged LSNs + */ + for (int i = 0; i < wp->n_safekeepers; i++) + { + /* + * Like in Raft, we aren't allowed to commit entries from previous + * terms, so ignore reported LSN until it gets to + * propTermStartLsn. + * + * Note: we ignore sk state, which is ok: before first ack + * flushLsn is 0, and later we just preserve value across + * reconnections. It would be ok to check for SS_ACTIVE as well. + */ + responses[i] = wp->safekeeper[i].appendResponse.flushLsn >= wp->propTermStartLsn ? wp->safekeeper[i].appendResponse.flushLsn : 0; + } + qsort(responses, wp->n_safekeepers, sizeof(XLogRecPtr), CompareLsn); + + /* + * Get the smallest LSN committed by quorum + */ + return responses[wp->n_safekeepers - wp->quorum]; + } + + committed = GetCommittedMset(wp, &wp->mconf.members, wp->members_safekeepers); + if (wp->mconf.new_members.len > 0) + { + XLogRecPtr new_mset_committed = GetCommittedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers); + + committed = Min(committed, new_mset_committed); + } + return committed; } /* @@ -1662,7 +2021,7 @@ UpdateDonorShmem(WalProposer *wp) int i; XLogRecPtr donor_lsn = InvalidXLogRecPtr; - if (wp->n_votes < wp->quorum) + if (wp->state < WPS_ELECTED) { wp_log(WARNING, "UpdateDonorShmem called before elections are won"); return; @@ -1673,9 +2032,9 @@ UpdateDonorShmem(WalProposer *wp) * about its position immediately after election before any feedbacks are * sent. */ - if (wp->safekeeper[wp->donor].state >= SS_WAIT_ELECTED) + if (wp->donor->state >= SS_WAIT_ELECTED) { - donor = &wp->safekeeper[wp->donor]; + donor = wp->donor; donor_lsn = wp->propTermStartLsn; } @@ -1746,13 +2105,13 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk) } /* - * Generally sync is done when majority switched the epoch so we committed - * epochStartLsn and made the majority aware of it, ensuring they are - * ready to give all WAL to pageserver. It would mean whichever majority - * is alive, there will be at least one safekeeper who is able to stream - * WAL to pageserver to make basebackup possible. However, since at the - * moment we don't have any good mechanism of defining the healthy and - * most advanced safekeeper who should push the wal into pageserver and + * Generally sync is done when majority reached propTermStartLsn so we + * committed it and made the majority aware of it, ensuring they are ready + * to give all WAL to pageserver. It would mean whichever majority is + * alive, there will be at least one safekeeper who is able to stream WAL + * to pageserver to make basebackup possible. However, since at the moment + * we don't have any good mechanism of defining the healthy and most + * advanced safekeeper who should push the wal into pageserver and * basically the random one gets connected, to prevent hanging basebackup * (due to pageserver connecting to not-synced-safekeeper) we currently * wait for all seemingly alive safekeepers to get synced. @@ -1774,7 +2133,7 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk) n_synced++; } - if (n_synced >= wp->quorum) + if (newCommitLsn >= wp->propTermStartLsn) { /* A quorum of safekeepers has been synced! */ diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index d116bce806..648b0015ad 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -145,6 +145,7 @@ typedef uint64 NNodeId; * This and following structs pair ones in membership.rs. */ typedef uint32 Generation; +#define INVALID_GENERATION 0 typedef struct SafekeeperId { @@ -771,7 +772,17 @@ typedef struct WalProposer /* Current walproposer membership configuration */ MembershipConfiguration mconf; - /* (n_safekeepers / 2) + 1 */ + /* + * Parallels mconf.members with pointers to the member's slot in + * safekeepers array of connections, or NULL if such member is not + * connected. Helps to avoid looking slot per id through all + * .safekeepers[] when doing quorum checks. + */ + Safekeeper *members_safekeepers[MAX_SAFEKEEPERS]; + /* As above, but for new_members. */ + Safekeeper *new_members_safekeepers[MAX_SAFEKEEPERS]; + + /* (n_safekeepers / 2) + 1. Used for static pre-generations quorum checks. */ int quorum; /* @@ -829,7 +840,7 @@ typedef struct WalProposer term_t donorLastLogTerm; /* Most advanced acceptor */ - int donor; + Safekeeper *donor; /* timeline globally starts at this LSN */ XLogRecPtr timelineStartLsn; diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index d000dcb69f..ba8de1c01c 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -79,7 +79,12 @@ from fixtures.remote_storage import ( default_remote_storage, remote_storage_to_toml_dict, ) -from fixtures.safekeeper.http import SafekeeperHttpClient +from fixtures.safekeeper.http import ( + MembershipConfiguration, + SafekeeperHttpClient, + SafekeeperId, + TimelineCreateRequest, +) from fixtures.safekeeper.utils import wait_walreceivers_absent from fixtures.utils import ( ATTACHMENT_NAME_REGEX, @@ -4839,6 +4844,50 @@ class Safekeeper(LogUtils): wait_until(paused) + @staticmethod + def sks_to_safekeeper_ids(sks: list[Safekeeper]) -> list[SafekeeperId]: + return [SafekeeperId(sk.id, "localhost", sk.port.pg_tenant_only) for sk in sks] + + @staticmethod + def mconf_sks(env: NeonEnv, mconf: MembershipConfiguration) -> list[Safekeeper]: + """ + List of Safekeepers which are members in `mconf`. + """ + members_ids = [m.id for m in mconf.members] + new_members_ids = [m.id for m in mconf.new_members] if mconf.new_members is not None else [] + return [sk for sk in env.safekeepers if sk.id in members_ids or sk.id in new_members_ids] + + @staticmethod + def create_timeline( + tenant_id: TenantId, + timeline_id: TimelineId, + ps: NeonPageserver, + mconf: MembershipConfiguration, + members_sks: list[Safekeeper], + ): + """ + Manually create timeline on safekeepers with given (presumably inital) + mconf: figure out LSN from pageserver, bake request and execute it on + given safekeepers. + + Normally done by storcon, but some tests want to do it manually so far. + """ + ps_http_cli = ps.http_client() + # figure out initial LSN. + ps_timeline_detail = ps_http_cli.timeline_detail(tenant_id, timeline_id) + init_lsn = ps_timeline_detail["last_record_lsn"] + log.info(f"initial LSN: {init_lsn}") + # sk timeline creation request expects minor version + pg_version = ps_timeline_detail["pg_version"] * 10000 + # create inital mconf + create_r = TimelineCreateRequest( + tenant_id, timeline_id, mconf, pg_version, Lsn(init_lsn), commit_lsn=None + ) + log.info(f"sending timeline create: {create_r.to_json()}") + + for sk in members_sks: + sk.http_client().timeline_create(create_r) + class NeonBroker(LogUtils): """An object managing storage_broker instance""" diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index e409151b76..839e985419 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -25,7 +25,7 @@ class Walreceiver: @dataclass class SafekeeperTimelineStatus: - mconf: Configuration | None + mconf: MembershipConfiguration | None term: int last_log_term: int pg_version: int # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2 @@ -78,17 +78,17 @@ class SafekeeperId: @dataclass -class Configuration: +class MembershipConfiguration: generation: int members: list[SafekeeperId] new_members: list[SafekeeperId] | None @classmethod - def from_json(cls, d: dict[str, Any]) -> Configuration: + def from_json(cls, d: dict[str, Any]) -> MembershipConfiguration: generation = d["generation"] members = d["members"] new_members = d.get("new_members") - return Configuration(generation, members, new_members) + return MembershipConfiguration(generation, members, new_members) def to_json(self) -> str: return json.dumps(self, cls=EnhancedJSONEncoder) @@ -98,7 +98,7 @@ class Configuration: class TimelineCreateRequest: tenant_id: TenantId timeline_id: TimelineId - mconf: Configuration + mconf: MembershipConfiguration # not exactly PgVersion, for example 150002 for 15.2 pg_version: int start_lsn: Lsn @@ -110,13 +110,13 @@ class TimelineCreateRequest: @dataclass class TimelineMembershipSwitchResponse: - previous_conf: Configuration - current_conf: Configuration + previous_conf: MembershipConfiguration + current_conf: MembershipConfiguration @classmethod def from_json(cls, d: dict[str, Any]) -> TimelineMembershipSwitchResponse: - previous_conf = Configuration.from_json(d["previous_conf"]) - current_conf = Configuration.from_json(d["current_conf"]) + previous_conf = MembershipConfiguration.from_json(d["previous_conf"]) + current_conf = MembershipConfiguration.from_json(d["current_conf"]) return TimelineMembershipSwitchResponse(previous_conf, current_conf) @@ -194,7 +194,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): resj = res.json() walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]] # It is always normally not None, it is allowed only to make forward compat tests happy. - mconf = Configuration.from_json(resj["mconf"]) if "mconf" in resj else None + mconf = MembershipConfiguration.from_json(resj["mconf"]) if "mconf" in resj else None return SafekeeperTimelineStatus( mconf=mconf, term=resj["acceptor_state"]["term"], @@ -223,7 +223,9 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): return self.timeline_status(tenant_id, timeline_id).commit_lsn # Get timeline membership configuration. - def get_membership(self, tenant_id: TenantId, timeline_id: TimelineId) -> Configuration: + def get_membership( + self, tenant_id: TenantId, timeline_id: TimelineId + ) -> MembershipConfiguration: # make mypy happy return self.timeline_status(tenant_id, timeline_id).mconf # type: ignore @@ -275,7 +277,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): return res_json def timeline_exclude( - self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration + self, tenant_id: TenantId, timeline_id: TimelineId, to: MembershipConfiguration ) -> dict[str, Any]: res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/exclude", @@ -287,7 +289,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): return res_json def membership_switch( - self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration + self, tenant_id: TenantId, timeline_id: TimelineId, to: MembershipConfiguration ) -> TimelineMembershipSwitchResponse: res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/membership", diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index e3d39f9315..a9a6699e5c 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -45,7 +45,7 @@ from fixtures.remote_storage import ( s3_storage, ) from fixtures.safekeeper.http import ( - Configuration, + MembershipConfiguration, SafekeeperHttpClient, SafekeeperId, TimelineCreateRequest, @@ -589,7 +589,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re for sk in env.safekeepers: sk.start() cli = sk.http_client() - mconf = Configuration(generation=0, members=[], new_members=None) + mconf = MembershipConfiguration(generation=0, members=[], new_members=None) # set start_lsn to the beginning of the first segment to allow reading # WAL from there (could you intidb LSN as well). r = TimelineCreateRequest( @@ -1948,7 +1948,7 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder): sk_id_2 = SafekeeperId(11, "localhost", 5434) # just a mock # Request to switch before timeline creation should fail. - init_conf = Configuration(generation=1, members=[sk_id_1], new_members=None) + init_conf = MembershipConfiguration(generation=1, members=[sk_id_1], new_members=None) with pytest.raises(requests.exceptions.HTTPError): http_cli.membership_switch(tenant_id, timeline_id, init_conf) @@ -1960,7 +1960,7 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder): http_cli.timeline_create(create_r) # Switch into some conf. - joint_conf = Configuration(generation=4, members=[sk_id_1], new_members=[sk_id_2]) + joint_conf = MembershipConfiguration(generation=4, members=[sk_id_1], new_members=[sk_id_2]) resp = http_cli.membership_switch(tenant_id, timeline_id, joint_conf) log.info(f"joint switch resp: {resp}") assert resp.previous_conf.generation == 1 @@ -1973,24 +1973,26 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder): assert after_restart.generation == 4 # Switch into non joint conf of which sk is not a member, must fail. - non_joint_not_member = Configuration(generation=5, members=[sk_id_2], new_members=None) + non_joint_not_member = MembershipConfiguration( + generation=5, members=[sk_id_2], new_members=None + ) with pytest.raises(requests.exceptions.HTTPError): resp = http_cli.membership_switch(tenant_id, timeline_id, non_joint_not_member) # Switch into good non joint conf. - non_joint = Configuration(generation=6, members=[sk_id_1], new_members=None) + non_joint = MembershipConfiguration(generation=6, members=[sk_id_1], new_members=None) resp = http_cli.membership_switch(tenant_id, timeline_id, non_joint) log.info(f"non joint switch resp: {resp}") assert resp.previous_conf.generation == 4 assert resp.current_conf.generation == 6 # Switch request to lower conf should be rejected. - lower_conf = Configuration(generation=3, members=[sk_id_1], new_members=None) + lower_conf = MembershipConfiguration(generation=3, members=[sk_id_1], new_members=None) with pytest.raises(requests.exceptions.HTTPError): http_cli.membership_switch(tenant_id, timeline_id, lower_conf) # Now, exclude sk from the membership, timeline should be deleted. - excluded_conf = Configuration(generation=7, members=[sk_id_2], new_members=None) + excluded_conf = MembershipConfiguration(generation=7, members=[sk_id_2], new_members=None) http_cli.timeline_exclude(tenant_id, timeline_id, excluded_conf) with pytest.raises(requests.exceptions.HTTPError): http_cli.timeline_status(tenant_id, timeline_id) @@ -2010,11 +2012,6 @@ def test_explicit_timeline_creation(neon_env_builder: NeonEnvBuilder): tenant_id = env.initial_tenant timeline_id = env.initial_timeline - ps = env.pageservers[0] - ps_http_cli = ps.http_client() - - http_clis = [sk.http_client() for sk in env.safekeepers] - config_lines = [ "neon.safekeeper_proto_version = 3", ] @@ -2023,22 +2020,11 @@ def test_explicit_timeline_creation(neon_env_builder: NeonEnvBuilder): # expected to fail because timeline is not created on safekeepers with pytest.raises(Exception, match=r".*timed out.*"): ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3], timeout="2s") - # figure out initial LSN. - ps_timeline_detail = ps_http_cli.timeline_detail(tenant_id, timeline_id) - init_lsn = ps_timeline_detail["last_record_lsn"] - log.info(f"initial LSN: {init_lsn}") - # sk timeline creation request expects minor version - pg_version = ps_timeline_detail["pg_version"] * 10000 # create inital mconf - sk_ids = [SafekeeperId(sk.id, "localhost", sk.port.pg_tenant_only) for sk in env.safekeepers] - mconf = Configuration(generation=1, members=sk_ids, new_members=None) - create_r = TimelineCreateRequest( - tenant_id, timeline_id, mconf, pg_version, Lsn(init_lsn), commit_lsn=None + mconf = MembershipConfiguration( + generation=1, members=Safekeeper.sks_to_safekeeper_ids(env.safekeepers), new_members=None ) - log.info(f"sending timeline create: {create_r.to_json()}") - - for sk_http_cli in http_clis: - sk_http_cli.timeline_create(create_r) + Safekeeper.create_timeline(tenant_id, timeline_id, env.pageservers[0], mconf, env.safekeepers) # Once timeline created endpoint should start. ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3]) ep.safe_psql("CREATE TABLE IF NOT EXISTS t(key int, value text)") diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index b7c7478e78..c5dd34f64f 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -18,6 +18,7 @@ from fixtures.neon_fixtures import ( Safekeeper, ) from fixtures.remote_storage import RemoteStorageKind +from fixtures.safekeeper.http import MembershipConfiguration from fixtures.utils import skip_in_debug_build if TYPE_CHECKING: @@ -452,20 +453,24 @@ def test_concurrent_computes(neon_env_builder: NeonEnvBuilder): asyncio.run(run_concurrent_computes(env)) +async def assert_query_hangs(endpoint: Endpoint, query: str): + """ + Start on endpoint query which is expected to hang and check that it does. + """ + conn = await endpoint.connect_async() + bg_query = asyncio.create_task(conn.execute(query)) + await asyncio.sleep(2) + assert not bg_query.done() + return bg_query + + # Stop safekeeper and check that query cannot be executed while safekeeper is down. # Query will insert a single row into a table. -async def check_unavailability( - sk: Safekeeper, conn: asyncpg.Connection, key: int, start_delay_sec: int = 2 -): +async def check_unavailability(sk: Safekeeper, ep: Endpoint, key: int, start_delay_sec: int = 2): # shutdown one of two acceptors, that is, majority sk.stop() - bg_query = asyncio.create_task(conn.execute(f"INSERT INTO t values ({key}, 'payload')")) - - await asyncio.sleep(start_delay_sec) - # ensure that the query has not been executed yet - assert not bg_query.done() - + bg_query = await assert_query_hangs(ep, f"INSERT INTO t values ({key}, 'payload')") # start safekeeper and await the query sk.start() await bg_query @@ -480,10 +485,10 @@ async def run_unavailability(env: NeonEnv, endpoint: Endpoint): await conn.execute("INSERT INTO t values (1, 'payload')") # stop safekeeper and check that query cannot be executed while safekeeper is down - await check_unavailability(env.safekeepers[0], conn, 2) + await check_unavailability(env.safekeepers[0], endpoint, 2) # for the world's balance, do the same with second safekeeper - await check_unavailability(env.safekeepers[1], conn, 3) + await check_unavailability(env.safekeepers[1], endpoint, 3) # check that we can execute queries after restart await conn.execute("INSERT INTO t values (4, 'payload')") @@ -514,15 +519,7 @@ async def run_recovery_uncommitted(env: NeonEnv): # insert with only one safekeeper up to create tail of flushed but not committed WAL sk1.stop() sk2.stop() - conn = await ep.connect_async() - # query should hang, so execute in separate task - bg_query = asyncio.create_task( - conn.execute("insert into t select generate_series(1, 2000), 'payload'") - ) - sleep_sec = 2 - await asyncio.sleep(sleep_sec) - # it must still be not finished - assert not bg_query.done() + await assert_query_hangs(ep, "insert into t select generate_series(1, 2000), 'payload'") # note: destoy will kill compute_ctl, preventing it waiting for hanging sync-safekeepers. ep.stop_and_destroy() @@ -559,15 +556,7 @@ async def run_wal_truncation(env: NeonEnv, safekeeper_proto_version: int): # insert with only one sk3 up to create tail of flushed but not committed WAL on it sk1.stop() sk2.stop() - conn = await ep.connect_async() - # query should hang, so execute in separate task - bg_query = asyncio.create_task( - conn.execute("insert into t select generate_series(1, 180000), 'Papaya'") - ) - sleep_sec = 2 - await asyncio.sleep(sleep_sec) - # it must still be not finished - assert not bg_query.done() + await assert_query_hangs(ep, "insert into t select generate_series(1, 180000), 'Papaya'") # note: destoy will kill compute_ctl, preventing it waiting for hanging sync-safekeepers. ep.stop_and_destroy() @@ -607,6 +596,132 @@ def test_wal_truncation(neon_env_builder: NeonEnvBuilder, safekeeper_proto_versi asyncio.run(run_wal_truncation(env, safekeeper_proto_version)) +async def quorum_sanity_single( + env: NeonEnv, + compute_sks_ids: list[int], + members_sks_ids: list[int], + new_members_sks_ids: list[int] | None, + sks_to_stop_ids: list[int], + should_work_when_stopped: bool, +): + """ + *_ids params contain safekeeper node ids; it is assumed they are issued + from 1 and sequentially assigned to env.safekeepers. + """ + members_sks = [env.safekeepers[i - 1] for i in members_sks_ids] + new_members_sks = ( + [env.safekeepers[i - 1] for i in new_members_sks_ids] if new_members_sks_ids else None + ) + sks_to_stop = [env.safekeepers[i - 1] for i in sks_to_stop_ids] + + mconf = MembershipConfiguration( + generation=1, + members=Safekeeper.sks_to_safekeeper_ids(members_sks), + new_members=Safekeeper.sks_to_safekeeper_ids(new_members_sks) if new_members_sks else None, + ) + members_sks = Safekeeper.mconf_sks(env, mconf) + + tenant_id = env.initial_tenant + compute_sks_ids_str = "-".join([str(sk_id) for sk_id in compute_sks_ids]) + members_sks_ids_str = "-".join([str(sk.id) for sk in mconf.members]) + new_members_sks_ids_str = "-".join( + [str(sk.id) for sk in mconf.new_members] if mconf.new_members is not None else [] + ) + sks_to_stop_ids_str = "-".join([str(sk.id) for sk in sks_to_stop]) + log.info( + f"running quorum_sanity_single with compute_sks={compute_sks_ids_str}, members_sks={members_sks_ids_str}, new_members_sks={new_members_sks_ids_str}, sks_to_stop={sks_to_stop_ids_str}, should_work_when_stopped={should_work_when_stopped}" + ) + branch_name = f"test_quorum_single_c{compute_sks_ids_str}_m{members_sks_ids_str}_{new_members_sks_ids_str}_s{sks_to_stop_ids_str}" + timeline_id = env.create_branch(branch_name) + + # create timeline on `members_sks` + Safekeeper.create_timeline(tenant_id, timeline_id, env.pageservers[0], mconf, members_sks) + + config_lines = [ + "neon.safekeeper_proto_version = 3", + ] + ep = env.endpoints.create(branch_name, config_lines=config_lines) + ep.start(safekeeper_generation=1, safekeepers=compute_sks_ids) + ep.safe_psql("create table t(key int, value text)") + + # stop specified sks and check whether writes work + for sk in sks_to_stop: + sk.stop() + if should_work_when_stopped: + log.info("checking that writes still work") + ep.safe_psql("insert into t select generate_series(1, 100), 'Papaya'") + # restarting ep should also be fine + ep.stop() + ep.start() + ep.safe_psql("insert into t select generate_series(1, 100), 'plum'") + bg_query = None + else: + log.info("checking that writes hang") + bg_query = await assert_query_hangs( + ep, "insert into t select generate_series(1, 100), 'Papaya'" + ) + # start again; now they should work + for sk in sks_to_stop: + sk.start() + if bg_query: + log.info("awaiting query") + await bg_query + + +# It's a bit tempting to iterate over all possible combinations, but let's stick +# with this for now. +async def run_quorum_sanity(env: NeonEnv): + # 3 members, all up, should work + await quorum_sanity_single(env, [1, 2, 3], [1, 2, 3], None, [], True) + # 3 members, 2/3 up, should work + await quorum_sanity_single(env, [1, 2, 3], [1, 2, 3], None, [3], True) + # 3 members, 1/3 up, should not work + await quorum_sanity_single(env, [1, 2, 3], [1, 2, 3], None, [2, 3], False) + + # 3 members, all up, should work; wp redundantly talks to 4th. + await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], None, [], True) + # 3 members, all up, should work with wp talking to 2 of these 3 + plus one redundant + await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], None, [], True) + # 3 members, 2/3 up, could work but wp talks to different 3s, so it shouldn't + await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], None, [3], False) + + # joint conf of 1-2-3 and 4, all up, should work + await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [4], [], True) + # joint conf of 1-2-3 and 4, 4 down, shouldn't work + await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [4], [4], False) + + # joint conf of 1-2-3 and 2-3-4, all up, should work + await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [], True) + # joint conf of 1-2-3 and 2-3-4, 1 and 4 down, should work + await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [1, 4], True) + # joint conf of 1-2-3 and 2-3-4, 2 down, should work + await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [2], True) + # joint conf of 1-2-3 and 2-3-4, 3 down, should work + await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [3], True) + # joint conf of 1-2-3 and 2-3-4, 1 and 2 down, shouldn't work + await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [1, 2], False) + # joint conf of 1-2-3 and 2-3-4, 2 and 4 down, shouldn't work + await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [2, 4], False) + + # joint conf of 1-2-3 and 2-3-4 with wp talking to 2-3-4 only. + await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], [2, 3, 4], [], True) + # with 1 down should still be ok + await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], [2, 3, 4], [1], True) + # but with 2 down not ok + await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], [2, 3, 4], [2], False) + + +# Test various combinations of membership configurations / neon.safekeepers +# (list of safekeepers endpoint connects to) values / up & down safekeepers and +# check that endpont can start and write data when we have quorum and can't when +# we don't. +def test_quorum_sanity(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 4 + env = neon_env_builder.init_start() + + asyncio.run(run_quorum_sanity(env)) + + async def run_segment_init_failure(env: NeonEnv): env.create_branch("test_segment_init_failure") ep = env.endpoints.create_start("test_segment_init_failure") From 0122d97f95b7f1ca236947e55961a307085853c2 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 10 Apr 2025 12:07:16 +0200 Subject: [PATCH 03/17] test_runner: only use last gen in `test_location_conf_churn` (#11511) ## Problem `test_location_conf_churn` performs random location updates on Pageservers. While doing this, it could instruct the compute to connect to a stale generation and execute queries. This is invalid, and will fail if a newer generation has removed layer files used by the stale generation. Resolves #11348. ## Summary of changes Only connect to the latest generation when executing queries. --- test_runner/regress/test_pageserver_secondary.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index d03d05d33d..d48e731394 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -242,7 +242,13 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver, pageserver.tenant_location_configure(tenant_id, location_conf) last_state[pageserver.id] = (mode, generation) - if mode.startswith("Attached"): + # It's only valid to connect to the last generation. Newer generations may yank layer + # files used in older generations. + last_generation = max( + [s[1] for s in last_state.values() if s[1] is not None], default=None + ) + + if mode.startswith("Attached") and generation == last_generation: # This is a basic test: we are validating that he endpoint works properly _between_ # configuration changes. A stronger test would be to validate that clients see # no errors while we are making the changes. From 50631512710d8c5fd9c4c681d7882e16f4df93f3 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 10 Apr 2025 14:04:18 +0100 Subject: [PATCH 04/17] compute: Add more neon ids to compute (#11366) Pass more neon ids to compute_ctl. Expose them to postgres as neon extension GUCs: neon.project_id, neon.branch_id, neon.endpoint_id. This is the compute side PR, not yet supported by cplane. --- compute_tools/src/compute.rs | 5 ++++- compute_tools/src/config.rs | 9 +++++++++ control_plane/src/endpoint.rs | 3 +++ libs/compute_api/src/spec.rs | 6 ++++++ pgxn/neon/libpagestore.c | 28 ++++++++++++++++++++++++++++ 5 files changed, 50 insertions(+), 1 deletion(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index ad8925e7ab..457ace85d1 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -523,11 +523,14 @@ impl ComputeNode { let pspec = compute_state.pspec.as_ref().expect("spec must be set"); info!( - "starting compute for project {}, operation {}, tenant {}, timeline {}, features {:?}, spec.remote_extensions {:?}", + "starting compute for project {}, operation {}, tenant {}, timeline {}, project {}, branch {}, endpoint {}, features {:?}, spec.remote_extensions {:?}", pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"), pspec.spec.operation_uuid.as_deref().unwrap_or("None"), pspec.tenant_id, pspec.timeline_id, + pspec.spec.project_id.as_deref().unwrap_or("None"), + pspec.spec.branch_id.as_deref().unwrap_or("None"), + pspec.spec.endpoint_id.as_deref().unwrap_or("None"), pspec.spec.features, pspec.spec.remote_extensions, ); diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 92939f816c..0eb8912b45 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -89,6 +89,15 @@ pub fn write_postgres_conf( escape_conf_value(&s.to_string()) )?; } + if let Some(s) = &spec.project_id { + writeln!(file, "neon.project_id={}", escape_conf_value(s))?; + } + if let Some(s) = &spec.branch_id { + writeln!(file, "neon.branch_id={}", escape_conf_value(s))?; + } + if let Some(s) = &spec.endpoint_id { + writeln!(file, "neon.endpoint_id={}", escape_conf_value(s))?; + } // tls if let Some(tls_config) = tls_config { diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 3137bde161..663c024953 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -658,6 +658,9 @@ impl Endpoint { delta_operations: None, tenant_id: Some(self.tenant_id), timeline_id: Some(self.timeline_id), + project_id: None, + branch_id: None, + endpoint_id: Some(self.endpoint_id.clone()), mode: self.mode, pageserver_connstring: Some(pageserver_connstring), safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()), diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 994a665a88..974159ac72 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -104,6 +104,12 @@ pub struct ComputeSpec { pub timeline_id: Option, pub pageserver_connstring: Option, + // More neon ids that we expose to the compute_ctl + // and to postgres as neon extension GUCs. + pub project_id: Option, + pub branch_id: Option, + pub endpoint_id: Option, + /// Safekeeper membership config generation. It is put in /// neon.safekeepers GUC and serves two purposes: /// 1) Non zero value forces walproposer to use membership configurations. diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 9ea708f29a..dfabb6919e 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -65,6 +65,9 @@ static const struct config_enum_entry neon_compute_modes[] = { /* GUCs */ char *neon_timeline; char *neon_tenant; +char *neon_project_id; +char *neon_branch_id; +char *neon_endpoint_id; int32 max_cluster_size; char *page_server_connstring; char *neon_auth_token; @@ -1352,6 +1355,31 @@ pg_init_libpagestore(void) 0, /* no flags required */ check_neon_id, NULL, NULL); + DefineCustomStringVariable("neon.project_id", + "Neon project_id the server is running on", + NULL, + &neon_project_id, + "", + PGC_POSTMASTER, + 0, /* no flags required */ + check_neon_id, NULL, NULL); + DefineCustomStringVariable("neon.branch_id", + "Neon branch_id the server is running on", + NULL, + &neon_branch_id, + "", + PGC_POSTMASTER, + 0, /* no flags required */ + check_neon_id, NULL, NULL); + DefineCustomStringVariable("neon.endpoint_id", + "Neon endpoint_id the server is running on", + NULL, + &neon_endpoint_id, + "", + PGC_POSTMASTER, + 0, /* no flags required */ + check_neon_id, NULL, NULL); + DefineCustomIntVariable("neon.stripe_size", "sharding stripe size", NULL, From 2e35f23085263cfb4b5dc8158086621033637163 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 10 Apr 2025 16:24:30 +0200 Subject: [PATCH 05/17] tests: remove ignored `fair` field (#11521) Pageserver has been ignoring field `tenant_config.timeline_get_throttle.fair` for many monhts, since we removed it from the config struct in neondatabase/neon#8539. Refs - epic https://github.com/neondatabase/cloud/issues/27320 --- .../test_pageserver_getpage_throttle.py | 62 ------------------- 1 file changed, 62 deletions(-) diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py index 3d7204d883..bc186477e1 100644 --- a/test_runner/regress/test_pageserver_getpage_throttle.py +++ b/test_runner/regress/test_pageserver_getpage_throttle.py @@ -1,6 +1,5 @@ from __future__ import annotations -import copy import json import uuid from typing import TYPE_CHECKING @@ -44,7 +43,6 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P "refill_interval": "100ms", "refill_amount": int(rate_limit_rps / 10), "max": int(rate_limit_rps / 10), - "fair": True, }, }, ) @@ -139,63 +137,3 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P assert pytest.approx(actual_throttled_secs + actual_smgr_query_seconds, 0.1) == duration_secs, ( "most of the time in this test is spent throttled because the rate-limit's contribution to latency dominates" ) - - -throttle_config_with_field_fair_set = { - "task_kinds": ["PageRequestHandler"], - "fair": True, - "initial": 27, - "refill_interval": "43s", - "refill_amount": 23, - "max": 42, -} - - -def assert_throttle_config_with_field_fair_set(conf): - """ - Field `fair` is ignored, so, responses don't contain it - """ - without_fair = copy.deepcopy(throttle_config_with_field_fair_set) - without_fair.pop("fair") - - assert conf == without_fair - - -def test_throttle_fair_config_is_settable_but_ignored_in_mgmt_api(neon_env_builder: NeonEnvBuilder): - """ - To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out. - """ - env = neon_env_builder.init_start() - vps_http = env.storage_controller.pageserver_api() - # with_fair config should still be settable - vps_http.set_tenant_config( - env.initial_tenant, - {"timeline_get_throttle": throttle_config_with_field_fair_set}, - ) - conf = vps_http.tenant_config(env.initial_tenant) - assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"]) - assert_throttle_config_with_field_fair_set( - conf.tenant_specific_overrides["timeline_get_throttle"] - ) - - -def test_throttle_fair_config_is_settable_but_ignored_in_config_toml( - neon_env_builder: NeonEnvBuilder, -): - """ - To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out. - """ - - def set_tenant_config(ps_cfg): - tenant_config = ps_cfg.setdefault("tenant_config", {}) - tenant_config["timeline_get_throttle"] = throttle_config_with_field_fair_set - - neon_env_builder.pageserver_config_override = set_tenant_config - env = neon_env_builder.init_start() - ps_http = env.pageserver.http_client() - conf = ps_http.tenant_config(env.initial_tenant) - assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"]) - - env.pageserver.allowed_errors.append( - r'.*ignoring unknown configuration item path="tenant_config\.timeline_get_throttle\.fair"*' - ) From f06d721a989903d6e867629c0b2216a310419af2 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 10 Apr 2025 10:53:37 -0400 Subject: [PATCH 06/17] test(pageserver): ensure gc-compaction does not fire critical errors (#11513) ## Problem Part of https://github.com/neondatabase/neon/issues/10395 ## Summary of changes Add a test case to ensure gc-compaction doesn't fire any critical errors if the key history is invalid due to partial GC. Signed-off-by: Alex Chi Z --- pageserver/src/tenant.rs | 93 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d3623fc3b9..ad4a0d804d 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -11571,6 +11571,99 @@ mod tests { Ok(()) } + #[cfg(feature = "testing")] + #[tokio::test] + async fn test_bottom_most_compation_redo_failure() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_bottom_most_compation_redo_failure").await?; + let (tenant, ctx) = harness.load().await; + + fn get_key(id: u32) -> Key { + // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. + let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + let img_layer = (0..10) + .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) + .collect_vec(); + + let delta1 = vec![ + ( + get_key(1), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ( + get_key(1), + Lsn(0x24), + Value::WalRecord(NeonWalRecord::wal_append("@0x24")), + ), + ( + get_key(1), + Lsn(0x28), + // This record will fail to redo + Value::WalRecord(NeonWalRecord::wal_append_conditional("@0x28", "???")), + ), + ]; + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![], // in-memory layers + vec![DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + delta1, + )], // delta layers + vec![(Lsn(0x10), img_layer)], // image layers + Lsn(0x50), + ) + .await?; + { + tline + .applied_gc_cutoff_lsn + .lock_for_write() + .store_and_unlock(Lsn(0x30)) + .wait() + .await; + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + *guard = GcInfo { + retain_lsns: vec![], + cutoffs: GcCutoffs { + time: Lsn(0x30), + space: Lsn(0x30), + }, + leases: Default::default(), + within_ancestor_pitr: false, + }; + } + + let cancel = CancellationToken::new(); + + // Compaction will fail, but should not fire any critical error. + // Gc-compaction currently cannot figure out what keys are not in the keyspace during the compaction + // process. It will always try to redo the logs it reads and if it doesn't work, fail the entire + // compaction job. Tracked in . + let res = tline + .compact_with_gc( + &cancel, + CompactOptions { + compact_key_range: None, + compact_lsn_range: None, + ..Default::default() + }, + &ctx, + ) + .await; + assert!(res.is_err()); + + Ok(()) + } + #[cfg(feature = "testing")] #[tokio::test] async fn test_synthetic_size_calculation_with_invisible_branches() -> anyhow::Result<()> { From 5487a20b7224f81f50be62558a1037a254c4caf7 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 10 Apr 2025 16:28:28 +0100 Subject: [PATCH 07/17] compute: Set log_parameter=off for audit logging. (#11500) Log -> Base, pgaudit.log = 'ddl', pgaudit.log_parameter='off' Hipaa -> Extended. pgaudit.log = 'all, -misc', pgaudit.log_parameter='off' add new level Full: pgaudit.log='all', pgaudit.log_parameter='on' Keep old parameter names for compatibility, until cplane side changes are implemented and released. closes https://github.com/neondatabase/cloud/issues/27202 --- compute_tools/src/compute.rs | 28 ++++++++++++++++------------ compute_tools/src/config.rs | 18 +++++++++++------- compute_tools/src/spec_apply.rs | 4 ++-- libs/compute_api/src/spec.rs | 23 ++++++++++++++--------- 4 files changed, 43 insertions(+), 30 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 457ace85d1..06d5bbb9c5 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -634,19 +634,23 @@ impl ComputeNode { }); } - // Configure and start rsyslog for HIPAA if necessary - if let ComputeAudit::Hipaa = pspec.spec.audit_log_level { - let remote_endpoint = std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string()); - if remote_endpoint.is_empty() { - anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty"); + // Configure and start rsyslog for compliance audit logging + match pspec.spec.audit_log_level { + ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => { + let remote_endpoint = + std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string()); + if remote_endpoint.is_empty() { + anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty"); + } + + let log_directory_path = Path::new(&self.params.pgdata).join("log"); + let log_directory_path = log_directory_path.to_string_lossy().to_string(); + configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?; + + // Launch a background task to clean up the audit logs + launch_pgaudit_gc(log_directory_path); } - - let log_directory_path = Path::new(&self.params.pgdata).join("log"); - let log_directory_path = log_directory_path.to_string_lossy().to_string(); - configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?; - - // Launch a background task to clean up the audit logs - launch_pgaudit_gc(log_directory_path); + _ => {} } // Configure and start rsyslog for Postgres logs export diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 0eb8912b45..71c6123c3b 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -178,7 +178,7 @@ pub fn write_postgres_conf( // and don't allow the user or the control plane admin to change them. match spec.audit_log_level { ComputeAudit::Disabled => {} - ComputeAudit::Log => { + ComputeAudit::Log | ComputeAudit::Base => { writeln!(file, "# Managed by compute_ctl base audit settings: start")?; writeln!(file, "pgaudit.log='ddl,role'")?; // Disable logging of catalog queries to reduce the noise @@ -202,16 +202,20 @@ pub fn write_postgres_conf( } writeln!(file, "# Managed by compute_ctl base audit settings: end")?; } - ComputeAudit::Hipaa => { + ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => { writeln!( file, "# Managed by compute_ctl compliance audit settings: begin" )?; - // This log level is very verbose - // but this is necessary for HIPAA compliance. - // Exclude 'misc' category, because it doesn't contain anythig relevant. - writeln!(file, "pgaudit.log='all, -misc'")?; - writeln!(file, "pgaudit.log_parameter=on")?; + // Enable logging of parameters. + // This is very verbose and may contain sensitive data. + if spec.audit_log_level == ComputeAudit::Full { + writeln!(file, "pgaudit.log_parameter=on")?; + writeln!(file, "pgaudit.log='all'")?; + } else { + writeln!(file, "pgaudit.log_parameter=off")?; + writeln!(file, "pgaudit.log='all, -misc'")?; + } // Disable logging of catalog queries // The catalog doesn't contain sensitive data, so we don't need to audit it. writeln!(file, "pgaudit.log_catalog=off")?; diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs index e7d67f6ac5..0d1389dbad 100644 --- a/compute_tools/src/spec_apply.rs +++ b/compute_tools/src/spec_apply.rs @@ -278,12 +278,12 @@ impl ComputeNode { // so that all config operations are audit logged. match spec.audit_log_level { - ComputeAudit::Hipaa => { + ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => { phases.push(CreatePgauditExtension); phases.push(CreatePgauditlogtofileExtension); phases.push(DisablePostgresDBPgAudit); } - ComputeAudit::Log => { + ComputeAudit::Log | ComputeAudit::Base => { phases.push(CreatePgauditExtension); phases.push(DisablePostgresDBPgAudit); } diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 974159ac72..82950bcbaa 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -165,13 +165,7 @@ pub struct ComputeSpec { #[serde(default)] // Default false pub drop_subscriptions_before_start: bool, - /// Log level for audit logging: - /// - /// Disabled - no audit logging. This is the default. - /// log - log masked statements to the postgres log using pgaudit extension - /// hipaa - log unmasked statements to the file using pgaudit and pgauditlogtofile extension - /// - /// Extensions should be present in shared_preload_libraries + /// Log level for compute audit logging #[serde(default)] pub audit_log_level: ComputeAudit, @@ -295,14 +289,25 @@ impl ComputeMode { } /// Log level for audit logging -/// Disabled, log, hipaa -/// Default is Disabled #[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)] pub enum ComputeAudit { #[default] Disabled, + // Deprecated, use Base instead Log, + // (pgaudit.log = 'ddl', pgaudit.log_parameter='off') + // logged to the standard postgresql log stream + Base, + // Deprecated, use Full or Extended instead Hipaa, + // (pgaudit.log = 'all, -misc', pgaudit.log_parameter='off') + // logged to separate files collected by rsyslog + // into dedicated log storage with strict access + Extended, + // (pgaudit.log='all', pgaudit.log_parameter='on'), + // logged to separate files collected by rsyslog + // into dedicated log storage with strict access. + Full, } #[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)] From 52dee408dce3676dceb4d01c475d5e693bec2c6f Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 10 Apr 2025 17:55:37 +0100 Subject: [PATCH 08/17] storage controller: improve safety of shard splits coinciding with controller restarts (#11412) ## Problem The graceful leadership transfer process involves calling step_down on the old controller, but this was not waiting for shard splits to complete, and the new controller could therefore end up trying to abort a shard split while it was still going on. We mitigated this already in #11256 by avoiding the case where shard split completion would update the database incorrectly, but this was a fragile fix because it assumes that is the only problematic part of the split running concurrently. Precursors: - #11290 - #11256 Closes: #11254 ## Summary of changes - Hold the reconciler gate from shard splits, so that step_down will wait for them. Splits should always be fairly prompt, so it is okay to wait here. - Defense in depth: if step_down times out (hardcoded 10 second limit), then fully terminate the controller process rather than letting it continue running, potentially doing split-brainy things. This makes sense because the new controller will always declare itself leader unilaterally if step_down fails, so leaving an old controller running is not beneficial. - Tests: extend `test_storage_controller_leadership_transfer_during_split` to separately exercise the case of a split holding up step_down, and the case where the overall timeout on step_down is hit and the controller terminates. --- storage_controller/src/http.rs | 14 ++- storage_controller/src/service.rs | 51 +++++++-- .../regress/test_storage_controller.py | 107 ++++++++++++------ 3 files changed, 125 insertions(+), 47 deletions(-) diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 0d1dc8f8ee..4f3613b687 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -1235,8 +1235,18 @@ async fn handle_step_down(req: Request) -> Result, ApiError ForwardOutcome::NotForwarded(req) => req, }; - let state = get_state(&req); - json_response(StatusCode::OK, state.service.step_down().await) + // Spawn a background task: once we start stepping down, we must finish: if the client drops + // their request we should avoid stopping in some part-stepped-down state. + let handle = tokio::spawn(async move { + let state = get_state(&req); + state.service.step_down().await + }); + + let result = handle + .await + .map_err(|e| ApiError::InternalServerError(e.into()))?; + + json_response(StatusCode::OK, result) } async fn handle_tenant_drop(req: Request) -> Result, ApiError> { diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 2ef09cd2e3..4790f80162 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -61,7 +61,7 @@ use utils::completion::Barrier; use utils::generation::Generation; use utils::id::{NodeId, TenantId, TimelineId}; use utils::lsn::Lsn; -use utils::sync::gate::Gate; +use utils::sync::gate::{Gate, GateGuard}; use utils::{failpoint_support, pausable_failpoint}; use crate::background_node_operations::{ @@ -594,6 +594,8 @@ struct TenantShardSplitAbort { new_stripe_size: Option, /// Until this abort op is complete, no other operations may be done on the tenant _tenant_lock: TracingExclusiveGuard, + /// The reconciler gate for the duration of the split operation, and any included abort. + _gate: GateGuard, } #[derive(thiserror::Error, Debug)] @@ -1460,7 +1462,7 @@ impl Service { // Retry until shutdown: we must keep this request object alive until it is properly // processed, as it holds a lock guard that prevents other operations trying to do things // to the tenant while it is in a weird part-split state. - while !self.cancel.is_cancelled() { + while !self.reconcilers_cancel.is_cancelled() { match self.abort_tenant_shard_split(&op).await { Ok(_) => break, Err(e) => { @@ -1473,9 +1475,12 @@ impl Service { // when we retry, so that the abort op will succeed. If the abort op is failing // for some other reason, we will keep retrying forever, or until a human notices // and does something about it (either fixing a pageserver or restarting the controller). - tokio::time::timeout(Duration::from_secs(5), self.cancel.cancelled()) - .await - .ok(); + tokio::time::timeout( + Duration::from_secs(5), + self.reconcilers_cancel.cancelled(), + ) + .await + .ok(); } } } @@ -4910,7 +4915,7 @@ impl Service { 1, 10, Duration::from_secs(5), - &self.cancel, + &self.reconcilers_cancel, ) .await { @@ -5161,6 +5166,11 @@ impl Service { ) .await; + let _gate = self + .reconcilers_gate + .enter() + .map_err(|_| ApiError::ShuttingDown)?; + let new_shard_count = ShardCount::new(split_req.new_shard_count); let new_stripe_size = split_req.new_stripe_size; @@ -5188,6 +5198,7 @@ impl Service { new_shard_count, new_stripe_size, _tenant_lock, + _gate, }) // Ignore error sending: that just means we're shutting down: aborts are ephemeral so it's fine to drop it. .ok(); @@ -5527,7 +5538,10 @@ impl Service { "failpoint".to_string() ))); - failpoint_support::sleep_millis_async!("shard-split-post-remote-sleep", &self.cancel); + failpoint_support::sleep_millis_async!( + "shard-split-post-remote-sleep", + &self.reconcilers_cancel + ); tracing::info!( "Split {} into {}", @@ -5585,7 +5599,7 @@ impl Service { stripe_size, preferred_az: preferred_az_id.as_ref().map(Cow::Borrowed), }, - &self.cancel, + &self.reconcilers_cancel, ) .await { @@ -8670,9 +8684,24 @@ impl Service { failpoint_support::sleep_millis_async!("sleep-on-step-down-handling"); self.inner.write().unwrap().step_down(); - // TODO: would it make sense to have a time-out for this? - self.stop_reconciliations(StopReconciliationsReason::SteppingDown) - .await; + + // Wait for reconciliations to stop, or terminate this process if they + // fail to stop in time (this indicates a bug in shutdown) + tokio::select! { + _ = self.stop_reconciliations(StopReconciliationsReason::SteppingDown) => { + tracing::info!("Reconciliations stopped, proceeding with step down"); + } + _ = async { + failpoint_support::sleep_millis_async!("step-down-delay-timeout"); + tokio::time::sleep(Duration::from_secs(10)).await + } => { + tracing::warn!("Step down timed out while waiting for reconciliation gate, terminating process"); + + // The caller may proceed to act as leader when it sees this request fail: reduce the chance + // of a split-brain situation by terminating this controller instead of leaving it up in a partially-shut-down state. + std::process::exit(1); + } + } let mut global_observed = GlobalObservedState::default(); let locked = self.inner.read().unwrap(); diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index ce73c9a738..b2c8415e9a 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -2892,10 +2892,12 @@ def test_storage_controller_leadership_transfer( ) +@pytest.mark.parametrize("step_down_times_out", [False, True]) def test_storage_controller_leadership_transfer_during_split( neon_env_builder: NeonEnvBuilder, storage_controller_proxy: StorageControllerProxy, port_distributor: PortDistributor, + step_down_times_out: bool, ): """ Exercise a race between shard splitting and graceful leadership transfer. This is @@ -2936,6 +2938,18 @@ def test_storage_controller_leadership_transfer_during_split( ) env.storage_controller.reconcile_until_idle() + # We are testing scenarios where the step down API does not complete: either because it is stuck + # doing a shard split, or because it totally times out on some other failpoint. + env.storage_controller.allowed_errors.extend( + [ + ".*step_down.*request was dropped before completing.*", + ".*step_down.*operation timed out.*", + ".*Send step down request failed, will retry.*", + ".*Send step down request still failed after.*retries.*", + ".*Leader .+ did not respond to step-down request.*", + ] + ) + with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: # Start a shard split env.storage_controller.allowed_errors.extend( @@ -2943,6 +2957,14 @@ def test_storage_controller_leadership_transfer_during_split( ) pause_failpoint = "shard-split-pre-complete" env.storage_controller.configure_failpoints((pause_failpoint, "pause")) + + if not step_down_times_out: + # Prevent the timeout self-terminate code from executing: we will block step down on the + # shard split itself + env.storage_controller.configure_failpoints( + ("step-down-delay-timeout", "return(3600000)") + ) + split_fut = executor.submit( env.storage_controller.tenant_shard_split, list(tenants)[0], shard_count * 2 ) @@ -2961,12 +2983,20 @@ def test_storage_controller_leadership_transfer_during_split( timeout_in_seconds=30, instance_id=2, base_port=storage_controller_2_port ) + if step_down_times_out: + # Step down will time out, original controller will terminate itself + env.storage_controller.allowed_errors.extend([".*terminating process.*"]) + else: + # Step down does not time out: original controller hits its shard split completion + # code path and realises that it must not purge the parent shards from the database. + env.storage_controller.allowed_errors.extend([".*Enqueuing background abort.*"]) + def passed_split_abort(): try: log.info("Checking log for pattern...") - assert env.storage_controller.log_contains( - ".*Using observed state received from leader.*" - ) + # This log is indicative of entering startup_reconcile, which happens + # after the point we would abort shard splits + assert env.storage_controller.log_contains(".*Populating tenant shards.*") except Exception: log.exception("Failed to find pattern in log") raise @@ -2975,34 +3005,42 @@ def test_storage_controller_leadership_transfer_during_split( wait_until(passed_split_abort, interval=0.1, status_interval=1.0) assert env.storage_controller.log_contains(".*Aborting shard split.*") - # Proxy is still talking to original controller here: disable its pause failpoint so - # that its shard split can run to completion. - log.info("Disabling failpoint") - # Bypass the proxy: the python test HTTPServer is single threaded and still blocked - # on handling the shard split request. - env.storage_controller.request( - "PUT", - f"http://127.0.0.1:{storage_controller_1_port}/debug/v1/failpoints", - json=[{"name": "shard-split-pre-complete", "actions": "off"}], - headers=env.storage_controller.headers(TokenScope.ADMIN), - ) + if step_down_times_out: + # We will let the old controller hit a timeout path where it terminates itself, rather than + # completing step_down and trying to complete a shard split + def old_controller_terminated(): + assert env.storage_controller.log_contains(".*terminating process.*") - def previous_stepped_down(): - assert ( - env.storage_controller.get_leadership_status() - == StorageControllerLeadershipStatus.STEPPED_DOWN + wait_until(old_controller_terminated) + else: + # Proxy is still talking to original controller here: disable its pause failpoint so + # that its shard split can run to completion. + log.info("Disabling failpoint") + # Bypass the proxy: the python test HTTPServer is single threaded and still blocked + # on handling the shard split request. + env.storage_controller.request( + "PUT", + f"http://127.0.0.1:{storage_controller_1_port}/debug/v1/failpoints", + json=[{"name": "shard-split-pre-complete", "actions": "off"}], + headers=env.storage_controller.headers(TokenScope.ADMIN), ) - log.info("Awaiting step down") - wait_until(previous_stepped_down) + def previous_stepped_down(): + assert ( + env.storage_controller.get_leadership_status() + == StorageControllerLeadershipStatus.STEPPED_DOWN + ) - # Let the shard split complete: this may happen _after_ the replacement has come up - # and tried to clean up the databases - log.info("Unblocking & awaiting shard split") - with pytest.raises(Exception, match="Unexpected child shard count"): - # This split fails when it tries to persist results, because it encounters - # changes already made by the new controller's abort-on-startup - split_fut.result() + log.info("Awaiting step down") + wait_until(previous_stepped_down) + + # Let the shard split complete: this may happen _after_ the replacement has come up + # and tried to clean up the databases + log.info("Unblocking & awaiting shard split") + with pytest.raises(Exception, match="Unexpected child shard count"): + # This split fails when it tries to persist results, because it encounters + # changes already made by the new controller's abort-on-startup + split_fut.result() log.info("Routing to new leader") storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_2_port}") @@ -3020,13 +3058,14 @@ def test_storage_controller_leadership_transfer_during_split( env.storage_controller.wait_until_ready() env.storage_controller.consistency_check() - # Check that the stepped down instance forwards requests - # to the new leader while it's still running. - storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}") - env.storage_controller.tenant_shard_dump() - env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"}) - status = env.storage_controller.node_status(env.pageservers[0].id) - assert status["scheduling"] == "Pause" + if not step_down_times_out: + # Check that the stepped down instance forwards requests + # to the new leader while it's still running. + storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}") + env.storage_controller.tenant_shard_dump() + env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"}) + status = env.storage_controller.node_status(env.pageservers[0].id) + assert status["scheduling"] == "Pause" def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvBuilder): From 9c37bfc90abfb7aa653d8f3892d227ed16018492 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 10 Apr 2025 18:03:22 +0100 Subject: [PATCH 09/17] pageserver/tests: make image_layer_rewrite write less data (#11525) ## Problem This test is slow to execute, particularly if you're on a slow environment like vscode in a browser. Might have got much slower when we switched to direct IO? ## Summary of changes - Reduce the scale of the test by 10x, since there was nothing special about the original size. --- .../src/tenant/storage_layer/image_layer.rs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 3243b73942..72992e5031 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -1192,7 +1192,7 @@ mod test { // This key range contains several 0x8000 page stripes, only one of which belongs to shard zero let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); - let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0000002000").unwrap(); let range = input_start..input_end; // Build an image layer to filter @@ -1235,7 +1235,7 @@ mod test { let shard_identity = ShardIdentity::new( ShardNumber(shard_number), shard_count, - ShardStripeSize(0x8000), + ShardStripeSize(0x800), ) .unwrap(); let harness = TenantHarness::create_custom( @@ -1287,12 +1287,12 @@ mod test { // This exact size and those below will need updating as/when the layer encoding changes, but // should be deterministic for a given version of the format, as we used no randomness generating the input. - assert_eq!(original_size, 1597440); + assert_eq!(original_size, 122880); match shard_number { 0 => { // We should have written out just one stripe for our shard identity - assert_eq!(wrote_keys, 0x8000); + assert_eq!(wrote_keys, 0x800); let replacement = replacement.unwrap(); // We should have dropped some of the data @@ -1300,7 +1300,7 @@ mod test { assert!(replacement.metadata().file_size > 0); // Assert that we dropped ~3/4 of the data. - assert_eq!(replacement.metadata().file_size, 417792); + assert_eq!(replacement.metadata().file_size, 49152); } 1 => { // Shard 1 has no keys in our input range @@ -1309,19 +1309,19 @@ mod test { } 2 => { // Shard 2 has one stripes in the input range - assert_eq!(wrote_keys, 0x8000); + assert_eq!(wrote_keys, 0x800); let replacement = replacement.unwrap(); assert!(replacement.metadata().file_size < original_size); assert!(replacement.metadata().file_size > 0); - assert_eq!(replacement.metadata().file_size, 417792); + assert_eq!(replacement.metadata().file_size, 49152); } 3 => { // Shard 3 has two stripes in the input range - assert_eq!(wrote_keys, 0x10000); + assert_eq!(wrote_keys, 0x1000); let replacement = replacement.unwrap(); assert!(replacement.metadata().file_size < original_size); assert!(replacement.metadata().file_size > 0); - assert_eq!(replacement.metadata().file_size, 811008); + assert_eq!(replacement.metadata().file_size, 73728); } _ => unreachable!(), } From 342607473ad4afcee7e199de8ee2c133c23b73b9 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 10 Apr 2025 14:55:51 -0500 Subject: [PATCH 10/17] Make Endpoint::respec_deep() infinitely deep (#11527) Because it wasn't recursive, there was a limit to the depth of updates. This work is necessary because as we teach neon_local and compute_ctl that the content in --spec-path should match a similar structure we get from the control plane, the spec object itself will no longer be toplevel. It will be under the "spec" key. Signed-off-by: Tristan Partin --- test_runner/fixtures/neon_fixtures.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index ba8de1c01c..858d367abf 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -14,6 +14,7 @@ import threading import time import uuid from collections import defaultdict +from collections.abc import Mapping from contextlib import closing, contextmanager from dataclasses import dataclass from datetime import datetime @@ -4296,28 +4297,29 @@ class Endpoint(PgProtocol, LogUtils): def respec_deep(self, **kwargs: Any) -> None: """ - Update the endpoint.json file taking into account nested keys. - It does one level deep update. Should enough for most cases. - Distinct method from respec() to do not break existing functionality. + Update the spec.json file taking into account nested keys. + Distinct method from respec() to not break existing functionality. NOTE: This method also updates the spec.json file, not endpoint.json. We need it because neon_local also writes to spec.json, so intended use-case is i) start endpoint with some config, ii) respec_deep(), iii) call reconfigure() to apply the changes. """ + + def update(curr, patch): + for k, v in patch.items(): + if isinstance(v, Mapping): + curr[k] = update(curr.get(k, {}), v) + else: + curr[k] = v + return curr + config_path = os.path.join(self.endpoint_path(), "spec.json") with open(config_path) as f: data_dict: dict[str, Any] = json.load(f) log.debug("Current compute spec: %s", json.dumps(data_dict, indent=4)) - for key, value in kwargs.items(): - if isinstance(value, dict): - if key not in data_dict: - data_dict[key] = value - else: - data_dict[key] = {**data_dict[key], **value} - else: - data_dict[key] = value + update(data_dict, kwargs) with open(config_path, "w") as file: log.debug("Updating compute spec to: %s", json.dumps(data_dict, indent=4)) From 4c4e33bc2e70cb56fa796c380f58631ec6083fe6 Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Fri, 11 Apr 2025 10:11:35 +0400 Subject: [PATCH 11/17] storage: add http/https server and cert resover metrics (#11450) ## Problem We need to export some metrics about certs/connections to configure alerts and make sure that all HTTP requests are gone before turning https-only mode on. - Closes: https://github.com/neondatabase/cloud/issues/25526 ## Summary of changes - Add started connection and connection error metrics to http/https Server. - Add certificate expiration time and reload metrics to ReloadingCertificateResolver. --- Cargo.lock | 1 + libs/http-utils/Cargo.toml | 1 + libs/http-utils/src/server.rs | 43 +++++++++++++ libs/http-utils/src/tls_certs.rs | 106 ++++++++++++++++++++++++++++--- pageserver/src/bin/pageserver.rs | 1 + safekeeper/src/http/mod.rs | 1 + storage_controller/src/main.rs | 1 + test_runner/regress/test_ssl.py | 61 ++++++++++++++++++ 8 files changed, 207 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index aea8924f4f..5d2cdcea27 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2837,6 +2837,7 @@ dependencies = [ "utils", "uuid", "workspace_hack", + "x509-cert", ] [[package]] diff --git a/libs/http-utils/Cargo.toml b/libs/http-utils/Cargo.toml index 6d24ee352a..5f6578f76e 100644 --- a/libs/http-utils/Cargo.toml +++ b/libs/http-utils/Cargo.toml @@ -30,6 +30,7 @@ tokio.workspace = true tracing.workspace = true url.workspace = true uuid.workspace = true +x509-cert.workspace = true # to use tokio channels as streams, this is faster to compile than async_stream # why is it only here? no other crate should use it, streams are rarely needed. diff --git a/libs/http-utils/src/server.rs b/libs/http-utils/src/server.rs index 07fd56ac01..f93f71c962 100644 --- a/libs/http-utils/src/server.rs +++ b/libs/http-utils/src/server.rs @@ -4,6 +4,8 @@ use futures::StreamExt; use futures::stream::FuturesUnordered; use hyper0::Body; use hyper0::server::conn::Http; +use metrics::{IntCounterVec, register_int_counter_vec}; +use once_cell::sync::Lazy; use routerify::{RequestService, RequestServiceBuilder}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_rustls::TlsAcceptor; @@ -26,6 +28,24 @@ pub struct Server { tls_acceptor: Option, } +static CONNECTION_STARTED_COUNT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "http_server_connection_started_total", + "Number of established http/https connections", + &["scheme"] + ) + .expect("failed to define a metric") +}); + +static CONNECTION_ERROR_COUNT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "http_server_connection_errors_total", + "Number of occured connection errors by type", + &["type"] + ) + .expect("failed to define a metric") +}); + impl Server { pub fn new( request_service: Arc>, @@ -60,6 +80,15 @@ impl Server { false } + let tcp_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tcp"]); + let tls_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tls"]); + let http_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["http"]); + let https_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["https"]); + let panic_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["panic"]); + + let http_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["http"]); + let https_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["https"]); + let mut connections = FuturesUnordered::new(); loop { tokio::select! { @@ -67,6 +96,7 @@ impl Server { let (tcp_stream, remote_addr) = match stream { Ok(stream) => stream, Err(err) => { + tcp_error_cnt.inc(); if !suppress_io_error(&err) { info!("Failed to accept TCP connection: {err:#}"); } @@ -78,11 +108,18 @@ impl Server { let tls_acceptor = self.tls_acceptor.clone(); let cancel = cancel.clone(); + let tls_error_cnt = tls_error_cnt.clone(); + let http_error_cnt = http_error_cnt.clone(); + let https_error_cnt = https_error_cnt.clone(); + let http_connection_cnt = http_connection_cnt.clone(); + let https_connection_cnt = https_connection_cnt.clone(); + connections.push(tokio::spawn( async move { match tls_acceptor { Some(tls_acceptor) => { // Handle HTTPS connection. + https_connection_cnt.inc(); let tls_stream = tokio::select! { tls_stream = tls_acceptor.accept(tcp_stream) => tls_stream, _ = cancel.cancelled() => return, @@ -90,6 +127,7 @@ impl Server { let tls_stream = match tls_stream { Ok(tls_stream) => tls_stream, Err(err) => { + tls_error_cnt.inc(); if !suppress_io_error(&err) { info!(%remote_addr, "Failed to accept TLS connection: {err:#}"); } @@ -97,6 +135,7 @@ impl Server { } }; if let Err(err) = Self::serve_connection(tls_stream, service, cancel).await { + https_error_cnt.inc(); if !suppress_hyper_error(&err) { info!(%remote_addr, "Failed to serve HTTPS connection: {err:#}"); } @@ -104,7 +143,9 @@ impl Server { } None => { // Handle HTTP connection. + http_connection_cnt.inc(); if let Err(err) = Self::serve_connection(tcp_stream, service, cancel).await { + http_error_cnt.inc(); if !suppress_hyper_error(&err) { info!(%remote_addr, "Failed to serve HTTP connection: {err:#}"); } @@ -115,6 +156,7 @@ impl Server { } Some(conn) = connections.next() => { if let Err(err) = conn { + panic_error_cnt.inc(); error!("Connection panicked: {err:#}"); } } @@ -122,6 +164,7 @@ impl Server { // Wait for graceful shutdown of all connections. while let Some(conn) = connections.next().await { if let Err(err) = conn { + panic_error_cnt.inc(); error!("Connection panicked: {err:#}"); } } diff --git a/libs/http-utils/src/tls_certs.rs b/libs/http-utils/src/tls_certs.rs index 0c18d84d98..2799db78a6 100644 --- a/libs/http-utils/src/tls_certs.rs +++ b/libs/http-utils/src/tls_certs.rs @@ -3,11 +3,14 @@ use std::{sync::Arc, time::Duration}; use anyhow::Context; use arc_swap::ArcSwap; use camino::Utf8Path; +use metrics::{IntCounterVec, UIntGaugeVec, register_int_counter_vec, register_uint_gauge_vec}; +use once_cell::sync::Lazy; use rustls::{ - pki_types::{CertificateDer, PrivateKeyDer}, + pki_types::{CertificateDer, PrivateKeyDer, UnixTime}, server::{ClientHello, ResolvesServerCert}, sign::CertifiedKey, }; +use x509_cert::der::Reader; pub async fn load_cert_chain(filename: &Utf8Path) -> anyhow::Result>> { let cert_data = tokio::fs::read(filename) @@ -53,6 +56,76 @@ pub async fn load_certified_key( Ok(certified_key) } +/// rustls's CertifiedKey with extra parsed fields used for metrics. +struct ParsedCertifiedKey { + certified_key: CertifiedKey, + expiration_time: UnixTime, +} + +/// Parse expiration time from an X509 certificate. +fn parse_expiration_time(cert: &CertificateDer<'_>) -> anyhow::Result { + let parsed_cert = x509_cert::der::SliceReader::new(cert) + .context("Failed to parse cerficiate")? + .decode::() + .context("Failed to parse cerficiate")?; + + Ok(UnixTime::since_unix_epoch( + parsed_cert + .tbs_certificate + .validity + .not_after + .to_unix_duration(), + )) +} + +async fn load_and_parse_certified_key( + key_filename: &Utf8Path, + cert_filename: &Utf8Path, +) -> anyhow::Result { + let certified_key = load_certified_key(key_filename, cert_filename).await?; + let expiration_time = parse_expiration_time(certified_key.end_entity_cert()?)?; + Ok(ParsedCertifiedKey { + certified_key, + expiration_time, + }) +} + +static CERT_EXPIRATION_TIME: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "tls_certs_expiration_time_seconds", + "Expiration time of the loaded certificate since unix epoch in seconds", + &["resolver_name"] + ) + .expect("failed to define a metric") +}); + +static CERT_RELOAD_STARTED_COUNTER: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "tls_certs_reload_started_total", + "Number of certificate reload loop iterations started", + &["resolver_name"] + ) + .expect("failed to define a metric") +}); + +static CERT_RELOAD_UPDATED_COUNTER: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "tls_certs_reload_updated_total", + "Number of times the certificate was updated to the new one", + &["resolver_name"] + ) + .expect("failed to define a metric") +}); + +static CERT_RELOAD_FAILED_COUNTER: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "tls_certs_reload_failed_total", + "Number of times the certificate reload failed", + &["resolver_name"] + ) + .expect("failed to define a metric") +}); + /// Implementation of [`rustls::server::ResolvesServerCert`] which reloads certificates from /// the disk periodically. #[derive(Debug)] @@ -63,16 +136,28 @@ pub struct ReloadingCertificateResolver { impl ReloadingCertificateResolver { /// Creates a new Resolver by loading certificate and private key from FS and /// creating tokio::task to reload them with provided reload_period. + /// resolver_name is used as metric's label. pub async fn new( + resolver_name: &str, key_filename: &Utf8Path, cert_filename: &Utf8Path, reload_period: Duration, ) -> anyhow::Result> { + // Create metrics for current resolver. + let cert_expiration_time = CERT_EXPIRATION_TIME.with_label_values(&[resolver_name]); + let cert_reload_started_counter = + CERT_RELOAD_STARTED_COUNTER.with_label_values(&[resolver_name]); + let cert_reload_updated_counter = + CERT_RELOAD_UPDATED_COUNTER.with_label_values(&[resolver_name]); + let cert_reload_failed_counter = + CERT_RELOAD_FAILED_COUNTER.with_label_values(&[resolver_name]); + + let parsed_key = load_and_parse_certified_key(key_filename, cert_filename).await?; + let this = Arc::new(Self { - certified_key: ArcSwap::from_pointee( - load_certified_key(key_filename, cert_filename).await?, - ), + certified_key: ArcSwap::from_pointee(parsed_key.certified_key), }); + cert_expiration_time.set(parsed_key.expiration_time.as_secs()); tokio::spawn({ let weak_this = Arc::downgrade(&this); @@ -88,17 +173,22 @@ impl ReloadingCertificateResolver { Some(this) => this, None => break, // Resolver has been destroyed, exit. }; - match load_certified_key(&key_filename, &cert_filename).await { - Ok(new_certified_key) => { - if new_certified_key.cert == this.certified_key.load().cert { + cert_reload_started_counter.inc(); + + match load_and_parse_certified_key(&key_filename, &cert_filename).await { + Ok(parsed_key) => { + if parsed_key.certified_key.cert == this.certified_key.load().cert { tracing::debug!("Certificate has not changed since last reloading"); } else { tracing::info!("Certificate has been reloaded"); - this.certified_key.store(Arc::new(new_certified_key)); + this.certified_key.store(Arc::new(parsed_key.certified_key)); + cert_expiration_time.set(parsed_key.expiration_time.as_secs()); + cert_reload_updated_counter.inc(); } last_reload_failed = false; } Err(err) => { + cert_reload_failed_counter.inc(); // Note: Reloading certs may fail if it conflicts with the script updating // the files at the same time. Warn only if the error is persistent. if last_reload_failed { diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 2740f81758..250d4180f5 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -455,6 +455,7 @@ fn start_pageserver( let tls_server_config = if conf.listen_https_addr.is_some() || conf.enable_tls_page_service_api { let resolver = BACKGROUND_RUNTIME.block_on(ReloadingCertificateResolver::new( + "main", &conf.ssl_key_file, &conf.ssl_cert_file, conf.ssl_cert_reload_period, diff --git a/safekeeper/src/http/mod.rs b/safekeeper/src/http/mod.rs index 003a75faa6..6e7c5d971d 100644 --- a/safekeeper/src/http/mod.rs +++ b/safekeeper/src/http/mod.rs @@ -31,6 +31,7 @@ pub async fn task_main_https( global_timelines: Arc, ) -> anyhow::Result<()> { let cert_resolver = ReloadingCertificateResolver::new( + "main", &conf.ssl_key_file, &conf.ssl_cert_file, conf.ssl_cert_reload_period, diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 1aa9ae10ae..9358c9da4d 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -472,6 +472,7 @@ async fn async_main() -> anyhow::Result<()> { let https_listener = tcp_listener::bind(https_addr)?; let resolver = ReloadingCertificateResolver::new( + "main", &args.ssl_key_file, &args.ssl_cert_file, *args.ssl_cert_reload_period, diff --git a/test_runner/regress/test_ssl.py b/test_runner/regress/test_ssl.py index 9a7204ca17..39c94c05a9 100644 --- a/test_runner/regress/test_ssl.py +++ b/test_runner/regress/test_ssl.py @@ -1,5 +1,6 @@ import os import ssl +from datetime import datetime, timedelta import pytest import requests @@ -151,3 +152,63 @@ def test_certificate_rotation(neon_env_builder: NeonEnvBuilder): requests.get(addr, verify=str(env.ssl_ca_file)).raise_for_status() cur_cert = ssl.get_server_certificate(("localhost", port)) assert cur_cert == sk_cert + + +def test_server_and_cert_metrics(neon_env_builder: NeonEnvBuilder): + """ + Test metrics exported from http/https server and tls cert reloader. + """ + neon_env_builder.use_https_pageserver_api = True + neon_env_builder.pageserver_config_override = "ssl_cert_reload_period='100 ms'" + env = neon_env_builder.init_start() + + env.pageserver.allowed_errors.append(".*Error reloading certificate.*") + + ps_client = env.pageserver.http_client() + + # 1. Test connection started metric. + filter_https = {"scheme": "https"} + old_https_conn_count = ( + ps_client.get_metric_value("http_server_connection_started_total", filter_https) or 0 + ) + + addr = f"https://localhost:{env.pageserver.service_port.https}/v1/status" + requests.get(addr, verify=str(env.ssl_ca_file)).raise_for_status() + + new_https_conn_count = ( + ps_client.get_metric_value("http_server_connection_started_total", filter_https) or 0 + ) + # The counter should increase after the request, + # but it may increase by more than one because of storcon requests. + assert new_https_conn_count > old_https_conn_count + + # 2. Test tls connection error. + # Request without specified CA cert file should fail. + with pytest.raises(requests.exceptions.SSLError): + requests.get(addr) + + tls_error_cnt = ( + ps_client.get_metric_value("http_server_connection_errors_total", {"type": "tls"}) or 0 + ) + assert tls_error_cnt == 1 + + # 3. Test expiration time metric. + expiration_time = datetime.fromtimestamp( + ps_client.get_metric_value("tls_certs_expiration_time_seconds") or 0 + ) + now = datetime.now() + # neon_local generates certs valid for 100 years. + # Compare with +-1 year to not care about leap years. + assert now + timedelta(days=365 * 99) < expiration_time < now + timedelta(days=365 * 101) + + # 4. Test cert reload failed metric. + reload_error_cnt = ps_client.get_metric_value("tls_certs_reload_failed_total") + assert reload_error_cnt == 0 + + os.remove(env.pageserver.workdir / "server.crt") + + def reload_failed(): + reload_error_cnt = ps_client.get_metric_value("tls_certs_reload_failed_total") or 0 + assert reload_error_cnt > 0 + + wait_until(reload_failed) From 8884865bca564033b3e1632d088f21651f1a90be Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 11 Apr 2025 11:38:05 +0200 Subject: [PATCH 12/17] tests: make `test_pageserver_getpage_throttle` less flaky (#11482) # Refs - fixes https://github.com/neondatabase/neon/issues/11395 # Problem Since 2025-03-10, we have observed increased flakiness of `test_pageserver_getpage_throttle`. The test is timing-dependent by nature, and was hitting the ``` assert duration_secs >= 10 * actual_smgr_query_seconds, ( "smgr metrics should not include throttle wait time" ) ``` quite frequently. # Analysis These failures are not reproducible. In this PR's history is a commit that reran the test 100 times without requiring a single retry. In https://github.com/neondatabase/neon/issues/11395 there is a link to a query to the test results database. It shows that the flakiness was not constant, but rather episodic: 2025-03-{10,11,12,13} 2025-03-{19,20,21} 2025-03-31 and 2025-04-01. To me, this suggests variability in available CPU. # Solution The point of the offending assertion is to ensure that most of the request latency is spent on throttling, because testing of the throttling mechanism is the point of the test. The `10` magic number means at most 10% of mean latency may be spent on request processing. Ideally we would control the passage of time (virtual clock source) to make this test deterministic. But I don't see that happening in our regression test setup. So, this PR de-flakes the test as follows: - allot up to 66% of mean latency for request processing - increase duration from 10s to 20s, hoping to get better protection from momentary CPU spikes in noisy neighbor tests or VMs on the runner host As a drive-by, switch to `pytest.approx` and remove one self-test assertion I can't make sense of anymore. --- .../test_pageserver_getpage_throttle.py | 24 ++++++++----------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py index bc186477e1..5ef63e2fe9 100644 --- a/test_runner/regress/test_pageserver_getpage_throttle.py +++ b/test_runner/regress/test_pageserver_getpage_throttle.py @@ -15,7 +15,6 @@ if TYPE_CHECKING: from fixtures.neon_fixtures import NeonEnvBuilder, PgBin -@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/11395") def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): env = neon_env_builder.init_start() @@ -96,17 +95,12 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P _, marker_offset = wait_until(lambda: env.pageserver.assert_log_contains(marker, offset=None)) log.info("run pagebench") - duration_secs = 10 + duration_secs = 20 actual_ncompleted = run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs) log.info("validate the client is capped at the configured rps limit") expect_ncompleted = duration_secs * rate_limit_rps - delta_abs = abs(expect_ncompleted - actual_ncompleted) - threshold = 0.05 * expect_ncompleted - assert threshold / rate_limit_rps < 0.1 * duration_secs, ( - "test self-test: unrealistic expecations regarding precision in this test" - ) - assert delta_abs < 0.05 * expect_ncompleted, ( + assert pytest.approx(expect_ncompleted, 0.05) == actual_ncompleted, ( "the throttling deviates more than 5percent from the expectation" ) @@ -120,6 +114,7 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P timeout=compaction_period, ) + log.info("validate the metrics") smgr_query_seconds_post = ps_http.get_metric_value(smgr_metric_name, smgr_metrics_query) assert smgr_query_seconds_post is not None throttled_usecs_post = ps_http.get_metric_value(throttle_metric_name, throttle_metrics_query) @@ -128,12 +123,13 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P actual_throttled_usecs = throttled_usecs_post - throttled_usecs_pre actual_throttled_secs = actual_throttled_usecs / 1_000_000 - log.info("validate that the metric doesn't include throttle wait time") - assert duration_secs >= 10 * actual_smgr_query_seconds, ( - "smgr metrics should not include throttle wait time" + assert pytest.approx(actual_throttled_secs + actual_smgr_query_seconds, 0.1) == duration_secs, ( + "throttling and processing latency = total request time; this assert validates thi holds on average" ) - log.info("validate that the throttling wait time metrics is correct") - assert pytest.approx(actual_throttled_secs + actual_smgr_query_seconds, 0.1) == duration_secs, ( - "most of the time in this test is spent throttled because the rate-limit's contribution to latency dominates" + # without this assertion, the test would pass even if the throttling was completely broken + # but the request processing is so slow that it makes up for the latency that a correct throttling + # implementation would add + assert actual_smgr_query_seconds < 0.66 * duration_secs, ( + "test self-test: request processing is consuming most of the wall clock time; this risks that we're not actually testing throttling" ) From 979fa0682b09bcbac5d74d2f1082a12c173c676f Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 11 Apr 2025 11:55:49 +0200 Subject: [PATCH 13/17] tests: update batching perf test workload to include scattered LSNs (#11391) The batching perf test workload is currently read-only sequential scans. However, realistic workloads have concurrent writes (to other pages) going on. This PR simulates concurrent writes to other pages by emitting logical replication messages. These degrade the achieved batching factor, for the reason see - https://github.com/neondatabase/neon/issues/10765 PR - https://github.com/neondatabase/neon/pull/11494 will fix this problem and get batching factor back up. --------- Co-authored-by: Vlad Lazar --- test_runner/fixtures/neon_fixtures.py | 3 +- .../pageserver/test_page_service_batching.py | 88 +++++++++++++------ 2 files changed, 65 insertions(+), 26 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 858d367abf..75a0596f58 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2884,13 +2884,14 @@ class NeonPageserver(PgProtocol, LogUtils): self, immediate: bool = False, timeout_in_seconds: int | None = None, + extra_env_vars: dict[str, str] | None = None, ): """ High level wrapper for restart: restarts the process, and waits for tenant state to stabilize. """ self.stop(immediate=immediate) - self.start(timeout_in_seconds=timeout_in_seconds) + self.start(timeout_in_seconds=timeout_in_seconds, extra_env_vars=extra_env_vars) self.quiesce_tenants() def quiesce_tenants(self): diff --git a/test_runner/performance/pageserver/test_page_service_batching.py b/test_runner/performance/pageserver/test_page_service_batching.py index 2c27368001..5169add6cb 100644 --- a/test_runner/performance/pageserver/test_page_service_batching.py +++ b/test_runner/performance/pageserver/test_page_service_batching.py @@ -1,5 +1,8 @@ +import concurrent.futures import dataclasses import json +import re +import threading import time from dataclasses import dataclass from pathlib import Path @@ -31,15 +34,15 @@ class PageServicePipeliningConfigPipelined(PageServicePipeliningConfig): mode: str = "pipelined" -EXECUTION = ["concurrent-futures", "tasks"] +EXECUTION = ["concurrent-futures"] NON_BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()] for max_batch_size in [1, 32]: for execution in EXECUTION: NON_BATCHABLE.append(PageServicePipeliningConfigPipelined(max_batch_size, execution)) -BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()] -for max_batch_size in [1, 2, 4, 8, 16, 32]: +BATCHABLE: list[PageServicePipeliningConfig] = [] +for max_batch_size in [32]: for execution in EXECUTION: BATCHABLE.append(PageServicePipeliningConfigPipelined(max_batch_size, execution)) @@ -47,19 +50,6 @@ for max_batch_size in [1, 2, 4, 8, 16, 32]: @pytest.mark.parametrize( "tablesize_mib, pipelining_config, target_runtime, effective_io_concurrency, readhead_buffer_size, name", [ - # non-batchable workloads - # (A separate benchmark will consider latency). - *[ - ( - 50, - config, - TARGET_RUNTIME, - 1, - 128, - f"not batchable {dataclasses.asdict(config)}", - ) - for config in NON_BATCHABLE - ], # batchable workloads should show throughput and CPU efficiency improvements *[ ( @@ -137,7 +127,14 @@ def test_throughput( env = neon_env_builder.init_start() ps_http = env.pageserver.http_client() - endpoint = env.endpoints.create_start("main") + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + # minimal lfc & small shared buffers to force requests to pageserver + "neon.max_file_cache_size=1MB", + "shared_buffers=10MB", + ], + ) conn = endpoint.connect() cur = conn.cursor() @@ -155,7 +152,6 @@ def test_throughput( tablesize = tablesize_mib * 1024 * 1024 npages = tablesize // (8 * 1024) cur.execute("INSERT INTO t SELECT generate_series(1, %s)", (npages,)) - # TODO: can we force postgres to do sequential scans? # # Run the workload, collect `Metrics` before and after, calculate difference, normalize. @@ -211,31 +207,73 @@ def test_throughput( ).value, ) - def workload() -> Metrics: + def workload(disruptor_started: threading.Event) -> Metrics: + disruptor_started.wait() start = time.time() iters = 0 while time.time() - start < target_runtime or iters < 2: - log.info("Seqscan %d", iters) if iters == 1: # round zero for warming up before = get_metrics() - cur.execute( - "select clear_buffer_cache()" - ) # TODO: what about LFC? doesn't matter right now because LFC isn't enabled by default in tests cur.execute("select sum(data::bigint) from t") assert cur.fetchall()[0][0] == npages * (npages + 1) // 2 iters += 1 after = get_metrics() return (after - before).normalize(iters - 1) + def disruptor(disruptor_started: threading.Event, stop_disruptor: threading.Event): + conn = endpoint.connect() + cur = conn.cursor() + iters = 0 + while True: + cur.execute("SELECT pg_logical_emit_message(true, 'test', 'advancelsn')") + if stop_disruptor.is_set(): + break + disruptor_started.set() + iters += 1 + time.sleep(0.001) + return iters + env.pageserver.patch_config_toml_nonrecursive( {"page_service_pipelining": dataclasses.asdict(pipelining_config)} ) - env.pageserver.restart() - metrics = workload() + + # set trace for log analysis below + env.pageserver.restart(extra_env_vars={"RUST_LOG": "info,pageserver::page_service=trace"}) + + log.info("Starting workload") + + with concurrent.futures.ThreadPoolExecutor() as executor: + disruptor_started = threading.Event() + stop_disruptor = threading.Event() + disruptor_fut = executor.submit(disruptor, disruptor_started, stop_disruptor) + workload_fut = executor.submit(workload, disruptor_started) + metrics = workload_fut.result() + stop_disruptor.set() + ndisruptions = disruptor_fut.result() + log.info("Disruptor issued %d disrupting requests", ndisruptions) log.info("Results: %s", metrics) + since_last_start: list[str] = [] + for line in env.pageserver.logfile.read_text().splitlines(): + if "git:" in line: + since_last_start = [] + since_last_start.append(line) + + stopping_batching_because_re = re.compile( + r"stopping batching because (LSN changed|of batch size|timeline object mismatch|batch key changed|same page was requested at different LSNs|.*)" + ) + reasons_for_stopping_batching = {} + for line in since_last_start: + match = stopping_batching_because_re.search(line) + if match: + if match.group(1) not in reasons_for_stopping_batching: + reasons_for_stopping_batching[match.group(1)] = 0 + reasons_for_stopping_batching[match.group(1)] += 1 + + log.info("Reasons for stopping batching: %s", reasons_for_stopping_batching) + # # Sanity-checks on the collected data # From 3c8565a1941b3ad4cfb0c1de068d7e4bec7baf0a Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 11 Apr 2025 13:31:12 +0200 Subject: [PATCH 14/17] test_runner: propagate config via `attach_hook` for test fix (#11529) ## Problem The `pagebench` benchmarks set up an initial dataset by creating a template tenant, copying the remote storage to a bunch of new tenants, and attaching them to Pageservers. In #11420, we found that `test_pageserver_characterize_throughput_with_n_tenants` had degraded performance because it set a custom tenant config in Pageservers that was then replaced with the default tenant config by the storage controller. The initial fix was to register the tenants directly in the storage controller, but this created the tenants with generation 1. This broke `test_basebackup_with_high_slru_count`, where the template tenant was at generation 2, leading to all layer files at generation 2 being ignored. Resolves #11485. Touches #11381. ## Summary of changes This patch addresses both test issues by modifying `attach_hook` to also take a custom tenant config. This allows attaching tenants to Pageservers from pre-existing remote storage, specifying both the generation and tenant config when registering them in the storage controller. --- control_plane/src/storage_controller.rs | 8 +++++-- storage_controller/src/service.rs | 24 ++++++++++--------- test_runner/fixtures/neon_fixtures.py | 12 ++++++---- .../fixtures/pageserver/many_tenants.py | 12 ++++------ .../pagebench/test_large_slru_basebackup.py | 12 ++++------ 5 files changed, 36 insertions(+), 32 deletions(-) diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 8000576e87..a4b56ae5c0 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -13,7 +13,9 @@ use pageserver_api::controller_api::{ NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest, TenantCreateResponse, TenantLocateResponse, }; -use pageserver_api::models::{TenantConfigRequest, TimelineCreateRequest, TimelineInfo}; +use pageserver_api::models::{ + TenantConfig, TenantConfigRequest, TimelineCreateRequest, TimelineInfo, +}; use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api::ResponseErrorMessageExt; use postgres_backend::AuthType; @@ -82,7 +84,8 @@ impl NeonStorageControllerStopArgs { pub struct AttachHookRequest { pub tenant_shard_id: TenantShardId, pub node_id: Option, - pub generation_override: Option, + pub generation_override: Option, // only new tenants + pub config: Option, // only new tenants } #[derive(Serialize, Deserialize)] @@ -805,6 +808,7 @@ impl StorageController { tenant_shard_id, node_id: Some(pageserver_id), generation_override: None, + config: None, }; let response = self diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 4790f80162..0982e56155 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1852,6 +1852,7 @@ impl Service { }; if insert { + let config = attach_req.config.clone().unwrap_or_default(); let tsp = TenantShardPersistence { tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(), shard_number: attach_req.tenant_shard_id.shard_number.0 as i32, @@ -1860,7 +1861,7 @@ impl Service { generation: attach_req.generation_override.or(Some(0)), generation_pageserver: None, placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(), - config: serde_json::to_string(&TenantConfig::default()).unwrap(), + config: serde_json::to_string(&config).unwrap(), splitting: SplitState::default(), scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) .unwrap(), @@ -1883,16 +1884,16 @@ impl Service { Ok(()) => { tracing::info!("Inserted shard {} in database", attach_req.tenant_shard_id); - let mut locked = self.inner.write().unwrap(); - locked.tenants.insert( + let mut shard = TenantShard::new( attach_req.tenant_shard_id, - TenantShard::new( - attach_req.tenant_shard_id, - ShardIdentity::unsharded(), - PlacementPolicy::Attached(0), - None, - ), + ShardIdentity::unsharded(), + PlacementPolicy::Attached(0), + None, ); + shard.config = config; + + let mut locked = self.inner.write().unwrap(); + locked.tenants.insert(attach_req.tenant_shard_id, shard); tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id); } } @@ -1977,11 +1978,12 @@ impl Service { .set_attached(scheduler, attach_req.node_id); tracing::info!( - "attach_hook: tenant {} set generation {:?}, pageserver {}", + "attach_hook: tenant {} set generation {:?}, pageserver {}, config {:?}", attach_req.tenant_shard_id, tenant_shard.generation, // TODO: this is an odd number of 0xf's - attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff)) + attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff)), + attach_req.config, ); // Trick the reconciler into not doing anything for this tenant: this helps diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 75a0596f58..9d4068b583 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1986,10 +1986,13 @@ class NeonStorageController(MetricsGetter, LogUtils): tenant_shard_id: TenantId | TenantShardId, pageserver_id: int, generation_override: int | None = None, + config: None | dict[str, Any] = None, ) -> int: body = {"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id} if generation_override is not None: body["generation_override"] = generation_override + if config is not None: + body["config"] = config response = self.request( "POST", @@ -2980,11 +2983,12 @@ class NeonPageserver(PgProtocol, LogUtils): to call into the pageserver HTTP client. """ client = self.http_client() - if generation is None: - generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id) - elif override_storage_controller_generation: + if generation is None or override_storage_controller_generation: generation = self.env.storage_controller.attach_hook_issue( - tenant_id, self.id, generation + tenant_id, + self.id, + generation_override=generation if override_storage_controller_generation else None, + config=config, ) return client.tenant_attach( tenant_id, diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py index eedb693e3d..71c750b9eb 100644 --- a/test_runner/fixtures/pageserver/many_tenants.py +++ b/test_runner/fixtures/pageserver/many_tenants.py @@ -65,13 +65,11 @@ def single_timeline( assert ps_http.tenant_list() == [] def attach(tenant): - # NB: create the new tenant in the storage controller with the correct tenant config. This - # will pick up the existing tenant data from remote storage. If we just attach it to the - # Pageserver, the storage controller will reset the tenant config to the default. - env.create_tenant( - tenant_id=tenant, - timeline_id=template_timeline, - conf=template_config, + env.pageserver.tenant_attach( + tenant, + config=template_config, + generation=100, + override_storage_controller_generation=True, ) with concurrent.futures.ThreadPoolExecutor(max_workers=22) as executor: diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py index efd423104d..8af52dcbd0 100644 --- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py +++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py @@ -66,11 +66,11 @@ def test_basebackup_with_high_slru_count( n_txns = 500000 - def setup_wrapper(env: NeonEnv): - return setup_tenant_template(env, n_txns) - env = setup_pageserver_with_tenants( - neon_env_builder, f"large_slru_count-{n_tenants}-{n_txns}", n_tenants, setup_wrapper + neon_env_builder, + f"large_slru_count-{n_tenants}-{n_txns}", + n_tenants, + lambda env: setup_tenant_template(env, n_txns), ) run_benchmark(env, pg_bin, record, duration) @@ -80,10 +80,6 @@ def setup_tenant_template(env: NeonEnv, n_txns: int): "gc_period": "0s", # disable periodic gc "checkpoint_timeout": "10 years", "compaction_period": "0s", # disable periodic compaction - "compaction_threshold": 10, - "compaction_target_size": 134217728, - "checkpoint_distance": 268435456, - "image_creation_threshold": 3, } template_tenant, template_timeline = env.create_tenant(set_default=True) From a6937a3281e7431c46e31720d1e3337f96b867c8 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 11 Apr 2025 14:14:08 +0200 Subject: [PATCH 15/17] pageserver: improve shard ancestor compaction logging (#11535) ## Problem Shard ancestor compaction always logs "starting shard ancestor compaction", even if there is no work to do. This is very spammy (every 20 seconds for every shard). It also has limited progress logging. ## Summary of changes * Only log "starting shard ancestor compaction" when there's work to do. * Include details about the amount of work. * Log progress messages for each layer, and when waiting for uploads. * Log when compaction is completed, with elapsed duration and whether there is more work for a later iteration. --- pageserver/src/tenant/timeline/compaction.rs | 52 +++++++++++++++----- 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 7b1969f209..c6f0e32494 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1140,6 +1140,7 @@ impl Timeline { ) -> Result<(), CompactionError> { let mut drop_layers = Vec::new(); let mut layers_to_rewrite: Vec = Vec::new(); + let mut rewrite_max_exceeded: bool = false; // We will use the Lsn cutoff of the last GC as a threshold for rewriting layers: if a // layer is behind this Lsn, it indicates that the layer is being retained beyond the @@ -1148,12 +1149,7 @@ impl Timeline { // Holding this read guard also blocks [`Self::gc_timeline`] from entering while we // are rewriting layers. let latest_gc_cutoff = self.get_applied_gc_cutoff_lsn(); - - tracing::info!( - "starting shard ancestor compaction, latest_gc_cutoff: {}, pitr cutoff {}", - *latest_gc_cutoff, - self.gc_info.read().unwrap().cutoffs.time - ); + let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.time; let layers = self.layers.read().await; for layer_desc in layers.layer_map()?.iter_historic_layers() { @@ -1171,8 +1167,8 @@ impl Timeline { // This ancestral layer only covers keys that belong to other shards. // We include the full metadata in the log: if we had some critical bug that caused // us to incorrectly drop layers, this would simplify manually debugging + reinstating those layers. - info!(%layer, old_metadata=?layer.metadata(), - "dropping layer after shard split, contains no keys for this shard.", + debug!(%layer, old_metadata=?layer.metadata(), + "dropping layer after shard split, contains no keys for this shard", ); if cfg!(debug_assertions) { @@ -1234,9 +1230,10 @@ impl Timeline { } if layers_to_rewrite.len() >= rewrite_max { - tracing::info!(%layer, "Will rewrite layer on a future compaction, already rewrote {}", + debug!(%layer, "Will rewrite layer on a future compaction, already rewrote {}", layers_to_rewrite.len() ); + rewrite_max_exceeded = true; continue; } @@ -1244,9 +1241,24 @@ impl Timeline { layers_to_rewrite.push(layer); } - // Drop read lock on layer map before we start doing time-consuming I/O + // Drop read lock on layer map before we start doing time-consuming I/O. drop(layers); + // Drop out early if there's nothing to do. + if layers_to_rewrite.is_empty() && drop_layers.is_empty() { + return Ok(()); + } + + info!( + "starting shard ancestor compaction, rewriting {} layers and dropping {} layers \ + (latest_gc_cutoff={} pitr_cutoff={})", + layers_to_rewrite.len(), + drop_layers.len(), + *latest_gc_cutoff, + pitr_cutoff, + ); + let started = Instant::now(); + let mut replace_image_layers = Vec::new(); for layer in layers_to_rewrite { @@ -1254,7 +1266,7 @@ impl Timeline { return Err(CompactionError::ShuttingDown); } - tracing::info!(layer=%layer, "Rewriting layer after shard split..."); + info!(layer=%layer, "rewriting layer after shard split"); let mut image_layer_writer = ImageLayerWriter::new( self.conf, self.timeline_id, @@ -1292,7 +1304,7 @@ impl Timeline { .map_err(CompactionError::Other)?; let new_layer = Layer::finish_creating(self.conf, self, desc, &path) .map_err(CompactionError::Other)?; - tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes", + info!(layer=%new_layer, "rewrote layer, {} -> {} bytes", layer.metadata().file_size, new_layer.metadata().file_size); @@ -1304,6 +1316,12 @@ impl Timeline { } } + for layer in &drop_layers { + info!(%layer, old_metadata=?layer.metadata(), + "dropping layer after shard split (no keys for this shard)", + ); + } + // At this point, we have replaced local layer files with their rewritten form, but not yet uploaded // metadata to reflect that. If we restart here, the replaced layer files will look invalid (size mismatch // to remote index) and be removed. This is inefficient but safe. @@ -1319,6 +1337,7 @@ impl Timeline { // necessary for correctness, but it simplifies testing, and avoids proceeding with another // Timeline's compaction while this timeline's uploads may be generating lots of disk I/O // load. + info!("shard ancestor compaction waiting for uploads"); match self.remote_client.wait_completion().await { Ok(()) => (), Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)), @@ -1327,6 +1346,15 @@ impl Timeline { } } + info!( + "shard ancestor compaction done in {:.3}s{}", + started.elapsed().as_secs_f64(), + match rewrite_max_exceeded { + true => format!(", more work pending due to rewrite_max={rewrite_max}"), + false => String::new(), + } + ); + fail::fail_point!("compact-shard-ancestors-persistent"); Ok(()) From 88f01c1ca1738cc0843f250150dc7e4071e43a2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 11 Apr 2025 16:08:46 +0200 Subject: [PATCH 16/17] Introduce WalIngestError (#11506) Introduces a `WalIngestError` struct together with a `WalIngestErrorKind` enum, to be used for walingest related failures and errors. * the enum captures backtraces, so we don't regress in comparison to `anyhow::Error`s (backtraces might be a bit shorter if we use one of the `anyhow::Error` wrappers) * it explicitly lists most/all of the potential cases that can occur. I've originally been inspired to do this in #11496, but it's a longer-term TODO. --- libs/pageserver_api/src/key.rs | 15 +- pageserver/src/http/routes.rs | 2 +- pageserver/src/import_datadir.rs | 8 +- pageserver/src/pgdatadir_mapping.rs | 170 ++++++++---------- .../walreceiver/connection_manager.rs | 1 + .../walreceiver/walreceiver_connection.rs | 11 +- pageserver/src/walingest.rs | 154 ++++++++++++---- 7 files changed, 229 insertions(+), 132 deletions(-) diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 8836e7ec87..0c4d7fd4cb 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -927,7 +927,7 @@ impl Key { /// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`. #[inline(always)] - pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> { + pub fn to_rel_block(self) -> Result<(RelTag, BlockNumber), ToRelBlockError> { Ok(match self.field1 { 0x00 => ( RelTag { @@ -938,7 +938,7 @@ impl Key { }, self.field6, ), - _ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1), + _ => return Err(ToRelBlockError(self.field1)), }) } } @@ -951,6 +951,17 @@ impl std::str::FromStr for Key { } } +#[derive(Debug)] +pub struct ToRelBlockError(u8); + +impl fmt::Display for ToRelBlockError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "unexpected value kind 0x{:02x}", self.0) + } +} + +impl std::error::Error for ToRelBlockError {} + #[cfg(test)] mod tests { use std::str::FromStr; diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 9bb761dc48..bbc4bfae1b 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -3253,7 +3253,7 @@ async fn ingest_aux_files( modification .put_file(&fname, content.as_bytes(), &ctx) .await - .map_err(ApiError::InternalServerError)?; + .map_err(|e| ApiError::InternalServerError(e.into()))?; } modification .commit(&ctx) diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 6dd005de50..911449c7c5 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -27,7 +27,7 @@ use crate::context::RequestContext; use crate::metrics::WAL_INGEST; use crate::pgdatadir_mapping::*; use crate::tenant::Timeline; -use crate::walingest::WalIngest; +use crate::walingest::{WalIngest, WalIngestErrorKind}; // Returns checkpoint LSN from controlfile pub fn get_lsn_from_controlfile(path: &Utf8Path) -> Result { @@ -157,9 +157,9 @@ async fn import_rel( .put_rel_creation(rel, nblocks as u32, ctx) .await { - match e { - RelationError::AlreadyExists => { - debug!("Relation {} already exist. We must be extending it.", rel) + match e.kind { + WalIngestErrorKind::RelationAlreadyExists(rel) => { + debug!("Relation {rel} already exists. We must be extending it.") } _ => return Err(e.into()), } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 4c5a07ba57..f33a8baec1 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -9,8 +9,9 @@ use std::collections::{BTreeMap, HashMap, HashSet, hash_map}; use std::ops::{ControlFlow, Range}; -use crate::PERF_TRACE_TARGET; -use anyhow::{Context, ensure}; +use crate::walingest::{WalIngestError, WalIngestErrorKind}; +use crate::{PERF_TRACE_TARGET, ensure_walingest}; +use anyhow::Context; use bytes::{Buf, Bytes, BytesMut}; use enum_map::Enum; use itertools::Itertools; @@ -136,12 +137,8 @@ impl From for CalculateLogicalSizeError { #[derive(Debug, thiserror::Error)] pub enum RelationError { - #[error("Relation Already Exists")] - AlreadyExists, #[error("invalid relnode")] InvalidRelnode, - #[error(transparent)] - Other(#[from] anyhow::Error), } /// @@ -1478,8 +1475,8 @@ impl DatadirModification<'_> { } /// Set the current lsn - pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> { - ensure!( + pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> Result<(), WalIngestError> { + ensure_walingest!( lsn >= self.lsn, "setting an older lsn {} than {} is not allowed", lsn, @@ -1578,7 +1575,7 @@ impl DatadirModification<'_> { &mut self, rel: RelTag, ctx: &RequestContext, - ) -> Result { + ) -> Result { // Get current size and put rel creation if rel doesn't exist // // NOTE: we check the cache first even though get_rel_exists and get_rel_size would @@ -1593,14 +1590,13 @@ impl DatadirModification<'_> { .await? { // create it with 0 size initially, the logic below will extend it - self.put_rel_creation(rel, 0, ctx) - .await - .context("Relation Error")?; + self.put_rel_creation(rel, 0, ctx).await?; Ok(0) } else { - self.tline + Ok(self + .tline .get_rel_size(rel, Version::Modified(self), ctx) - .await + .await?) } } @@ -1637,11 +1633,14 @@ impl DatadirModification<'_> { // TODO(vlad): remove this argument and replace the shard check with is_key_local shard: &ShardIdentity, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let mut gaps_at_lsns = Vec::default(); for meta in batch.metadata.iter() { - let (rel, blkno) = Key::from_compact(meta.key()).to_rel_block()?; + let key = Key::from_compact(meta.key()); + let (rel, blkno) = key + .to_rel_block() + .map_err(|_| WalIngestErrorKind::InvalidKey(key, meta.lsn()))?; let new_nblocks = blkno + 1; let old_nblocks = self.create_relation_if_required(rel, ctx).await?; @@ -1683,8 +1682,8 @@ impl DatadirModification<'_> { rel: RelTag, blknum: BlockNumber, rec: NeonWalRecord, - ) -> anyhow::Result<()> { - anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); + ) -> Result<(), WalIngestError> { + ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode); self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec)); Ok(()) } @@ -1696,7 +1695,7 @@ impl DatadirModification<'_> { segno: u32, blknum: BlockNumber, rec: NeonWalRecord, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { if !self.tline.tenant_shard_id.is_shard_zero() { return Ok(()); } @@ -1714,14 +1713,11 @@ impl DatadirModification<'_> { rel: RelTag, blknum: BlockNumber, img: Bytes, - ) -> anyhow::Result<()> { - anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); + ) -> Result<(), WalIngestError> { + ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode); let key = rel_block_to_key(rel, blknum); if !key.is_valid_key_on_write_path() { - anyhow::bail!( - "the request contains data not supported by pageserver at {}", - key - ); + Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?; } self.put(rel_block_to_key(rel, blknum), Value::Image(img)); Ok(()) @@ -1733,15 +1729,12 @@ impl DatadirModification<'_> { segno: u32, blknum: BlockNumber, img: Bytes, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { assert!(self.tline.tenant_shard_id.is_shard_zero()); let key = slru_block_to_key(kind, segno, blknum); if !key.is_valid_key_on_write_path() { - anyhow::bail!( - "the request contains data not supported by pageserver at {}", - key - ); + Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?; } self.put(key, Value::Image(img)); Ok(()) @@ -1751,15 +1744,11 @@ impl DatadirModification<'_> { &mut self, rel: RelTag, blknum: BlockNumber, - ) -> anyhow::Result<()> { - anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); + ) -> Result<(), WalIngestError> { + ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode); let key = rel_block_to_key(rel, blknum); if !key.is_valid_key_on_write_path() { - anyhow::bail!( - "the request contains data not supported by pageserver: {} @ {}", - key, - self.lsn - ); + Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?; } let batch = self @@ -1776,15 +1765,11 @@ impl DatadirModification<'_> { kind: SlruKind, segno: u32, blknum: BlockNumber, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { assert!(self.tline.tenant_shard_id.is_shard_zero()); let key = slru_block_to_key(kind, segno, blknum); if !key.is_valid_key_on_write_path() { - anyhow::bail!( - "the request contains data not supported by pageserver: {} @ {}", - key, - self.lsn - ); + Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?; } let batch = self @@ -1832,8 +1817,10 @@ impl DatadirModification<'_> { dbnode: Oid, img: Bytes, ctx: &RequestContext, - ) -> anyhow::Result<()> { - let v2_enabled = self.maybe_enable_rel_size_v2()?; + ) -> Result<(), WalIngestError> { + let v2_enabled = self + .maybe_enable_rel_size_v2() + .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?; // Add it to the directory (if it doesn't exist already) let buf = self.get(DBDIR_KEY, ctx).await?; @@ -1874,13 +1861,13 @@ impl DatadirModification<'_> { xid: u64, img: Bytes, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { // Add it to the directory entry let dirbuf = self.get(TWOPHASEDIR_KEY, ctx).await?; let newdirbuf = if self.tline.pg_version >= 17 { let mut dir = TwoPhaseDirectoryV17::des(&dirbuf)?; if !dir.xids.insert(xid) { - anyhow::bail!("twophase file for xid {} already exists", xid); + Err(WalIngestErrorKind::FileAlreadyExists(xid))?; } self.pending_directory_entries.push(( DirectoryKind::TwoPhase, @@ -1891,7 +1878,7 @@ impl DatadirModification<'_> { let xid = xid as u32; let mut dir = TwoPhaseDirectory::des(&dirbuf)?; if !dir.xids.insert(xid) { - anyhow::bail!("twophase file for xid {} already exists", xid); + Err(WalIngestErrorKind::FileAlreadyExists(xid.into()))?; } self.pending_directory_entries.push(( DirectoryKind::TwoPhase, @@ -1909,22 +1896,22 @@ impl DatadirModification<'_> { &mut self, origin_id: RepOriginId, origin_lsn: Lsn, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let key = repl_origin_key(origin_id); self.put(key, Value::Image(origin_lsn.ser().unwrap().into())); Ok(()) } - pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> anyhow::Result<()> { + pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> Result<(), WalIngestError> { self.set_replorigin(origin_id, Lsn::INVALID).await } - pub fn put_control_file(&mut self, img: Bytes) -> anyhow::Result<()> { + pub fn put_control_file(&mut self, img: Bytes) -> Result<(), WalIngestError> { self.put(CONTROLFILE_KEY, Value::Image(img)); Ok(()) } - pub fn put_checkpoint(&mut self, img: Bytes) -> anyhow::Result<()> { + pub fn put_checkpoint(&mut self, img: Bytes) -> Result<(), WalIngestError> { self.put(CHECKPOINT_KEY, Value::Image(img)); Ok(()) } @@ -1934,7 +1921,7 @@ impl DatadirModification<'_> { spcnode: Oid, dbnode: Oid, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let total_blocks = self .tline .get_db_size(spcnode, dbnode, Version::Modified(self), ctx) @@ -1973,20 +1960,21 @@ impl DatadirModification<'_> { rel: RelTag, nblocks: BlockNumber, ctx: &RequestContext, - ) -> Result<(), RelationError> { + ) -> Result<(), WalIngestError> { if rel.relnode == 0 { - return Err(RelationError::InvalidRelnode); + Err(WalIngestErrorKind::LogicalError(anyhow::anyhow!( + "invalid relnode" + )))?; } // It's possible that this is the first rel for this db in this // tablespace. Create the reldir entry for it if so. - let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?) - .context("deserialize db")?; + let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?; let dbdir_exists = if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) { // Didn't exist. Update dbdir e.insert(false); - let buf = DbDirectory::ser(&dbdir).context("serialize db")?; + let buf = DbDirectory::ser(&dbdir)?; self.pending_directory_entries.push(( DirectoryKind::Db, MetricsUpdate::Set(dbdir.dbdirs.len() as u64), @@ -2003,27 +1991,25 @@ impl DatadirModification<'_> { RelDirectory::default() } else { // reldir already exists, fetch it - RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?) - .context("deserialize db")? + RelDirectory::des(&self.get(rel_dir_key, ctx).await?)? }; - let v2_enabled = self.maybe_enable_rel_size_v2()?; + let v2_enabled = self + .maybe_enable_rel_size_v2() + .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?; if v2_enabled { if rel_dir.rels.contains(&(rel.relnode, rel.forknum)) { - return Err(RelationError::AlreadyExists); + Err(WalIngestErrorKind::RelationAlreadyExists(rel))?; } let sparse_rel_dir_key = rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum); // check if the rel_dir_key exists in v2 - let val = self - .sparse_get(sparse_rel_dir_key, ctx) - .await - .map_err(|e| RelationError::Other(e.into()))?; + let val = self.sparse_get(sparse_rel_dir_key, ctx).await?; let val = RelDirExists::decode_option(val) - .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?; + .map_err(|_| WalIngestErrorKind::InvalidRelDirKey(sparse_rel_dir_key))?; if val == RelDirExists::Exists { - return Err(RelationError::AlreadyExists); + Err(WalIngestErrorKind::RelationAlreadyExists(rel))?; } self.put( sparse_rel_dir_key, @@ -2039,9 +2025,7 @@ impl DatadirModification<'_> { // will be key not found errors if we don't create an empty one for rel_size_v2. self.put( rel_dir_key, - Value::Image(Bytes::from( - RelDirectory::ser(&RelDirectory::default()).context("serialize")?, - )), + Value::Image(Bytes::from(RelDirectory::ser(&RelDirectory::default())?)), ); } self.pending_directory_entries @@ -2049,7 +2033,7 @@ impl DatadirModification<'_> { } else { // Add the new relation to the rel directory entry, and write it back if !rel_dir.rels.insert((rel.relnode, rel.forknum)) { - return Err(RelationError::AlreadyExists); + Err(WalIngestErrorKind::RelationAlreadyExists(rel))?; } if !dbdir_exists { self.pending_directory_entries @@ -2059,9 +2043,7 @@ impl DatadirModification<'_> { .push((DirectoryKind::Rel, MetricsUpdate::Add(1))); self.put( rel_dir_key, - Value::Image(Bytes::from( - RelDirectory::ser(&rel_dir).context("serialize")?, - )), + Value::Image(Bytes::from(RelDirectory::ser(&rel_dir)?)), ); } @@ -2086,8 +2068,8 @@ impl DatadirModification<'_> { rel: RelTag, nblocks: BlockNumber, ctx: &RequestContext, - ) -> anyhow::Result<()> { - anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); + ) -> Result<(), WalIngestError> { + ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode); if self .tline .get_rel_exists(rel, Version::Modified(self), ctx) @@ -2117,8 +2099,8 @@ impl DatadirModification<'_> { rel: RelTag, nblocks: BlockNumber, ctx: &RequestContext, - ) -> anyhow::Result<()> { - anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); + ) -> Result<(), WalIngestError> { + ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode); // Put size let size_key = rel_size_to_key(rel); @@ -2142,8 +2124,10 @@ impl DatadirModification<'_> { &mut self, drop_relations: HashMap<(u32, u32), Vec>, ctx: &RequestContext, - ) -> anyhow::Result<()> { - let v2_enabled = self.maybe_enable_rel_size_v2()?; + ) -> Result<(), WalIngestError> { + let v2_enabled = self + .maybe_enable_rel_size_v2() + .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?; for ((spc_node, db_node), rel_tags) in drop_relations { let dir_key = rel_dir_to_key(spc_node, db_node); let buf = self.get(dir_key, ctx).await?; @@ -2163,7 +2147,7 @@ impl DatadirModification<'_> { let key = rel_tag_sparse_key(spc_node, db_node, rel_tag.relnode, rel_tag.forknum); let val = RelDirExists::decode_option(self.sparse_get(key, ctx).await?) - .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?; + .map_err(|_| WalIngestErrorKind::InvalidKey(key, self.lsn))?; if val == RelDirExists::Exists { self.pending_directory_entries .push((DirectoryKind::RelV2, MetricsUpdate::Sub(1))); @@ -2206,7 +2190,7 @@ impl DatadirModification<'_> { segno: u32, nblocks: BlockNumber, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { assert!(self.tline.tenant_shard_id.is_shard_zero()); // Add it to the directory entry @@ -2215,7 +2199,7 @@ impl DatadirModification<'_> { let mut dir = SlruSegmentDirectory::des(&buf)?; if !dir.segments.insert(segno) { - anyhow::bail!("slru segment {kind:?}/{segno} already exists"); + Err(WalIngestErrorKind::SlruAlreadyExists(kind, segno))?; } self.pending_directory_entries.push(( DirectoryKind::SlruSegment(kind), @@ -2242,7 +2226,7 @@ impl DatadirModification<'_> { kind: SlruKind, segno: u32, nblocks: BlockNumber, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { assert!(self.tline.tenant_shard_id.is_shard_zero()); // Put size @@ -2258,7 +2242,7 @@ impl DatadirModification<'_> { kind: SlruKind, segno: u32, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { // Remove it from the directory entry let dir_key = slru_dir_to_key(kind); let buf = self.get(dir_key, ctx).await?; @@ -2283,7 +2267,7 @@ impl DatadirModification<'_> { } /// Drop a relmapper file (pg_filenode.map) - pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> anyhow::Result<()> { + pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> Result<(), WalIngestError> { // TODO Ok(()) } @@ -2293,7 +2277,7 @@ impl DatadirModification<'_> { &mut self, xid: u64, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { // Remove it from the directory entry let buf = self.get(TWOPHASEDIR_KEY, ctx).await?; let newdirbuf = if self.tline.pg_version >= 17 { @@ -2308,7 +2292,8 @@ impl DatadirModification<'_> { )); Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?) } else { - let xid: u32 = u32::try_from(xid)?; + let xid: u32 = u32::try_from(xid) + .map_err(|e| WalIngestErrorKind::LogicalError(anyhow::Error::from(e)))?; let mut dir = TwoPhaseDirectory::des(&buf)?; if !dir.xids.remove(&xid) { @@ -2333,7 +2318,7 @@ impl DatadirModification<'_> { path: &str, content: &[u8], ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let key = aux_file::encode_aux_file_key(path); // retrieve the key from the engine let old_val = match self.get(key, ctx).await { @@ -2342,7 +2327,7 @@ impl DatadirModification<'_> { Err(e) => return Err(e.into()), }; let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val { - aux_file::decode_file_value(old_val)? + aux_file::decode_file_value(old_val).map_err(WalIngestErrorKind::EncodeAuxFileError)? } else { Vec::new() }; @@ -2387,7 +2372,8 @@ impl DatadirModification<'_> { } (None, true) => warn!("removing non-existing aux file: {}", path), } - let new_val = aux_file::encode_file_value(&new_files)?; + let new_val = aux_file::encode_file_value(&new_files) + .map_err(WalIngestErrorKind::EncodeAuxFileError)?; self.put(key, Value::Image(new_val.into())); Ok(()) diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index df2663f6bb..3c3608d1bd 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -580,6 +580,7 @@ impl ConnectionManagerState { ); Ok(()) } + WalReceiverError::Cancelled => Ok(()), WalReceiverError::Other(e) => { // give out an error to have task_mgr give it a really verbose logging if cancellation.is_cancelled() { diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 6bf05a0f86..52259f205b 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -73,6 +73,7 @@ pub(super) enum WalReceiverError { /// Generic error Other(anyhow::Error), ClosedGate, + Cancelled, } impl From for WalReceiverError { @@ -200,6 +201,9 @@ pub(super) async fn handle_walreceiver_connection( // with a similar error. }, WalReceiverError::SuccessfulCompletion(_) => {} + WalReceiverError::Cancelled => { + debug!("Connection cancelled") + } WalReceiverError::ClosedGate => { // doesn't happen at runtime } @@ -273,7 +277,12 @@ pub(super) async fn handle_walreceiver_connection( let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version); - let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx).await?; + let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx) + .await + .map_err(|e| match e.kind { + crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled, + _ => WalReceiverError::Other(e.into()), + })?; let shard = vec![*timeline.get_shard_identity()]; diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 18df065f76..e60c590f87 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -21,13 +21,13 @@ //! redo Postgres process, but some records it can handle directly with //! bespoken Rust code. +use std::backtrace::Backtrace; use std::collections::HashMap; use std::sync::{Arc, OnceLock}; use std::time::{Duration, Instant, SystemTime}; -use anyhow::{Result, bail}; use bytes::{Buf, Bytes}; -use pageserver_api::key::rel_block_to_key; +use pageserver_api::key::{Key, rel_block_to_key}; use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use pageserver_api::shard::ShardIdentity; @@ -38,7 +38,7 @@ use postgres_ffi::{ fsm_logical_to_physical, pg_constants, }; use tracing::*; -use utils::bin_ser::SerializeError; +use utils::bin_ser::{DeserializeError, SerializeError}; use utils::lsn::Lsn; use utils::rate_limit::RateLimit; use utils::{critical, failpoint_support}; @@ -104,12 +104,101 @@ struct WarnIngestLag { timestamp_invalid_msg_ratelimit: RateLimit, } +pub struct WalIngestError { + pub backtrace: std::backtrace::Backtrace, + pub kind: WalIngestErrorKind, +} + +#[derive(thiserror::Error, Debug)] +pub enum WalIngestErrorKind { + #[error(transparent)] + #[allow(private_interfaces)] + PageReconstructError(#[from] PageReconstructError), + #[error(transparent)] + DeserializationFailure(#[from] DeserializeError), + #[error(transparent)] + SerializationFailure(#[from] SerializeError), + #[error("the request contains data not supported by pageserver: {0} @ {1}")] + InvalidKey(Key, Lsn), + #[error("twophase file for xid {0} already exists")] + FileAlreadyExists(u64), + #[error("slru segment {0:?}/{1} already exists")] + SlruAlreadyExists(SlruKind, u32), + #[error("relation already exists")] + RelationAlreadyExists(RelTag), + #[error("invalid reldir key {0}")] + InvalidRelDirKey(Key), + + #[error(transparent)] + LogicalError(anyhow::Error), + #[error(transparent)] + EncodeAuxFileError(anyhow::Error), + #[error(transparent)] + MaybeRelSizeV2Error(anyhow::Error), + + #[error("timeline shutting down")] + Cancelled, +} + +impl From for WalIngestError +where + WalIngestErrorKind: From, +{ + fn from(value: T) -> Self { + WalIngestError { + backtrace: Backtrace::capture(), + kind: WalIngestErrorKind::from(value), + } + } +} + +impl std::error::Error for WalIngestError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + self.kind.source() + } +} + +impl core::fmt::Display for WalIngestError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + self.kind.fmt(f) + } +} + +impl core::fmt::Debug for WalIngestError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + if f.alternate() { + f.debug_map() + .key(&"backtrace") + .value(&self.backtrace) + .key(&"kind") + .value(&self.kind) + .finish() + } else { + writeln!(f, "Error: {:?}", self.kind)?; + if self.backtrace.status() == std::backtrace::BacktraceStatus::Captured { + writeln!(f, "Stack backtrace: {:?}", self.backtrace)?; + } + Ok(()) + } + } +} + +#[macro_export] +macro_rules! ensure_walingest { + ($($t:tt)*) => { + _ = || -> Result<(), anyhow::Error> { + anyhow::ensure!($($t)*); + Ok(()) + }().map_err(WalIngestErrorKind::LogicalError)?; + }; +} + impl WalIngest { pub async fn new( timeline: &Timeline, startpoint: Lsn, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { // Fetch the latest checkpoint into memory, so that we can compare with it // quickly in `ingest_record` and update it when it changes. let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?; @@ -145,7 +234,7 @@ impl WalIngest { interpreted: InterpretedWalRecord, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { WAL_INGEST.records_received.inc(); let prev_len = modification.len(); @@ -288,7 +377,7 @@ impl WalIngest { } /// This is the same as AdjustToFullTransactionId(xid) in PostgreSQL - fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result { + fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result { let next_full_xid = enum_pgversion_dispatch!(&self.checkpoint, CheckPoint, cp, { cp.nextXid.value }); @@ -298,9 +387,9 @@ impl WalIngest { if xid > next_xid { // Wraparound occurred, must be from a prev epoch. if epoch == 0 { - bail!( + Err(WalIngestErrorKind::LogicalError(anyhow::anyhow!( "apparent XID wraparound with prepared transaction XID {xid}, nextXid is {next_full_xid}" - ); + )))?; } epoch -= 1; } @@ -313,7 +402,7 @@ impl WalIngest { clear_vm_bits: ClearVmBits, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let ClearVmBits { new_heap_blkno, old_heap_blkno, @@ -402,7 +491,7 @@ impl WalIngest { create: DbaseCreate, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let DbaseCreate { db_id, tablespace_id, @@ -505,7 +594,7 @@ impl WalIngest { dbase_drop: DbaseDrop, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let DbaseDrop { db_id, tablespace_ids, @@ -523,7 +612,7 @@ impl WalIngest { create: SmgrCreate, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let SmgrCreate { rel } = create; self.put_rel_creation(modification, rel, ctx).await?; Ok(()) @@ -537,7 +626,7 @@ impl WalIngest { truncate: XlSmgrTruncate, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let XlSmgrTruncate { blkno, rnode, @@ -689,7 +778,7 @@ impl WalIngest { record: XactRecord, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let (xact_common, is_commit, is_prepared) = match record { XactRecord::Prepare(XactPrepare { xl_xid, data }) => { let xid: u64 = if modification.tline.pg_version >= 17 { @@ -813,7 +902,7 @@ impl WalIngest { truncate: ClogTruncate, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let ClogTruncate { pageno, oldest_xid, @@ -889,7 +978,7 @@ impl WalIngest { zero_page: ClogZeroPage, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let ClogZeroPage { segno, rpageno } = zero_page; self.put_slru_page_image( @@ -907,7 +996,7 @@ impl WalIngest { &mut self, modification: &mut DatadirModification, xlrec: &XlMultiXactCreate, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { // Create WAL record for updating the multixact-offsets page let pageno = xlrec.mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32; let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; @@ -1010,7 +1099,7 @@ impl WalIngest { modification: &mut DatadirModification<'_>, xlrec: &XlMultiXactTruncate, ctx: &RequestContext, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { let (maxsegment, startsegment, endsegment) = enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, { cp.oldestMulti = xlrec.end_trunc_off; @@ -1058,7 +1147,7 @@ impl WalIngest { zero_page: MultiXactZeroPage, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { let MultiXactZeroPage { slru_kind, segno, @@ -1080,7 +1169,7 @@ impl WalIngest { update: RelmapUpdate, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { let RelmapUpdate { update, buf } = update; modification @@ -1093,7 +1182,7 @@ impl WalIngest { raw_record: RawXlogRecord, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { let RawXlogRecord { info, lsn, mut buf } = raw_record; let pg_version = modification.tline.pg_version; @@ -1235,12 +1324,12 @@ impl WalIngest { put: PutLogicalMessage, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { let PutLogicalMessage { path, buf } = put; modification.put_file(path.as_str(), &buf, ctx).await } - fn ingest_standby_record(&mut self, record: StandbyRecord) -> Result<()> { + fn ingest_standby_record(&mut self, record: StandbyRecord) -> Result<(), WalIngestError> { match record { StandbyRecord::RunningXacts(running_xacts) => { enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, { @@ -1258,7 +1347,7 @@ impl WalIngest { &mut self, record: ReploriginRecord, modification: &mut DatadirModification<'_>, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { match record { ReploriginRecord::Set(set) => { modification @@ -1278,7 +1367,7 @@ impl WalIngest { modification: &mut DatadirModification<'_>, rel: RelTag, ctx: &RequestContext, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { modification.put_rel_creation(rel, 0, ctx).await?; Ok(()) } @@ -1291,7 +1380,7 @@ impl WalIngest { blknum: BlockNumber, img: Bytes, ctx: &RequestContext, - ) -> Result<(), PageReconstructError> { + ) -> Result<(), WalIngestError> { self.handle_rel_extend(modification, rel, blknum, ctx) .await?; modification.put_rel_page_image(rel, blknum, img)?; @@ -1305,7 +1394,7 @@ impl WalIngest { blknum: BlockNumber, rec: NeonWalRecord, ctx: &RequestContext, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { self.handle_rel_extend(modification, rel, blknum, ctx) .await?; modification.put_rel_wal_record(rel, blknum, rec)?; @@ -1318,7 +1407,7 @@ impl WalIngest { rel: RelTag, nblocks: BlockNumber, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { modification.put_rel_truncation(rel, nblocks, ctx).await?; Ok(()) } @@ -1329,7 +1418,7 @@ impl WalIngest { rel: RelTag, blknum: BlockNumber, ctx: &RequestContext, - ) -> Result<(), PageReconstructError> { + ) -> Result<(), WalIngestError> { let new_nblocks = blknum + 1; // Check if the relation exists. We implicitly create relations on first // record. @@ -1423,7 +1512,7 @@ impl WalIngest { blknum: BlockNumber, img: Bytes, ctx: &RequestContext, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { if !self.shard.is_shard_zero() { return Ok(()); } @@ -1441,7 +1530,7 @@ impl WalIngest { segno: u32, blknum: BlockNumber, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { // we don't use a cache for this like we do for relations. SLRUS are explcitly // extended with ZEROPAGE records, not with commit records, so it happens // a lot less frequently. @@ -1509,6 +1598,7 @@ async fn get_relsize( #[allow(clippy::bool_assert_comparison)] #[cfg(test)] mod tests { + use anyhow::Result; use postgres_ffi::RELSEG_SIZE; use super::*; @@ -1530,7 +1620,7 @@ mod tests { } #[tokio::test] - async fn test_zeroed_checkpoint_decodes_correctly() -> Result<()> { + async fn test_zeroed_checkpoint_decodes_correctly() -> Result<(), anyhow::Error> { for i in 14..=16 { dispatch_pgversion!(i, { pgv::CheckPoint::decode(&pgv::ZERO_CHECKPOINT)?; From c66444ea1538349d13ab5e87bca880394434004b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 11 Apr 2025 16:10:27 +0200 Subject: [PATCH 17/17] Add timeline_import http endpoint (#11484) The added `timleine_import` endpoint allows us to migrate safekeeper timelines from control plane managed to storcon managed. Part of #9011 --- libs/pageserver_api/src/controller_api.rs | 12 +++++- storage_controller/src/http.rs | 42 +++++++++++++++++++ .../src/service/safekeeper_service.rs | 30 ++++++++++++- 3 files changed, 82 insertions(+), 2 deletions(-) diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 3cb62f9d18..91f9c03ba4 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -7,7 +7,8 @@ use std::time::{Duration, Instant}; /// API (`/control/v1` prefix). Implemented by the server /// in [`storage_controller::http`] use serde::{Deserialize, Serialize}; -use utils::id::{NodeId, TenantId}; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::lsn::Lsn; use crate::models::{PageserverUtilization, ShardParameters, TenantConfig}; use crate::shard::{ShardStripeSize, TenantShardId}; @@ -499,6 +500,15 @@ pub struct SafekeeperSchedulingPolicyRequest { pub scheduling_policy: SkSchedulingPolicy, } +/// Import request for safekeeper timelines. +#[derive(Serialize, Deserialize, Clone)] +pub struct TimelineImportRequest { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub start_lsn: Lsn, + pub sk_set: Vec, +} + #[cfg(test)] mod test { use serde_json; diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 4f3613b687..fb4530d0d2 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -22,6 +22,7 @@ use pageserver_api::controller_api::{ MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse, NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, SafekeeperSchedulingPolicyRequest, ShardsPreferredAzsRequest, TenantCreateRequest, TenantPolicyRequest, TenantShardMigrateRequest, + TimelineImportRequest, }; use pageserver_api::models::{ DetachBehavior, LsnLeaseRequest, TenantConfigPatchRequest, TenantConfigRequest, @@ -1286,6 +1287,37 @@ async fn handle_tenant_import(req: Request) -> Result, ApiE ) } +async fn handle_timeline_import(req: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; + + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let import_req = json_request::(&mut req).await?; + + let state = get_state(&req); + + if import_req.tenant_id != tenant_id || import_req.timeline_id != timeline_id { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "tenant id or timeline id mismatch: url={tenant_id}/{timeline_id}, body={}/{}", + import_req.tenant_id, + import_req.timeline_id + ))); + } + + json_response( + StatusCode::OK, + state.service.timeline_import(import_req).await?, + ) +} + async fn handle_tenants_dump(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -1959,6 +1991,16 @@ pub fn make_router( RequestName("debug_v1_tenant_locate"), ) }) + .post( + "/debug/v1/tenant/:tenant_id/timeline/:timeline_id/import", + |r| { + named_request_span( + r, + handle_timeline_import, + RequestName("debug_v1_timeline_import"), + ) + }, + ) .get("/debug/v1/scheduler", |r| { named_request_span(r, handle_scheduler_dump, RequestName("debug_v1_scheduler")) }) diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index 099d0305ba..a23b9a4a02 100644 --- a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -12,13 +12,16 @@ use crate::persistence::{ use crate::safekeeper::Safekeeper; use anyhow::Context; use http_utils::error::ApiError; -use pageserver_api::controller_api::{SafekeeperDescribeResponse, SkSchedulingPolicy}; +use pageserver_api::controller_api::{ + SafekeeperDescribeResponse, SkSchedulingPolicy, TimelineImportRequest, +}; use pageserver_api::models::{self, SafekeeperInfo, SafekeepersInfo, TimelineInfo}; use safekeeper_api::membership::{MemberSet, SafekeeperId}; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use utils::id::{NodeId, TenantId, TimelineId}; use utils::logging::SecretString; +use utils::lsn::Lsn; use super::Service; @@ -298,6 +301,31 @@ impl Service { timeline_id, }) } + + /// Directly insert the timeline into the database without reconciling it with safekeepers. + /// + /// Useful if the timeline already exists on the specified safekeepers, + /// but we want to make it storage controller managed. + pub(crate) async fn timeline_import(&self, req: TimelineImportRequest) -> Result<(), ApiError> { + let persistence = TimelinePersistence { + tenant_id: req.tenant_id.to_string(), + timeline_id: req.timeline_id.to_string(), + start_lsn: Lsn::INVALID.into(), + generation: 1, + sk_set: req.sk_set.iter().map(|sk_id| sk_id.0 as i64).collect(), + new_sk_set: None, + cplane_notified_generation: 1, + deleted_at: None, + }; + let inserted = self.persistence.insert_timeline(persistence).await?; + if inserted { + tracing::info!("imported timeline into db"); + } else { + tracing::info!("didn't import timeline into db, as it is already present in db"); + } + Ok(()) + } + /// Perform timeline deletion on safekeepers. Will return success: we persist the deletion into the reconciler. pub(super) async fn tenant_timeline_delete_safekeepers( self: &Arc,