From 47553dbaf946de9cdab759c220ec56dffa5f82cd Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Sat, 28 Jun 2025 16:59:29 +0400 Subject: [PATCH 01/19] neon_local: set timeline_safekeeper_count if we have less than 3 safekeepers (#12378) ## Problem - Closes: https://github.com/neondatabase/neon/issues/12298 ## Summary of changes - Set `timeline_safekeeper_count` in `neon_local` if we have less than 3 safekeepers - Remove `cfg!(feature = "testing")` code from `safekeepers_for_new_timeline` - Change `timeline_safekeeper_count` type to `usize` --- control_plane/src/local_env.rs | 2 +- control_plane/src/storage_controller.rs | 8 +++++++- storage_controller/src/main.rs | 4 ++-- storage_controller/src/service.rs | 2 +- storage_controller/src/service/safekeeper_service.rs | 9 +-------- 5 files changed, 12 insertions(+), 13 deletions(-) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 16cd2d8c08..d0611113e8 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -212,7 +212,7 @@ pub struct NeonStorageControllerConf { pub use_local_compute_notifications: bool, - pub timeline_safekeeper_count: Option, + pub timeline_safekeeper_count: Option, pub posthog_config: Option, diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index dea7ae2ccf..bb83a6319c 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -638,7 +638,13 @@ impl StorageController { args.push("--timelines-onto-safekeepers".to_string()); } - if let Some(sk_cnt) = self.config.timeline_safekeeper_count { + // neon_local is used in test environments where we often have less than 3 safekeepers. + if self.config.timeline_safekeeper_count.is_some() || self.env.safekeepers.len() < 3 { + let sk_cnt = self + .config + .timeline_safekeeper_count + .unwrap_or(self.env.safekeepers.len()); + args.push(format!("--timeline-safekeeper-count={sk_cnt}")); } diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index d1c2858d6f..752262b65e 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -216,8 +216,8 @@ struct Cli { /// Number of safekeepers to choose for a timeline when creating it. /// Safekeepers will be choosen from different availability zones. /// This option exists primarily for testing purposes. - #[arg(long, default_value = "3", value_parser = clap::value_parser!(i64).range(1..))] - timeline_safekeeper_count: i64, + #[arg(long, default_value = "3", value_parser = clap::builder::RangedU64ValueParser::::new().range(1..))] + timeline_safekeeper_count: usize, /// When set, actively checks and initiates heatmap downloads/uploads during reconciliation. /// This speed up migrations by avoiding the default wait for the heatmap download interval. diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index b4dfd01249..19bb0f8671 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -471,7 +471,7 @@ pub struct Config { /// Number of safekeepers to choose for a timeline when creating it. /// Safekeepers will be choosen from different availability zones. 
- pub timeline_safekeeper_count: i64, + pub timeline_safekeeper_count: usize, /// PostHog integration config pub posthog_config: Option, diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index fec81fb661..92d15f3fca 100644 --- a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -1,4 +1,3 @@ -use std::cmp::max; use std::collections::HashSet; use std::str::FromStr; use std::sync::Arc; @@ -654,13 +653,7 @@ impl Service { ) }); // Number of safekeepers in different AZs we are looking for - let mut wanted_count = self.config.timeline_safekeeper_count as usize; - // TODO(diko): remove this when `timeline_safekeeper_count` option is in the release - // branch and is specified in tests/neon_local config. - if cfg!(feature = "testing") && all_safekeepers.len() < wanted_count { - // In testing mode, we can have less safekeepers than the config says - wanted_count = max(all_safekeepers.len(), 1); - } + let wanted_count = self.config.timeline_safekeeper_count; let mut sks = Vec::new(); let mut azs = HashSet::new(); From 9bb4688c541fd66b3a342ac4f1cd0784bd807fbd Mon Sep 17 00:00:00 2001 From: Aleksandr Sarantsev <99037063+ephemeralsad@users.noreply.github.com> Date: Mon, 30 Jun 2025 09:41:05 +0400 Subject: [PATCH 02/19] storcon: Remove testing feature from kick_secondary_downloads (#12383) ## Problem Some of the design decisions in PR #12256 were influenced by the requirements of consistency tests. These decisions introduced intermediate logic that is no longer needed and should be cleaned up. ## Summary of Changes - Remove the `feature("testing")` flag related to `kick_secondary_download`. - Set the default value of `kick_secondary_download` back to false, reflecting the intended production behavior. Co-authored-by: Aleksandr Sarantsev --- storage_controller/src/main.rs | 8 ++------ storage_controller/src/service.rs | 4 +--- test_runner/fixtures/neon_fixtures.py | 2 +- 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 752262b65e..2a851dc25b 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -6,9 +6,7 @@ use std::time::Duration; use anyhow::{Context, anyhow}; use camino::Utf8PathBuf; -#[cfg(feature = "testing")] -use clap::ArgAction; -use clap::Parser; +use clap::{ArgAction, Parser}; use futures::future::OptionFuture; use http_utils::tls_certs::ReloadingCertificateResolver; use hyper0::Uri; @@ -222,8 +220,7 @@ struct Cli { /// When set, actively checks and initiates heatmap downloads/uploads during reconciliation. /// This speed up migrations by avoiding the default wait for the heatmap download interval. /// Primarily useful for testing to reduce test execution time. 
- #[cfg(feature = "testing")] - #[arg(long, default_value = "true", action=ArgAction::Set)] + #[arg(long, default_value = "false", action=ArgAction::Set)] kick_secondary_downloads: bool, } @@ -472,7 +469,6 @@ async fn async_main() -> anyhow::Result<()> { use_local_compute_notifications: args.use_local_compute_notifications, timeline_safekeeper_count: args.timeline_safekeeper_count, posthog_config: posthog_config.clone(), - #[cfg(feature = "testing")] kick_secondary_downloads: args.kick_secondary_downloads, }; diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 19bb0f8671..75ce7bc37b 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -476,7 +476,7 @@ pub struct Config { /// PostHog integration config pub posthog_config: Option, - #[cfg(feature = "testing")] + /// When set, actively checks and initiates heatmap downloads/uploads. pub kick_secondary_downloads: bool, } @@ -8364,7 +8364,6 @@ impl Service { "Skipping migration of {tenant_shard_id} to {node} because secondary isn't ready: {progress:?}" ); - #[cfg(feature = "testing")] if progress.heatmap_mtime.is_none() { // No heatmap might mean the attached location has never uploaded one, or that // the secondary download hasn't happened yet. This is relatively unusual in the field, @@ -8389,7 +8388,6 @@ impl Service { /// happens on multi-minute timescales in the field, which is fine because optimisation is meant /// to be a lazy background thing. However, when testing, it is not practical to wait around, so /// we have this helper to move things along faster. - #[cfg(feature = "testing")] async fn kick_secondary_download(&self, tenant_shard_id: TenantShardId) { if !self.config.kick_secondary_downloads { // No-op if kick_secondary_downloads functionaliuty is not configured diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 4eb85119ca..48c6597c7c 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -453,7 +453,7 @@ class NeonEnvBuilder: pageserver_get_vectored_concurrent_io: str | None = None, pageserver_tracing_config: PageserverTracingConfig | None = None, pageserver_import_config: PageserverImportConfig | None = None, - storcon_kick_secondary_downloads: bool | None = None, + storcon_kick_secondary_downloads: bool | None = True, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override From c746678bbc215937e5b3dbead0697386e0815780 Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Mon, 30 Jun 2025 12:30:05 +0400 Subject: [PATCH 03/19] storcon: implement safekeeper_migrate handler (#11849) This PR implements a safekeeper migration algorithm from RFC-035 https://github.com/neondatabase/neon/blob/main/docs/rfcs/035-safekeeper-dynamic-membership-change.md#change-algorithm - Closes: https://github.com/neondatabase/neon/issues/11823 It is not production-ready yet, but I think it's good enough to commit and start testing. 
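For reference, the new endpoint can be exercised directly over HTTP, the same way the `migrate_safekeepers` helper added to `neon_fixtures.py` in this PR does. A minimal sketch only — the controller URL and auth token below are placeholders and not part of this change:

```python
# Hypothetical invocation of the new safekeeper migration endpoint.
# STORCON_API and TOKEN are placeholders; the token must be accepted for
# Scope::PageServerApi, which the handler checks before starting the migration.
import requests

STORCON_API = "http://127.0.0.1:1234"  # storage controller base URL (assumed)
TOKEN = "..."  # JWT with the required scope (assumed)


def migrate_safekeepers(tenant_id: str, timeline_id: str, new_sk_set: list[int]) -> None:
    # Request body mirrors TimelineSafekeeperMigrateRequest { new_sk_set }.
    resp = requests.post(
        f"{STORCON_API}/v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate",
        json={"new_sk_set": new_sk_set},
        headers={"Authorization": f"Bearer {TOKEN}"},
    )
    resp.raise_for_status()


# e.g. migrate_safekeepers(tenant, timeline, [1]) migrates the timeline onto safekeeper 1.
```
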
There are some known issues which will be addressed in later PRs: - https://github.com/neondatabase/neon/issues/12186 - https://github.com/neondatabase/neon/issues/12187 - https://github.com/neondatabase/neon/issues/12188 - https://github.com/neondatabase/neon/issues/12189 - https://github.com/neondatabase/neon/issues/12190 - https://github.com/neondatabase/neon/issues/12191 - https://github.com/neondatabase/neon/issues/12192 ## Summary of changes - Implement `tenant_timeline_safekeeper_migrate` handler to drive the migration - Add possibility to specify number of safekeepers per timeline in tests (`timeline_safekeeper_count`) - Add `term` and `flush_lsn` to `TimelineMembershipSwitchResponse` - Implement compare-and-swap (CAS) operation over timeline in DB for updating membership configuration safely. - Write simple test to verify that migration code works --- libs/pageserver_api/src/controller_api.rs | 5 + libs/safekeeper_api/src/models.rs | 4 +- safekeeper/src/state.rs | 14 +- safekeeper/src/timeline.rs | 9 +- safekeeper/src/wal_storage.rs | 6 +- storage_controller/src/http.rs | 39 +- storage_controller/src/metrics.rs | 2 + storage_controller/src/persistence.rs | 54 ++ storage_controller/src/safekeeper.rs | 8 + storage_controller/src/safekeeper_client.rs | 4 + storage_controller/src/service.rs | 2 + .../src/service/safekeeper_reconciler.rs | 2 +- .../src/service/safekeeper_service.rs | 646 ++++++++++++++++-- test_runner/fixtures/neon_fixtures.py | 22 + .../regress/test_safekeeper_migration.py | 64 ++ 15 files changed, 802 insertions(+), 79 deletions(-) create mode 100644 test_runner/regress/test_safekeeper_migration.py diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index ff18d40bfe..a8080a57e9 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -546,6 +546,11 @@ pub struct TimelineImportRequest { pub sk_set: Vec, } +#[derive(serde::Serialize, serde::Deserialize, Clone)] +pub struct TimelineSafekeeperMigrateRequest { + pub new_sk_set: Vec, +} + #[cfg(test)] mod test { use serde_json; diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 5c1ee41f7b..1774489c1c 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -210,7 +210,7 @@ pub struct TimelineStatus { } /// Request to switch membership configuration. 
-#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] #[serde(transparent)] pub struct TimelineMembershipSwitchRequest { pub mconf: Configuration, @@ -221,6 +221,8 @@ pub struct TimelineMembershipSwitchRequest { pub struct TimelineMembershipSwitchResponse { pub previous_conf: Configuration, pub current_conf: Configuration, + pub term: Term, + pub flush_lsn: Lsn, } #[derive(Clone, Copy, Serialize, Deserialize)] diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index b6cf73be2e..32624d260d 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -9,7 +9,7 @@ use anyhow::{Result, bail}; use postgres_ffi::WAL_SEGMENT_SIZE; use postgres_versioninfo::{PgMajorVersion, PgVersionId}; use safekeeper_api::membership::Configuration; -use safekeeper_api::models::{TimelineMembershipSwitchResponse, TimelineTermBumpResponse}; +use safekeeper_api::models::TimelineTermBumpResponse; use safekeeper_api::{INITIAL_TERM, ServerInfo, Term}; use serde::{Deserialize, Serialize}; use tracing::info; @@ -83,6 +83,11 @@ pub enum EvictionState { Offloaded(Lsn), } +pub struct MembershipSwitchResult { + pub previous_conf: Configuration, + pub current_conf: Configuration, +} + impl TimelinePersistentState { /// commit_lsn is the same as start_lsn in the normal creaiton; see /// `TimelineCreateRequest` comments.` @@ -261,10 +266,7 @@ where /// Switch into membership configuration `to` if it is higher than the /// current one. - pub async fn membership_switch( - &mut self, - to: Configuration, - ) -> Result { + pub async fn membership_switch(&mut self, to: Configuration) -> Result { let before = self.mconf.clone(); // Is switch allowed? if to.generation <= self.mconf.generation { @@ -278,7 +280,7 @@ where self.finish_change(&state).await?; info!("switched membership conf to {} from {}", to, before); } - Ok(TimelineMembershipSwitchResponse { + Ok(MembershipSwitchResult { previous_conf: before, current_conf: self.mconf.clone(), }) diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 2bee41537f..0a27876862 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -190,7 +190,14 @@ impl StateSK { &mut self, to: Configuration, ) -> Result { - self.state_mut().membership_switch(to).await + let result = self.state_mut().membership_switch(to).await?; + + Ok(TimelineMembershipSwitchResponse { + previous_conf: result.previous_conf, + current_conf: result.current_conf, + term: self.state().acceptor_state.term, + flush_lsn: self.flush_lsn(), + }) } /// Close open WAL files to release FDs. diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 33310706be..70e53d86ee 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -154,8 +154,8 @@ pub struct PhysicalStorage { /// record /// /// Partial segment 002 has no WAL records, and it will be removed by the - /// next truncate_wal(). This flag will be set to true after the first - /// truncate_wal() call. + /// next truncate_wal(). This flag will be set to false after the first + /// successful truncate_wal() call. /// /// [`write_lsn`]: Self::write_lsn pending_wal_truncation: bool, @@ -202,6 +202,8 @@ impl PhysicalStorage { ttid.timeline_id, flush_lsn, state.commit_lsn, state.peer_horizon_lsn, ); if flush_lsn < state.commit_lsn { + // note: can never happen. find_end_of_wal returns provided start_lsn + // (state.commit_lsn in our case) if it doesn't find anything. 
bail!( "timeline {} potential data loss: flush_lsn {} by find_end_of_wal is less than commit_lsn {} from control file", ttid.timeline_id, diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index a7e86b5224..66c44b5674 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -22,7 +22,7 @@ use pageserver_api::controller_api::{ MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse, NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, SafekeeperSchedulingPolicyRequest, ShardsPreferredAzsRequest, TenantCreateRequest, TenantPolicyRequest, TenantShardMigrateRequest, - TimelineImportRequest, + TimelineImportRequest, TimelineSafekeeperMigrateRequest, }; use pageserver_api::models::{ DetachBehavior, LsnLeaseRequest, TenantConfigPatchRequest, TenantConfigRequest, @@ -34,6 +34,7 @@ use pageserver_api::upcall_api::{ PutTimelineImportStatusRequest, ReAttachRequest, TimelineImportStatusRequest, ValidateRequest, }; use pageserver_client::{BlockUnblock, mgmt_api}; + use routerify::Middleware; use tokio_util::sync::CancellationToken; use tracing::warn; @@ -635,6 +636,32 @@ async fn handle_tenant_timeline_download_heatmap_layers( json_response(StatusCode::OK, ()) } +async fn handle_tenant_timeline_safekeeper_migrate( + service: Arc, + req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; + + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let migrate_req = json_request::(&mut req).await?; + + service + .tenant_timeline_safekeeper_migrate(tenant_id, timeline_id, migrate_req) + .await?; + + json_response(StatusCode::OK, ()) +} + async fn handle_tenant_timeline_lsn_lease( service: Arc, req: Request, @@ -2458,6 +2485,16 @@ pub fn make_router( ) }, ) + .post( + "/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate", + |r| { + tenant_service_handler( + r, + handle_tenant_timeline_safekeeper_migrate, + RequestName("v1_tenant_timeline_safekeeper_migrate"), + ) + }, + ) // LSN lease passthrough to all shards .post( "/v1/tenant/:tenant_id/timeline/:timeline_id/lsn_lease", diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index 07713c3fbc..f7f77cdd23 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -333,6 +333,7 @@ pub(crate) enum DatabaseErrorLabel { ConnectionPool, Logical, Migration, + Cas, } impl DatabaseError { @@ -343,6 +344,7 @@ impl DatabaseError { Self::ConnectionPool(_) => DatabaseErrorLabel::ConnectionPool, Self::Logical(_) => DatabaseErrorLabel::Logical, Self::Migration(_) => DatabaseErrorLabel::Migration, + Self::Cas(_) => DatabaseErrorLabel::Cas, } } } diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 2948e9019f..56f4d03111 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -29,6 +29,7 @@ use pageserver_api::shard::{ use rustls::client::WebPkiServerVerifier; use rustls::client::danger::{ServerCertVerified, ServerCertVerifier}; use rustls::crypto::ring; +use safekeeper_api::membership::SafekeeperGeneration; use scoped_futures::ScopedBoxFuture; use serde::{Deserialize, Serialize}; use 
utils::generation::Generation; @@ -94,6 +95,8 @@ pub(crate) enum DatabaseError { Logical(String), #[error("Migration error: {0}")] Migration(String), + #[error("CAS error: {0}")] + Cas(String), } #[derive(measured::FixedCardinalityLabel, Copy, Clone)] @@ -126,6 +129,7 @@ pub(crate) enum DatabaseOperation { UpdateLeader, SetPreferredAzs, InsertTimeline, + UpdateTimelineMembership, GetTimeline, InsertTimelineReconcile, RemoveTimelineReconcile, @@ -1410,6 +1414,56 @@ impl Persistence { .await } + /// Update timeline membership configuration in the database. + /// Perform a compare-and-swap (CAS) operation on the timeline's generation. + /// The `new_generation` must be the next (+1) generation after the one in the database. + pub(crate) async fn update_timeline_membership( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + new_generation: SafekeeperGeneration, + sk_set: &[NodeId], + new_sk_set: Option<&[NodeId]>, + ) -> DatabaseResult<()> { + use crate::schema::timelines::dsl; + + let prev_generation = new_generation.previous().unwrap(); + + let tenant_id = &tenant_id; + let timeline_id = &timeline_id; + self.with_measured_conn(DatabaseOperation::UpdateTimelineMembership, move |conn| { + Box::pin(async move { + let updated = diesel::update(dsl::timelines) + .filter(dsl::tenant_id.eq(&tenant_id.to_string())) + .filter(dsl::timeline_id.eq(&timeline_id.to_string())) + .filter(dsl::generation.eq(prev_generation.into_inner() as i32)) + .set(( + dsl::generation.eq(new_generation.into_inner() as i32), + dsl::sk_set.eq(sk_set.iter().map(|id| id.0 as i64).collect::>()), + dsl::new_sk_set.eq(new_sk_set + .map(|set| set.iter().map(|id| id.0 as i64).collect::>())), + )) + .execute(conn) + .await?; + + match updated { + 0 => { + // TODO(diko): It makes sense to select the current generation + // and include it in the error message for better debuggability. + Err(DatabaseError::Cas( + "Failed to update membership configuration".to_string(), + )) + } + 1 => Ok(()), + _ => Err(DatabaseError::Logical(format!( + "unexpected number of rows ({updated})" + ))), + } + }) + }) + .await + } + /// Load timeline from db. Returns `None` if not present. 
pub(crate) async fn get_timeline( &self, diff --git a/storage_controller/src/safekeeper.rs b/storage_controller/src/safekeeper.rs index 5a13ef750e..91154f4fa3 100644 --- a/storage_controller/src/safekeeper.rs +++ b/storage_controller/src/safekeeper.rs @@ -2,6 +2,7 @@ use std::time::Duration; use pageserver_api::controller_api::{SafekeeperDescribeResponse, SkSchedulingPolicy}; use reqwest::StatusCode; +use safekeeper_api::membership::SafekeeperId; use safekeeper_client::mgmt_api; use tokio_util::sync::CancellationToken; use utils::backoff; @@ -92,6 +93,13 @@ impl Safekeeper { pub(crate) fn has_https_port(&self) -> bool { self.listen_https_port.is_some() } + pub(crate) fn get_safekeeper_id(&self) -> SafekeeperId { + SafekeeperId { + id: self.id, + host: self.skp.host.clone(), + pg_port: self.skp.port as u16, + } + } /// Perform an operation (which is given a [`SafekeeperClient`]) with retries #[allow(clippy::too_many_arguments)] pub(crate) async fn with_client_retries( diff --git a/storage_controller/src/safekeeper_client.rs b/storage_controller/src/safekeeper_client.rs index bcf223c731..47a785e7d3 100644 --- a/storage_controller/src/safekeeper_client.rs +++ b/storage_controller/src/safekeeper_client.rs @@ -56,6 +56,10 @@ impl SafekeeperClient { } } + pub(crate) fn node_id_label(&self) -> &str { + &self.node_id_label + } + pub(crate) async fn create_timeline( &self, req: &TimelineCreateRequest, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 75ce7bc37b..bbf93fd751 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -161,6 +161,7 @@ enum TenantOperations { DropDetached, DownloadHeatmapLayers, TimelineLsnLease, + TimelineSafekeeperMigrate, } #[derive(Clone, strum_macros::Display)] @@ -491,6 +492,7 @@ impl From for ApiError { DatabaseError::Logical(reason) | DatabaseError::Migration(reason) => { ApiError::InternalServerError(anyhow::anyhow!(reason)) } + DatabaseError::Cas(reason) => ApiError::Conflict(reason), } } } diff --git a/storage_controller/src/service/safekeeper_reconciler.rs b/storage_controller/src/service/safekeeper_reconciler.rs index a3c5082be6..b67a679fad 100644 --- a/storage_controller/src/service/safekeeper_reconciler.rs +++ b/storage_controller/src/service/safekeeper_reconciler.rs @@ -145,7 +145,7 @@ pub(crate) async fn load_schedule_requests( } let Some(sk) = safekeepers.get(&other_node_id) else { tracing::warn!( - "couldnt find safekeeper with pending op id {other_node_id}, not pulling from it" + "couldn't find safekeeper with pending op id {other_node_id}, not pulling from it" ); return None; }; diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index 92d15f3fca..fc33a24198 100644 --- a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -5,20 +5,29 @@ use std::time::Duration; use super::safekeeper_reconciler::ScheduleRequest; use crate::heartbeater::SafekeeperState; +use crate::id_lock_map::trace_shared_lock; use crate::metrics; use crate::persistence::{ DatabaseError, SafekeeperTimelineOpKind, TimelinePendingOpPersistence, TimelinePersistence, }; use crate::safekeeper::Safekeeper; +use crate::safekeeper_client::SafekeeperClient; +use crate::service::TenantOperations; use crate::timeline_import::TimelineImportFinalizeError; use anyhow::Context; use http_utils::error::ApiError; use pageserver_api::controller_api::{ SafekeeperDescribeResponse, SkSchedulingPolicy, 
TimelineImportRequest, + TimelineSafekeeperMigrateRequest, }; use pageserver_api::models::{SafekeeperInfo, SafekeepersInfo, TimelineInfo}; use safekeeper_api::PgVersionId; -use safekeeper_api::membership::{MemberSet, SafekeeperGeneration, SafekeeperId}; +use safekeeper_api::membership::{self, MemberSet, SafekeeperGeneration}; +use safekeeper_api::models::{ + PullTimelineRequest, TimelineMembershipSwitchRequest, TimelineMembershipSwitchResponse, +}; +use safekeeper_api::{INITIAL_TERM, Term}; +use safekeeper_client::mgmt_api; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use utils::id::{NodeId, TenantId, TimelineId}; @@ -35,6 +44,33 @@ pub struct TimelineLocateResponse { } impl Service { + fn make_member_set(safekeepers: &[Safekeeper]) -> Result { + let members = safekeepers + .iter() + .map(|sk| sk.get_safekeeper_id()) + .collect::>(); + + MemberSet::new(members).map_err(ApiError::InternalServerError) + } + + fn get_safekeepers(&self, ids: &[i64]) -> Result, ApiError> { + let safekeepers = { + let locked = self.inner.read().unwrap(); + locked.safekeepers.clone() + }; + + ids.iter() + .map(|&id| { + let node_id = NodeId(id as u64); + safekeepers.get(&node_id).cloned().ok_or_else(|| { + ApiError::InternalServerError(anyhow::anyhow!( + "safekeeper {node_id} is not registered" + )) + }) + }) + .collect::, _>>() + } + /// Timeline creation on safekeepers /// /// Returns `Ok(left)` if the timeline has been created on a quorum of safekeepers, @@ -47,35 +83,9 @@ impl Service { pg_version: PgVersionId, timeline_persistence: &TimelinePersistence, ) -> Result, ApiError> { - // If quorum is reached, return if we are outside of a specified timeout - let jwt = self - .config - .safekeeper_jwt_token - .clone() - .map(SecretString::from); - let mut joinset = JoinSet::new(); + let safekeepers = self.get_safekeepers(&timeline_persistence.sk_set)?; - // Prepare membership::Configuration from choosen safekeepers. 
- let safekeepers = { - let locked = self.inner.read().unwrap(); - locked.safekeepers.clone() - }; - - let mut members = Vec::new(); - for sk_id in timeline_persistence.sk_set.iter() { - let sk_id = NodeId(*sk_id as u64); - let Some(safekeeper) = safekeepers.get(&sk_id) else { - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "couldn't find entry for safekeeper with id {sk_id}" - )))?; - }; - members.push(SafekeeperId { - id: sk_id, - host: safekeeper.skp.host.clone(), - pg_port: safekeeper.skp.port as u16, - }); - } - let mset = MemberSet::new(members).map_err(ApiError::InternalServerError)?; + let mset = Self::make_member_set(&safekeepers)?; let mconf = safekeeper_api::membership::Configuration::new(mset); let req = safekeeper_api::models::TimelineCreateRequest { @@ -88,79 +98,150 @@ impl Service { timeline_id, wal_seg_size: None, }; + const SK_CREATE_TIMELINE_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); - for sk in timeline_persistence.sk_set.iter() { - let sk_id = NodeId(*sk as u64); - let safekeepers = safekeepers.clone(); + + let results = self + .tenant_timeline_safekeeper_op_quorum( + &safekeepers, + move |client| { + let req = req.clone(); + async move { client.create_timeline(&req).await } + }, + SK_CREATE_TIMELINE_RECONCILE_TIMEOUT, + ) + .await?; + + Ok(results + .into_iter() + .enumerate() + .filter_map(|(idx, res)| { + if res.is_ok() { + None // Success, don't return this safekeeper + } else { + Some(safekeepers[idx].get_id()) // Failure, return this safekeeper + } + }) + .collect::>()) + } + + /// Perform an operation on a list of safekeepers in parallel with retries. + /// + /// Return the results of the operation on each safekeeper in the input order. + async fn tenant_timeline_safekeeper_op( + &self, + safekeepers: &[Safekeeper], + op: O, + timeout: Duration, + ) -> Result>, ApiError> + where + O: FnMut(SafekeeperClient) -> F + Send + 'static, + O: Clone, + F: std::future::Future> + Send + 'static, + T: Sync + Send + 'static, + { + let jwt = self + .config + .safekeeper_jwt_token + .clone() + .map(SecretString::from); + let mut joinset = JoinSet::new(); + + for (idx, sk) in safekeepers.iter().enumerate() { + let sk = sk.clone(); let http_client = self.http_client.clone(); let jwt = jwt.clone(); - let req = req.clone(); + let op = op.clone(); joinset.spawn(async move { - // Unwrap is fine as we already would have returned error above - let sk_p = safekeepers.get(&sk_id).unwrap(); - let res = sk_p + let res = sk .with_client_retries( - |client| { - let req = req.clone(); - async move { client.create_timeline(&req).await } - }, + op, &http_client, &jwt, 3, 3, - SK_CREATE_TIMELINE_RECONCILE_TIMEOUT, + // TODO(diko): This is a wrong timeout. + // It should be scaled to the retry count. + timeout, &CancellationToken::new(), ) .await; - (sk_id, sk_p.skp.host.clone(), res) + (idx, res) }); } + + // Initialize results with timeout errors in case we never get a response. + let mut results: Vec> = safekeepers + .iter() + .map(|_| { + Err(mgmt_api::Error::Timeout( + "safekeeper operation timed out".to_string(), + )) + }) + .collect(); + // After we have built the joinset, we now wait for the tasks to complete, // but with a specified timeout to make sure we return swiftly, either with // a failure or success. - let reconcile_deadline = tokio::time::Instant::now() + SK_CREATE_TIMELINE_RECONCILE_TIMEOUT; + let reconcile_deadline = tokio::time::Instant::now() + timeout; // Wait until all tasks finish or timeout is hit, whichever occurs // first. 
- let mut reconcile_results = Vec::new(); + let mut result_count = 0; loop { if let Ok(res) = tokio::time::timeout_at(reconcile_deadline, joinset.join_next()).await { let Some(res) = res else { break }; match res { - Ok(res) => { + Ok((idx, res)) => { + let sk = &safekeepers[idx]; tracing::info!( "response from safekeeper id:{} at {}: {:?}", - res.0, - res.1, - res.2 + sk.get_id(), + sk.skp.host, + // Only print errors, as there is no Debug trait for T. + res.as_ref().map(|_| ()), ); - reconcile_results.push(res); + results[idx] = res; + result_count += 1; } Err(join_err) => { tracing::info!("join_err for task in joinset: {join_err}"); } } } else { - tracing::info!( - "timeout for creation call after {} responses", - reconcile_results.len() - ); + tracing::info!("timeout for operation call after {result_count} responses",); break; } } - // Now check now if quorum was reached in reconcile_results. - let total_result_count = reconcile_results.len(); - let remaining = reconcile_results - .into_iter() - .filter_map(|res| res.2.is_err().then_some(res.0)) - .collect::>(); - tracing::info!( - "Got {} non-successful responses from initial creation request of total {total_result_count} responses", - remaining.len() - ); - let target_sk_count = timeline_persistence.sk_set.len(); + Ok(results) + } + + /// Perform an operation on a list of safekeepers in parallel with retries, + /// and validates that we reach a quorum of successful responses. + /// + /// Return the results of the operation on each safekeeper in the input order. + /// It's guaranteed that at least a quorum of the responses are successful. + async fn tenant_timeline_safekeeper_op_quorum( + &self, + safekeepers: &[Safekeeper], + op: O, + timeout: Duration, + ) -> Result>, ApiError> + where + O: FnMut(SafekeeperClient) -> F, + O: Clone + Send + 'static, + F: std::future::Future> + Send + 'static, + T: Sync + Send + 'static, + { + let results = self + .tenant_timeline_safekeeper_op(safekeepers, op, timeout) + .await?; + + // Now check if quorum was reached in results. + + let target_sk_count = safekeepers.len(); let quorum_size = match target_sk_count { 0 => { return Err(ApiError::InternalServerError(anyhow::anyhow!( @@ -179,7 +260,7 @@ impl Service { // in order to schedule work to them tracing::warn!( "couldn't find at least 3 safekeepers for timeline, found: {:?}", - timeline_persistence.sk_set + target_sk_count ); return Err(ApiError::InternalServerError(anyhow::anyhow!( "couldn't find at least 3 safekeepers to put timeline to" @@ -188,7 +269,7 @@ impl Service { } _ => target_sk_count / 2 + 1, }; - let success_count = target_sk_count - remaining.len(); + let success_count = results.iter().filter(|res| res.is_ok()).count(); if success_count < quorum_size { // Failure return Err(ApiError::InternalServerError(anyhow::anyhow!( @@ -196,7 +277,7 @@ impl Service { ))); } - Ok(remaining) + Ok(results) } /// Create timeline in controller database and on safekeepers. @@ -797,4 +878,435 @@ impl Service { } Ok(()) } + + /// Call `switch_timeline_membership` on all safekeepers with retries + /// till the quorum of successful responses is reached. + /// + /// If min_position is not None, validates that majority of safekeepers + /// reached at least min_position. + /// + /// Return responses from safekeepers in the input order. 
+ async fn tenant_timeline_set_membership_quorum( + self: &Arc, + tenant_id: TenantId, + timeline_id: TimelineId, + safekeepers: &[Safekeeper], + config: &membership::Configuration, + min_position: Option<(Term, Lsn)>, + ) -> Result>, ApiError> { + let req = TimelineMembershipSwitchRequest { + mconf: config.clone(), + }; + + const SK_SET_MEM_TIMELINE_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); + + let results = self + .tenant_timeline_safekeeper_op_quorum( + safekeepers, + move |client| { + let req = req.clone(); + async move { + let mut res = client + .switch_timeline_membership(tenant_id, timeline_id, &req) + .await; + + // If min_position is not reached, map the response to an error, + // so it isn't counted toward the quorum. + if let Some(min_position) = min_position { + if let Ok(ok_res) = &res { + if (ok_res.term, ok_res.flush_lsn) < min_position { + // Use Error::Timeout to make this error retriable. + res = Err(mgmt_api::Error::Timeout( + format!( + "safekeeper {} returned position {:?} which is less than minimum required position {:?}", + client.node_id_label(), + (ok_res.term, ok_res.flush_lsn), + min_position + ) + )); + } + } + } + + res + } + }, + SK_SET_MEM_TIMELINE_RECONCILE_TIMEOUT, + ) + .await?; + + for res in results.iter().flatten() { + if res.current_conf.generation > config.generation { + // Antoher switch_membership raced us. + return Err(ApiError::Conflict(format!( + "received configuration with generation {} from safekeeper, but expected {}", + res.current_conf.generation, config.generation + ))); + } else if res.current_conf.generation < config.generation { + // Note: should never happen. + // If we get a response, it should be at least the sent generation. + tracing::error!( + "received configuration with generation {} from safekeeper, but expected {}", + res.current_conf.generation, + config.generation + ); + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "received configuration with generation {} from safekeeper, but expected {}", + res.current_conf.generation, + config.generation + ))); + } + } + + Ok(results) + } + + /// Pull timeline to to_safekeepers from from_safekeepers with retries. + /// + /// Returns Ok(()) only if all the pull_timeline requests were successful. + async fn tenant_timeline_pull_from_peers( + self: &Arc, + tenant_id: TenantId, + timeline_id: TimelineId, + to_safekeepers: &[Safekeeper], + from_safekeepers: &[Safekeeper], + ) -> Result<(), ApiError> { + let http_hosts = from_safekeepers + .iter() + .map(|sk| sk.base_url()) + .collect::>(); + + tracing::info!( + "pulling timeline to {:?} from {:?}", + to_safekeepers + .iter() + .map(|sk| sk.get_id()) + .collect::>(), + from_safekeepers + .iter() + .map(|sk| sk.get_id()) + .collect::>() + ); + + // TODO(diko): need to pass mconf/generation with the request + // to properly handle tombstones. Ignore tombstones for now. + // Worst case: we leave a timeline on a safekeeper which is not in the current set. 
+ let req = PullTimelineRequest { + tenant_id, + timeline_id, + http_hosts, + ignore_tombstone: Some(true), + }; + + const SK_PULL_TIMELINE_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); + + let responses = self + .tenant_timeline_safekeeper_op( + to_safekeepers, + move |client| { + let req = req.clone(); + async move { client.pull_timeline(&req).await } + }, + SK_PULL_TIMELINE_RECONCILE_TIMEOUT, + ) + .await?; + + if let Some((idx, err)) = responses + .iter() + .enumerate() + .find_map(|(idx, res)| Some((idx, res.as_ref().err()?))) + { + let sk_id = to_safekeepers[idx].get_id(); + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "pull_timeline to {sk_id} failed: {err}", + ))); + } + + Ok(()) + } + + /// Exclude a timeline from safekeepers in parallel with retries. + /// If an exclude request is unsuccessful, it will be added to + /// the reconciler, and after that the function will succeed. + async fn tenant_timeline_safekeeper_exclude( + self: &Arc, + tenant_id: TenantId, + timeline_id: TimelineId, + safekeepers: &[Safekeeper], + config: &membership::Configuration, + ) -> Result<(), ApiError> { + let req = TimelineMembershipSwitchRequest { + mconf: config.clone(), + }; + + const SK_EXCLUDE_TIMELINE_TIMEOUT: Duration = Duration::from_secs(30); + + let results = self + .tenant_timeline_safekeeper_op( + safekeepers, + move |client| { + let req = req.clone(); + async move { client.exclude_timeline(tenant_id, timeline_id, &req).await } + }, + SK_EXCLUDE_TIMELINE_TIMEOUT, + ) + .await?; + + let mut reconcile_requests = Vec::new(); + + for (idx, res) in results.iter().enumerate() { + if res.is_err() { + let sk_id = safekeepers[idx].skp.id; + let pending_op = TimelinePendingOpPersistence { + tenant_id: tenant_id.to_string(), + timeline_id: timeline_id.to_string(), + generation: config.generation.into_inner() as i32, + op_kind: SafekeeperTimelineOpKind::Exclude, + sk_id, + }; + tracing::info!("writing pending exclude op for sk id {sk_id}"); + self.persistence.insert_pending_op(pending_op).await?; + + let req = ScheduleRequest { + safekeeper: Box::new(safekeepers[idx].clone()), + host_list: Vec::new(), + tenant_id, + timeline_id: Some(timeline_id), + generation: config.generation.into_inner(), + kind: SafekeeperTimelineOpKind::Exclude, + }; + reconcile_requests.push(req); + } + } + + if !reconcile_requests.is_empty() { + let locked = self.inner.read().unwrap(); + for req in reconcile_requests { + locked.safekeeper_reconcilers.schedule_request(req); + } + } + + Ok(()) + } + + /// Migrate timeline safekeeper set to a new set. + /// + /// This function implements an algorithm from RFC-035. + /// + pub(crate) async fn tenant_timeline_safekeeper_migrate( + self: &Arc, + tenant_id: TenantId, + timeline_id: TimelineId, + req: TimelineSafekeeperMigrateRequest, + ) -> Result<(), ApiError> { + let all_safekeepers = self.inner.read().unwrap().safekeepers.clone(); + + let new_sk_set = req.new_sk_set; + + for sk_id in new_sk_set.iter() { + if !all_safekeepers.contains_key(sk_id) { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "safekeeper {sk_id} does not exist" + ))); + } + } + + // TODO(diko): per-tenant lock is too wide. Consider introducing per-timeline locks. + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimelineSafekeeperMigrate, + ) + .await; + + // 1. Fetch current timeline configuration from the configuration storage. 
+ + let timeline = self + .persistence + .get_timeline(tenant_id, timeline_id) + .await?; + + let Some(timeline) = timeline else { + return Err(ApiError::NotFound( + anyhow::anyhow!( + "timeline {tenant_id}/{timeline_id} doesn't exist in timelines table" + ) + .into(), + )); + }; + + let cur_sk_set = timeline + .sk_set + .iter() + .map(|&id| NodeId(id as u64)) + .collect::>(); + + tracing::info!( + ?cur_sk_set, + ?new_sk_set, + "Migrating timeline to new safekeeper set", + ); + + let mut generation = SafekeeperGeneration::new(timeline.generation as u32); + + if let Some(ref presistent_new_sk_set) = timeline.new_sk_set { + // 2. If it is already joint one and new_set is different from desired_set refuse to change. + if presistent_new_sk_set + .iter() + .map(|&id| NodeId(id as u64)) + .ne(new_sk_set.iter().cloned()) + { + tracing::info!( + ?presistent_new_sk_set, + ?new_sk_set, + "different new safekeeper set is already set in the database", + ); + return Err(ApiError::Conflict(format!( + "the timeline is already migrating to a different safekeeper set: {presistent_new_sk_set:?}" + ))); + } + // It it is the same new_sk_set, we can continue the migration (retry). + } else { + // 3. No active migration yet. + // Increment current generation and put desired_set to new_sk_set. + generation = generation.next(); + + self.persistence + .update_timeline_membership( + tenant_id, + timeline_id, + generation, + &cur_sk_set, + Some(&new_sk_set), + ) + .await?; + } + + let cur_safekeepers = self.get_safekeepers(&timeline.sk_set)?; + let cur_sk_member_set = Self::make_member_set(&cur_safekeepers)?; + + let new_sk_set_i64 = new_sk_set.iter().map(|id| id.0 as i64).collect::>(); + let new_safekeepers = self.get_safekeepers(&new_sk_set_i64)?; + let new_sk_member_set = Self::make_member_set(&new_safekeepers)?; + + let joint_config = membership::Configuration { + generation, + members: cur_sk_member_set, + new_members: Some(new_sk_member_set.clone()), + }; + + // 4. Call PUT configuration on safekeepers from the current set, + // delivering them joint_conf. + + // TODO(diko): need to notify cplane with an updated set of safekeepers. + + let results = self + .tenant_timeline_set_membership_quorum( + tenant_id, + timeline_id, + &cur_safekeepers, + &joint_config, + None, // no min position + ) + .await?; + + let mut sync_position = (INITIAL_TERM, Lsn::INVALID); + for res in results.into_iter().flatten() { + let sk_position = (res.term, res.flush_lsn); + if sync_position < sk_position { + sync_position = sk_position; + } + } + + tracing::info!( + %generation, + ?sync_position, + "safekeepers set membership updated", + ); + + // 5. Initialize timeline on safekeeper(s) from new_sk_set where it doesn't exist yet + // by doing pull_timeline from the majority of the current set. + + // Filter out safekeepers which are already in the current set. + let from_ids: HashSet = cur_safekeepers.iter().map(|sk| sk.get_id()).collect(); + let pull_to_safekeepers = new_safekeepers + .iter() + .filter(|sk| !from_ids.contains(&sk.get_id())) + .cloned() + .collect::>(); + + self.tenant_timeline_pull_from_peers( + tenant_id, + timeline_id, + &pull_to_safekeepers, + &cur_safekeepers, + ) + .await?; + + // 6. Call POST bump_term(sync_term) on safekeepers from the new set. Success on majority is enough. + + // TODO(diko): do we need to bump timeline term? + + // 7. Repeatedly call PUT configuration on safekeepers from the new set, + // delivering them joint_conf and collecting their positions. 
+ + tracing::info!(?sync_position, "waiting for safekeepers to sync position"); + + self.tenant_timeline_set_membership_quorum( + tenant_id, + timeline_id, + &new_safekeepers, + &joint_config, + Some(sync_position), + ) + .await?; + + // 8. Create new_conf: Configuration incrementing joint_conf generation and + // having new safekeeper set as sk_set and None new_sk_set. + + let generation = generation.next(); + + let new_conf = membership::Configuration { + generation, + members: new_sk_member_set, + new_members: None, + }; + + self.persistence + .update_timeline_membership(tenant_id, timeline_id, generation, &new_sk_set, None) + .await?; + + // TODO(diko): at this point we have already updated the timeline in the database, + // but we still need to notify safekeepers and cplane about the new configuration, + // and put delition of the timeline from the old safekeepers into the reconciler. + // Ideally it should be done atomically, but now it's not. + // Worst case: the timeline is not deleted from old safekeepers, + // the compute may require both quorums till the migration is retried and completed. + + self.tenant_timeline_set_membership_quorum( + tenant_id, + timeline_id, + &new_safekeepers, + &new_conf, + None, // no min position + ) + .await?; + + let new_ids: HashSet = new_safekeepers.iter().map(|sk| sk.get_id()).collect(); + let exclude_safekeepers = cur_safekeepers + .into_iter() + .filter(|sk| !new_ids.contains(&sk.get_id())) + .collect::>(); + self.tenant_timeline_safekeeper_exclude( + tenant_id, + timeline_id, + &exclude_safekeepers, + &new_conf, + ) + .await?; + + // TODO(diko): need to notify cplane with an updated set of safekeepers. + + Ok(()) + } } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 48c6597c7c..508e3d8dd2 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1215,6 +1215,13 @@ class NeonEnv: storage_controller_config = storage_controller_config or {} storage_controller_config["use_https_safekeeper_api"] = True + # TODO(diko): uncomment when timeline_safekeeper_count option is in the release branch, + # so the compat tests will not fail bacause of it presence. 
+ # if config.num_safekeepers < 3: + # storage_controller_config = storage_controller_config or {} + # if "timeline_safekeeper_count" not in storage_controller_config: + # storage_controller_config["timeline_safekeeper_count"] = config.num_safekeepers + if storage_controller_config is not None: cfg["storage_controller"] = storage_controller_config @@ -2226,6 +2233,21 @@ class NeonStorageController(MetricsGetter, LogUtils): response.raise_for_status() log.info(f"timeline_create success: {response.json()}") + def migrate_safekeepers( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + new_sk_set: list[int], + ): + response = self.request( + "POST", + f"{self.api}/v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate", + json={"new_sk_set": new_sk_set}, + headers=self.headers(TokenScope.PAGE_SERVER_API), + ) + response.raise_for_status() + log.info(f"migrate_safekeepers success: {response.json()}") + def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]: """ :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr": str, "listen_http_port": int} diff --git a/test_runner/regress/test_safekeeper_migration.py b/test_runner/regress/test_safekeeper_migration.py new file mode 100644 index 0000000000..f67b6afc95 --- /dev/null +++ b/test_runner/regress/test_safekeeper_migration.py @@ -0,0 +1,64 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnvBuilder + + +def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder): + """ + Simple safekeeper migration test. + Creates 3 safekeepers. The timeline is configuret to use only one safekeeper. + 1. Go through all safekeepers, migrate the timeline to it. + 2. Stop the other safekeepers. Validate that the insert is successful. + 3. Start the other safekeepers again and go to the next safekeeper. + 4. Validate that the table contains all inserted values. + """ + neon_env_builder.num_safekeepers = 3 + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": True, + "timeline_safekeeper_count": 1, + } + env = neon_env_builder.init_start() + # TODO(diko): pageserver spams with various errors during safekeeper migration. + # Fix the code so it handles the migration better. + env.pageserver.allowed_errors.extend( + [ + ".*Timeline .* was cancelled and cannot be used anymore.*", + ".*Timeline .* has been deleted.*", + ".*wal receiver task finished with an error.*", + ] + ) + + ep = env.endpoints.create("main", tenant_id=env.initial_tenant) + # We specify all safekeepers, so compute will connect to all of them. + # Only those from the current membership configuration will be used. + # TODO(diko): set only current safekeepers when cplane notify is implemented. 
+ ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3]) + ep.safe_psql("CREATE EXTENSION neon_test_utils;") + ep.safe_psql("CREATE TABLE t(a int)") + + for active_sk in range(1, 4): + env.storage_controller.migrate_safekeepers( + env.initial_tenant, env.initial_timeline, [active_sk] + ) + + other_sks = [sk for sk in range(1, 4) if sk != active_sk] + + for sk in other_sks: + env.safekeepers[sk - 1].stop() + + ep.safe_psql(f"INSERT INTO t VALUES ({active_sk})") + + for sk in other_sks: + env.safekeepers[sk - 1].start() + + ep.clear_buffers() + + assert ep.safe_psql("SELECT * FROM t") == [(i,) for i in range(1, 4)] + + ep.stop() + ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3]) + + assert ep.safe_psql("SELECT * FROM t") == [(i,) for i in range(1, 4)] From 1d43f3bee80f1b8ffd7e6c6576e7c280510996c7 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 30 Jun 2025 11:08:44 +0200 Subject: [PATCH 04/19] pageserver: fix stripe size persistence in legacy HTTP handlers (#12377) ## Problem Similarly to #12217, the following endpoints may result in a stripe size mismatch between the storage controller and Pageserver if an unsharded tenant has a different stripe size set than the default. This can lead to data corruption if the tenant is later manually split without specifying an explicit stripe size, since the storage controller and Pageserver will apply different defaults. This commonly happens with tenants that were created before the default stripe size was changed from 32k to 2k. * `PUT /v1/tenant/config` * `PATCH /v1/tenant/config` These endpoints are no longer in regular production use (they were used when cplane still managed Pageserver directly), but can still be called manually or by tests. ## Summary of changes Retain the current shard parameters when updating the location config in `PUT | PATCH /v1/tenant/config`. Also opportunistically derive `Copy` for `ShardParameters`. --- libs/pageserver_api/src/models.rs | 15 +++++++++++++-- libs/pageserver_api/src/shard.rs | 2 +- pageserver/src/http/routes.rs | 4 ++-- pageserver/src/pgdatadir_mapping.rs | 2 +- pageserver/src/tenant.rs | 6 +++++- pageserver/src/tenant/config.rs | 2 +- pageserver/src/tenant/timeline/handle.rs | 14 +++++++------- safekeeper/src/handler.rs | 2 +- storage_controller/src/service.rs | 4 ++-- 9 files changed, 33 insertions(+), 18 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 82a3ac0eb4..16545364c1 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -21,7 +21,9 @@ use utils::{completion, serde_system_time}; use crate::config::Ratio; use crate::key::{CompactKey, Key}; -use crate::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId}; +use crate::shard::{ + DEFAULT_STRIPE_SIZE, ShardCount, ShardIdentity, ShardStripeSize, TenantShardId, +}; /// The state of a tenant in this pageserver. /// @@ -475,7 +477,7 @@ pub struct TenantShardSplitResponse { } /// Parameters that apply to all shards in a tenant. Used during tenant creation. 
-#[derive(Serialize, Deserialize, Debug)] +#[derive(Clone, Copy, Serialize, Deserialize, Debug)] #[serde(deny_unknown_fields)] pub struct ShardParameters { pub count: ShardCount, @@ -497,6 +499,15 @@ impl Default for ShardParameters { } } +impl From for ShardParameters { + fn from(identity: ShardIdentity) -> Self { + Self { + count: identity.count, + stripe_size: identity.stripe_size, + } + } +} + #[derive(Debug, Default, Clone, Eq, PartialEq)] pub enum FieldPatch { Upsert(T), diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 9c16be93e8..a9fe3dac43 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -179,7 +179,7 @@ impl ShardIdentity { /// For use when creating ShardIdentity instances for new shards, where a creation request /// specifies the ShardParameters that apply to all shards. - pub fn from_params(number: ShardNumber, params: &ShardParameters) -> Self { + pub fn from_params(number: ShardNumber, params: ShardParameters) -> Self { Self { number, count: params.count, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index aa9bec657c..f770e420f0 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1893,7 +1893,7 @@ async fn update_tenant_config_handler( let location_conf = LocationConf::attached_single( new_tenant_conf.clone(), tenant.get_generation(), - &ShardParameters::default(), + ShardParameters::from(tenant.get_shard_identity()), ); crate::tenant::TenantShard::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf) @@ -1937,7 +1937,7 @@ async fn patch_tenant_config_handler( let location_conf = LocationConf::attached_single( updated, tenant.get_generation(), - &ShardParameters::default(), + ShardParameters::from(tenant.get_shard_identity()), ); crate::tenant::TenantShard::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 09a7a8a651..31f38d485f 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -3015,7 +3015,7 @@ mod tests { // This shard will get the even blocks let shard = ShardIdentity::from_params( ShardNumber(0), - &ShardParameters { + ShardParameters { count: ShardCount(2), stripe_size: ShardStripeSize(1), }, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 2e9dbdc539..79bea4eb77 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3872,6 +3872,10 @@ impl TenantShard { &self.tenant_shard_id } + pub(crate) fn get_shard_identity(&self) -> ShardIdentity { + self.shard_identity + } + pub(crate) fn get_shard_stripe_size(&self) -> ShardStripeSize { self.shard_identity.stripe_size } @@ -6008,7 +6012,7 @@ pub(crate) mod harness { AttachedTenantConf::try_from(LocationConf::attached_single( self.tenant_conf.clone(), self.generation, - &ShardParameters::default(), + ShardParameters::default(), )) .unwrap(), self.shard_identity, diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index c5087f7e0f..46cc669400 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -136,7 +136,7 @@ impl LocationConf { pub(crate) fn attached_single( tenant_conf: pageserver_api::models::TenantConfig, generation: Generation, - shard_params: &models::ShardParameters, + shard_params: models::ShardParameters, ) -> Self { Self { mode: LocationMode::Attached(AttachedLocationConfig { diff --git 
a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs index 809b350f38..2dbff20ab2 100644 --- a/pageserver/src/tenant/timeline/handle.rs +++ b/pageserver/src/tenant/timeline/handle.rs @@ -887,7 +887,7 @@ mod tests { .expect("we still have it"); } - fn make_relation_key_for_shard(shard: ShardNumber, params: &ShardParameters) -> Key { + fn make_relation_key_for_shard(shard: ShardNumber, params: ShardParameters) -> Key { rel_block_to_key( RelTag { spcnode: 1663, @@ -917,14 +917,14 @@ mod tests { let child0 = Arc::new_cyclic(|myself| StubTimeline { gate: Default::default(), id: timeline_id, - shard: ShardIdentity::from_params(ShardNumber(0), &child_params), + shard: ShardIdentity::from_params(ShardNumber(0), child_params), per_timeline_state: PerTimelineState::default(), myself: myself.clone(), }); let child1 = Arc::new_cyclic(|myself| StubTimeline { gate: Default::default(), id: timeline_id, - shard: ShardIdentity::from_params(ShardNumber(1), &child_params), + shard: ShardIdentity::from_params(ShardNumber(1), child_params), per_timeline_state: PerTimelineState::default(), myself: myself.clone(), }); @@ -937,7 +937,7 @@ mod tests { let handle = cache .get( timeline_id, - ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)), + ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), child_params)), &StubManager { shards: vec![parent.clone()], }, @@ -961,7 +961,7 @@ mod tests { let handle = cache .get( timeline_id, - ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)), + ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), child_params)), &StubManager { shards: vec![], // doesn't matter what's in here, the cache is fully loaded }, @@ -978,7 +978,7 @@ mod tests { let parent_handle = cache .get( timeline_id, - ShardSelector::Page(make_relation_key_for_shard(ShardNumber(0), &child_params)), + ShardSelector::Page(make_relation_key_for_shard(ShardNumber(0), child_params)), &StubManager { shards: vec![parent.clone()], }, @@ -995,7 +995,7 @@ mod tests { let handle = cache .get( timeline_id, - ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)), + ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), child_params)), &StubManager { shards: vec![child0.clone(), child1.clone()], // <====== this changed compared to previous loop }, diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 5e7f1d8758..373589a18e 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -220,7 +220,7 @@ impl postgres_backend::Handler stripe_size: ShardStripeSize(stripe_size), }; self.shard = - Some(ShardIdentity::from_params(ShardNumber(number), ¶ms)); + Some(ShardIdentity::from_params(ShardNumber(number), params)); } _ => { return Err(QueryError::Other(anyhow::anyhow!( diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index bbf93fd751..e0b13c4e63 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2584,7 +2584,7 @@ impl Service { .do_initial_shard_scheduling( tenant_shard_id, initial_generation, - &create_req.shard_parameters, + create_req.shard_parameters, create_req.config.clone(), placement_policy.clone(), preferred_az_id.as_ref(), @@ -2641,7 +2641,7 @@ impl Service { &self, tenant_shard_id: TenantShardId, initial_generation: Option, - shard_params: &ShardParameters, + shard_params: ShardParameters, config: TenantConfig, placement_policy: PlacementPolicy, preferred_az_id: 
Option<&AvailabilityZone>, From 620d50432cbadc240b834b65be3f221c462e29b0 Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Mon, 30 Jun 2025 11:33:57 +0200 Subject: [PATCH 05/19] Fix path issue in the proxy-bernch CI workflow (#12388) --- .github/workflows/proxy-benchmark.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/proxy-benchmark.yml b/.github/workflows/proxy-benchmark.yml index 75ecacaced..3a98ad4e8e 100644 --- a/.github/workflows/proxy-benchmark.yml +++ b/.github/workflows/proxy-benchmark.yml @@ -60,22 +60,23 @@ jobs: } >> "$GITHUB_ENV" - name: Run proxy-bench - run: ./${PROXY_BENCH_PATH}/run.sh + run: ${PROXY_BENCH_PATH}/run.sh - name: Ingest Bench Results # neon repo script - if: success() + if: always() run: | mkdir -p $TEST_OUTPUT python $NEON_DIR/scripts/proxy_bench_results_ingest.py --out $TEST_OUTPUT - name: Push Metrics to Proxy perf database - if: success() + if: always() env: PERF_TEST_RESULT_CONNSTR: "${{ secrets.PROXY_TEST_RESULT_CONNSTR }}" REPORT_FROM: $TEST_OUTPUT run: $NEON_DIR/scripts/generate_and_push_perf_report.sh - name: Docker cleanup + if: always() run: docker compose down - name: Notify Failure From 2af938096273cdd610cf58973c0a230ed2d894a9 Mon Sep 17 00:00:00 2001 From: Busra Kugler Date: Mon, 30 Jun 2025 12:15:10 +0200 Subject: [PATCH 06/19] Revert "Replace step-security maintained actions" (#12386) Reverts neondatabase/neon#11663 and https://github.com/neondatabase/neon/pull/11265/ Step Security is not yet approved by Databricks team, in order to prevent issues during Github org migration, I'll revert this PR to use the previous action instead of Step Security maintained action. --- .github/workflows/build_and_test.yml | 2 +- .github/workflows/neon_extra_builds.yml | 2 +- .github/workflows/pre-merge-checks.yml | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 94f768719f..456c7b8c92 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -69,7 +69,7 @@ jobs: submodules: true - name: Check for file changes - uses: step-security/paths-filter@v3 + uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 # v3.0.2 id: files-changed with: token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 3427a0eb49..3e81183687 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -53,7 +53,7 @@ jobs: submodules: true - name: Check for Postgres changes - uses: step-security/paths-filter@v3 + uses: dorny/paths-filter@1441771bbfdd59dcd748680ee64ebd8faab1a242 #v3 id: files_changed with: token: ${{ github.token }} diff --git a/.github/workflows/pre-merge-checks.yml b/.github/workflows/pre-merge-checks.yml index 6fb4753fc0..23b8573097 100644 --- a/.github/workflows/pre-merge-checks.yml +++ b/.github/workflows/pre-merge-checks.yml @@ -34,7 +34,7 @@ jobs: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - uses: step-security/changed-files@3dbe17c78367e7d60f00d78ae6781a35be47b4a1 # v45.0.1 + - uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5 id: python-src with: files: | @@ -45,7 +45,7 @@ jobs: poetry.lock pyproject.toml - - uses: step-security/changed-files@3dbe17c78367e7d60f00d78ae6781a35be47b4a1 # v45.0.1 + - uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5 id: 
rust-src with: files: | From 66f53d9d348e2d2c39d5b180798b58befde09f4a Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 30 Jun 2025 13:03:48 +0200 Subject: [PATCH 07/19] refactor(pageserver): force explicit mapping to `CreateImageLayersError::Other` (#12382) Implicit mapping to an `anyhow::Error` when we do `?` is discouraged because tooling to find those places isn't great. As a drive-by, also make SplitImageLayerWriter::new infallible and sync. I think we should also make ImageLayerWriter::new completely lazy, then `BatchLayerWriter:new` infallible and async. --- .../storage_layer/batch_split_writer.rs | 35 ++++++++----------- pageserver/src/tenant/timeline.rs | 12 ++++--- pageserver/src/tenant/timeline/compaction.rs | 21 ++++++----- 3 files changed, 35 insertions(+), 33 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/batch_split_writer.rs b/pageserver/src/tenant/storage_layer/batch_split_writer.rs index 1d50a5f3a0..9c2a177f3a 100644 --- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs +++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs @@ -55,11 +55,11 @@ pub struct BatchLayerWriter { } impl BatchLayerWriter { - pub async fn new(conf: &'static PageServerConf) -> anyhow::Result { - Ok(Self { + pub fn new(conf: &'static PageServerConf) -> Self { + Self { generated_layer_writers: Vec::new(), conf, - }) + } } pub fn add_unfinished_image_writer( @@ -209,6 +209,7 @@ impl<'a> SplitImageLayerWriter<'a> { ) -> anyhow::Result { Ok(Self { target_layer_size, + // XXX make this lazy like in SplitDeltaLayerWriter? inner: ImageLayerWriter::new( conf, timeline_id, @@ -223,7 +224,7 @@ impl<'a> SplitImageLayerWriter<'a> { conf, timeline_id, tenant_shard_id, - batches: BatchLayerWriter::new(conf).await?, + batches: BatchLayerWriter::new(conf), lsn, start_key, gate, @@ -319,7 +320,7 @@ pub struct SplitDeltaLayerWriter<'a> { } impl<'a> SplitDeltaLayerWriter<'a> { - pub async fn new( + pub fn new( conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, @@ -327,8 +328,8 @@ impl<'a> SplitDeltaLayerWriter<'a> { target_layer_size: u64, gate: &'a utils::sync::gate::Gate, cancel: CancellationToken, - ) -> anyhow::Result { - Ok(Self { + ) -> Self { + Self { target_layer_size, inner: None, conf, @@ -336,10 +337,10 @@ impl<'a> SplitDeltaLayerWriter<'a> { tenant_shard_id, lsn_range, last_key_written: Key::MIN, - batches: BatchLayerWriter::new(conf).await?, + batches: BatchLayerWriter::new(conf), gate, cancel, - }) + } } pub async fn put_value( @@ -510,9 +511,7 @@ mod tests { 4 * 1024 * 1024, &tline.gate, tline.cancel.clone(), - ) - .await - .unwrap(); + ); image_writer .put_image(get_key(0), get_img(0), &ctx) @@ -590,9 +589,7 @@ mod tests { 4 * 1024 * 1024, &tline.gate, tline.cancel.clone(), - ) - .await - .unwrap(); + ); const N: usize = 2000; for i in 0..N { let i = i as u32; @@ -692,9 +689,7 @@ mod tests { 4 * 1024, &tline.gate, tline.cancel.clone(), - ) - .await - .unwrap(); + ); image_writer .put_image(get_key(0), get_img(0), &ctx) @@ -770,9 +765,7 @@ mod tests { 4 * 1024 * 1024, &tline.gate, tline.cancel.clone(), - ) - .await - .unwrap(); + ); for i in 0..N { let i = i as u32; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 7261ce783d..08bc6d4a59 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -763,7 +763,7 @@ pub(crate) enum CreateImageLayersError { PageReconstructError(#[source] PageReconstructError), #[error(transparent)] - Other(#[from] 
anyhow::Error), + Other(anyhow::Error), } impl From for CreateImageLayersError { @@ -5590,7 +5590,7 @@ impl Timeline { self.should_check_if_image_layers_required(lsn) }; - let mut batch_image_writer = BatchLayerWriter::new(self.conf).await?; + let mut batch_image_writer = BatchLayerWriter::new(self.conf); let mut all_generated = true; @@ -5694,7 +5694,8 @@ impl Timeline { self.cancel.clone(), ctx, ) - .await?; + .await + .map_err(CreateImageLayersError::Other)?; fail_point!("image-layer-writer-fail-before-finish", |_| { Err(CreateImageLayersError::Other(anyhow::anyhow!( @@ -5789,7 +5790,10 @@ impl Timeline { } } - let image_layers = batch_image_writer.finish(self, ctx).await?; + let image_layers = batch_image_writer + .finish(self, ctx) + .await + .map_err(CreateImageLayersError::Other)?; let mut guard = self.layers.write(LayerManagerLockHolder::Compaction).await; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 1b8e5f4b9c..02bc4f6bdf 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -3531,10 +3531,7 @@ impl Timeline { self.get_compaction_target_size(), &self.gate, self.cancel.clone(), - ) - .await - .context("failed to create delta layer writer") - .map_err(CompactionError::Other)?; + ); #[derive(Default)] struct RewritingLayers { @@ -4330,7 +4327,8 @@ impl TimelineAdaptor { self.timeline.cancel.clone(), ctx, ) - .await?; + .await + .map_err(CreateImageLayersError::Other)?; fail_point!("image-layer-writer-fail-before-finish", |_| { Err(CreateImageLayersError::Other(anyhow::anyhow!( @@ -4339,7 +4337,10 @@ impl TimelineAdaptor { }); let keyspace = KeySpace { - ranges: self.get_keyspace(key_range, lsn, ctx).await?, + ranges: self + .get_keyspace(key_range, lsn, ctx) + .await + .map_err(CreateImageLayersError::Other)?, }; // TODO set proper (stateful) start. The create_image_layer_for_rel_blocks function mostly let outcome = self @@ -4358,9 +4359,13 @@ impl TimelineAdaptor { unfinished_image_layer, } = outcome { - let (desc, path) = unfinished_image_layer.finish(ctx).await?; + let (desc, path) = unfinished_image_layer + .finish(ctx) + .await + .map_err(CreateImageLayersError::Other)?; let image_layer = - Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?; + Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path) + .map_err(CreateImageLayersError::Other)?; self.new_images.push(image_layer); } From a384d7d501077e6f3cbe32f6ee69454f51f4b2e5 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 30 Jun 2025 14:36:45 +0200 Subject: [PATCH 08/19] pageserver: assert no changes to shard identity (#12379) ## Problem Location config changes can currently result in changes to the shard identity. Such changes will cause data corruption, as seen with #12217. Resolves #12227. Requires #12377. ## Summary of changes Assert that the shard identity does not change on location config updates and on (re)attach. This is currently asserted with `critical!`, in case it misfires in production. Later, we should reject such requests with an error and turn this into a proper assertion. 
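For intuition on why such a change is corrupting, here is a minimal, self-contained sketch; the function and the constants are illustrative assumptions, not the pageserver's actual key-to-shard mapping. The owning shard of a page is a pure function of the shard parameters, so altering the stripe size or shard count silently re-routes existing keys to shards that never ingested them.

```rust
// Illustrative only: a made-up stand-in for the real key-to-shard routing.
// Pages are grouped into stripes of `stripe_size` consecutive pages, and
// stripes are assigned to shards round-robin.
fn owning_shard(page_hash: u32, shard_count: u8, stripe_size: u32) -> u8 {
    ((page_hash / stripe_size) % shard_count as u32) as u8
}

fn main() {
    let page = 12_345u32;
    // Same page, same shard count, different stripe size => different owner,
    // i.e. reads would be served by a shard that does not hold the data.
    assert_eq!(owning_shard(page, 4, 32_768), 0);
    assert_eq!(owning_shard(page, 4, 1), 1);
}
```

This is why the checks below compare the incoming location config's `ShardIdentity` against the one the tenant already has and flag any mismatch via `critical!`, rather than persisting it.
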
--- Cargo.lock | 1 + libs/pageserver_api/Cargo.toml | 5 +++-- libs/pageserver_api/src/shard.rs | 12 ++++++++++++ pageserver/src/http/routes.rs | 8 ++++++++ pageserver/src/tenant.rs | 4 ++++ pageserver/src/tenant/config.rs | 11 +++++++++++ pageserver/src/tenant/mgr.rs | 20 +++++++++++++++++--- pageserver/src/tenant/secondary.rs | 2 +- 8 files changed, 57 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 71e78243a6..9ef0a0ae0a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4421,6 +4421,7 @@ dependencies = [ "strum", "strum_macros", "thiserror 1.0.69", + "tracing", "tracing-utils", "utils", ] diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 6dc17b670b..7accbdabca 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -30,12 +30,13 @@ humantime-serde.workspace = true chrono = { workspace = true, features = ["serde"] } itertools.workspace = true storage_broker.workspace = true -camino = {workspace = true, features = ["serde1"]} +camino = { workspace = true, features = ["serde1"] } remote_storage.workspace = true postgres_backend.workspace = true -nix = {workspace = true, optional = true} +nix = { workspace = true, optional = true } reqwest.workspace = true rand.workspace = true +tracing.workspace = true tracing-utils.workspace = true once_cell.workspace = true diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index a9fe3dac43..5a13aace64 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -37,6 +37,7 @@ use std::hash::{Hash, Hasher}; pub use ::utils::shard::*; use postgres_ffi_types::forknum::INIT_FORKNUM; use serde::{Deserialize, Serialize}; +use utils::critical; use crate::key::Key; use crate::models::ShardParameters; @@ -188,6 +189,17 @@ impl ShardIdentity { } } + /// Asserts that the given shard identities are equal. Changes to shard parameters will likely + /// result in data corruption. + pub fn assert_equal(&self, other: ShardIdentity) { + if self != &other { + // TODO: for now, we're conservative and just log errors in production. Turn this into a + // real assertion when we're confident it doesn't misfire, and also reject requests that + // attempt to change it with an error response. 
+ critical!("shard identity mismatch: {self:?} != {other:?}"); + } + } + fn is_broken(&self) -> bool { self.layout == LAYOUT_BROKEN } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index f770e420f0..119275f885 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1896,6 +1896,10 @@ async fn update_tenant_config_handler( ShardParameters::from(tenant.get_shard_identity()), ); + tenant + .get_shard_identity() + .assert_equal(location_conf.shard); // not strictly necessary since we construct it above + crate::tenant::TenantShard::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf) .await .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; @@ -1940,6 +1944,10 @@ async fn patch_tenant_config_handler( ShardParameters::from(tenant.get_shard_identity()), ); + tenant + .get_shard_identity() + .assert_equal(location_conf.shard); // not strictly necessary since we construct it above + crate::tenant::TenantShard::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf) .await .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 79bea4eb77..fcb18e8553 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4529,6 +4529,10 @@ impl TenantShard { Ok(toml_edit::de::from_str::(&config)?) } + /// Stores a tenant location config to disk. + /// + /// NB: make sure to call `ShardIdentity::assert_equal` before persisting a new config, to avoid + /// changes to shard parameters that may result in data corruption. #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))] pub(super) async fn persist_tenant_config( conf: &'static PageServerConf, diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 46cc669400..67df767abd 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -12,6 +12,7 @@ use pageserver_api::models; use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}; use serde::{Deserialize, Serialize}; +use utils::critical; use utils::generation::Generation; #[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)] @@ -171,6 +172,16 @@ impl LocationConf { } } + // This should never happen. + // TODO: turn this into a proper assertion. + if stripe_size != self.shard.stripe_size { + critical!( + "stripe size mismatch: {} != {}", + self.shard.stripe_size, + stripe_size, + ); + } + self.shard.stripe_size = stripe_size; } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 248d92622e..95f5c60170 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -880,6 +880,9 @@ impl TenantManager { // phase of writing config and/or waiting for flush, before returning. 
match fast_path_taken { Some(FastPathModified::Attached(tenant)) => { + tenant + .shard_identity + .assert_equal(new_location_config.shard); TenantShard::persist_tenant_config( self.conf, &tenant_shard_id, @@ -914,7 +917,10 @@ impl TenantManager { return Ok(Some(tenant)); } - Some(FastPathModified::Secondary(_secondary_tenant)) => { + Some(FastPathModified::Secondary(secondary_tenant)) => { + secondary_tenant + .shard_identity + .assert_equal(new_location_config.shard); TenantShard::persist_tenant_config( self.conf, &tenant_shard_id, @@ -948,6 +954,10 @@ impl TenantManager { match slot_guard.get_old_value() { Some(TenantSlot::Attached(tenant)) => { + tenant + .shard_identity + .assert_equal(new_location_config.shard); + // The case where we keep a Tenant alive was covered above in the special case // for Attached->Attached transitions in the same generation. By this point, // if we see an attached tenant we know it will be discarded and should be @@ -981,9 +991,13 @@ impl TenantManager { // rather than assuming it to be empty. spawn_mode = SpawnMode::Eager; } - Some(TenantSlot::Secondary(state)) => { + Some(TenantSlot::Secondary(secondary_tenant)) => { + secondary_tenant + .shard_identity + .assert_equal(new_location_config.shard); + info!("Shutting down secondary tenant"); - state.shutdown().await; + secondary_tenant.shutdown().await; } Some(TenantSlot::InProgress(_)) => { // This should never happen: acquire_slot should error out diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 2fa0ed9be9..e06788543a 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -101,7 +101,7 @@ pub(crate) struct SecondaryTenant { // Secondary mode does not need the full shard identity or the pageserver_api::models::TenantConfig. However, // storing these enables us to report our full LocationConf, enabling convenient reconciliation // by the control plane (see [`Self::get_location_conf`]) - shard_identity: ShardIdentity, + pub(crate) shard_identity: ShardIdentity, tenant_conf: std::sync::Mutex, // Internal state used by the Downloader. From d0a4ae3e8f40c76eb475978aaba4d374525d5ee8 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 30 Jun 2025 14:44:17 +0200 Subject: [PATCH 09/19] pageserver: add gRPC LSN lease support (#12384) ## Problem The gRPC API does not provide LSN leases. ## Summary of changes * Add LSN lease support to the gRPC API. * Use gRPC LSN leases for static computes with `grpc://` connstrings. * Move `PageserverProtocol` into the `compute_api::spec` module and reuse it. 
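As a small usage sketch of the relocated helper (connstrings are placeholders, error handling elided): the scheme of each shard's connstring selects the lease path, with `postgresql://`, `postgres://` or a scheme-less string taking the existing libpq `lease lsn` command and `grpc://` taking the new `LeaseLsn` RPC.

```rust
use compute_api::spec::PageserverProtocol;

fn main() -> anyhow::Result<()> {
    // grpc:// connstrings go through page_api::Client::lease_lsn().
    assert!(matches!(
        PageserverProtocol::from_connstring("grpc://pageserver-0:51051")?,
        PageserverProtocol::Grpc
    ));
    // postgresql:// (and scheme-less) connstrings keep the libpq path.
    assert!(matches!(
        PageserverProtocol::from_connstring("postgresql://no_user@pageserver-0:6400")?,
        PageserverProtocol::Libpq
    ));
    Ok(())
}
```

Note that a lease only protects an LSN if it is held on every shard, so the caller acquires one per connstring and keeps the minimum `valid_until` across shards; a `FailedPrecondition` response means the LSN has already been garbage collected and no lease is granted.
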
--- Cargo.lock | 13 +- Cargo.toml | 1 + compute_tools/src/compute.rs | 19 +-- compute_tools/src/lsn_lease.rs | 164 +++++++++++-------- control_plane/src/bin/neon_local.rs | 4 +- control_plane/src/endpoint.rs | 27 +-- libs/compute_api/Cargo.toml | 1 + libs/compute_api/src/spec.rs | 44 +++++ pageserver/page_api/Cargo.toml | 1 + pageserver/page_api/proto/page_service.proto | 20 +++ pageserver/page_api/src/client.rs | 13 ++ pageserver/page_api/src/model.rs | 52 ++++++ pageserver/src/page_service.rs | 31 ++++ storage_controller/Cargo.toml | 1 + storage_controller/src/compute_hook.rs | 3 +- 15 files changed, 281 insertions(+), 113 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9ef0a0ae0a..e640e62909 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1279,6 +1279,7 @@ dependencies = [ "remote_storage", "serde", "serde_json", + "url", "utils", ] @@ -4480,6 +4481,7 @@ dependencies = [ "pageserver_api", "postgres_ffi_types", "prost 0.13.5", + "prost-types 0.13.5", "strum", "strum_macros", "thiserror 1.0.69", @@ -5157,7 +5159,7 @@ dependencies = [ "petgraph", "prettyplease", "prost 0.13.5", - "prost-types 0.13.3", + "prost-types 0.13.5", "regex", "syn 2.0.100", "tempfile", @@ -5200,9 +5202,9 @@ dependencies = [ [[package]] name = "prost-types" -version = "0.13.3" +version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4759aa0d3a6232fb8dbdb97b61de2c20047c68aca932c7ed76da9d788508d670" +checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16" dependencies = [ "prost 0.13.5", ] @@ -6809,6 +6811,7 @@ dependencies = [ "chrono", "clap", "clashmap", + "compute_api", "control_plane", "cron", "diesel", @@ -7642,7 +7645,7 @@ dependencies = [ "prettyplease", "proc-macro2", "prost-build 0.13.3", - "prost-types 0.13.3", + "prost-types 0.13.5", "quote", "syn 2.0.100", ] @@ -7654,7 +7657,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f9687bd5bfeafebdded2356950f278bba8226f0b32109537c4253406e09aafe1" dependencies = [ "prost 0.13.5", - "prost-types 0.13.3", + "prost-types 0.13.5", "tokio", "tokio-stream", "tonic 0.13.1", diff --git a/Cargo.toml b/Cargo.toml index aeb7976b6c..7728f6d8fe 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -152,6 +152,7 @@ pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointe procfs = "0.16" prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency prost = "0.13.5" +prost-types = "0.13.5" rand = "0.8" redis = { version = "0.29.2", features = ["tokio-rustls-comp", "keep-alive"] } regex = "1.10.2" diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 7566626d57..381f2d45ba 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1,4 +1,4 @@ -use anyhow::{Context, Result, anyhow}; +use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; use compute_api::privilege::Privilege; use compute_api::responses::{ @@ -6,7 +6,7 @@ use compute_api::responses::{ LfcPrewarmState, TlsConfig, }; use compute_api::spec::{ - ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent, + ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PageserverProtocol, PgIdent, }; use futures::StreamExt; use futures::future::join_all; @@ -1003,19 +1003,12 @@ impl ComputeNode { fn try_get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> { let spec = compute_state.pspec.as_ref().expect("spec must be set"); - // Detect the protocol scheme. 
If the URL doesn't have a scheme, assume libpq. let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap(); - let scheme = match Url::parse(shard0_connstr) { - Ok(url) => url.scheme().to_lowercase().to_string(), - Err(url::ParseError::RelativeUrlWithoutBase) => "postgresql".to_string(), - Err(err) => return Err(anyhow!("invalid connstring URL: {err}")), - }; - let started = Instant::now(); - let (connected, size) = match scheme.as_str() { - "postgresql" | "postgres" => self.try_get_basebackup_libpq(spec, lsn)?, - "grpc" => self.try_get_basebackup_grpc(spec, lsn)?, - scheme => return Err(anyhow!("unknown URL scheme {scheme}")), + + let (connected, size) = match PageserverProtocol::from_connstring(shard0_connstr)? { + PageserverProtocol::Libpq => self.try_get_basebackup_libpq(spec, lsn)?, + PageserverProtocol::Grpc => self.try_get_basebackup_grpc(spec, lsn)?, }; let mut state = self.state.lock().unwrap(); diff --git a/compute_tools/src/lsn_lease.rs b/compute_tools/src/lsn_lease.rs index 930dbc52b4..3346c18c0d 100644 --- a/compute_tools/src/lsn_lease.rs +++ b/compute_tools/src/lsn_lease.rs @@ -4,7 +4,9 @@ use std::thread; use std::time::{Duration, SystemTime}; use anyhow::{Result, bail}; -use compute_api::spec::ComputeMode; +use compute_api::spec::{ComputeMode, PageserverProtocol}; +use itertools::Itertools as _; +use pageserver_page_api as page_api; use postgres::{NoTls, SimpleQueryMessage}; use tracing::{info, warn}; use utils::id::{TenantId, TimelineId}; @@ -76,25 +78,17 @@ fn acquire_lsn_lease_with_retry( loop { // Note: List of pageservers is dynamic, need to re-read configs before each attempt. - let configs = { + let (connstrings, auth) = { let state = compute.state.lock().unwrap(); - let spec = state.pspec.as_ref().expect("spec must be set"); - - let conn_strings = spec.pageserver_connstr.split(','); - - conn_strings - .map(|connstr| { - let mut config = postgres::Config::from_str(connstr).expect("Invalid connstr"); - if let Some(storage_auth_token) = &spec.storage_auth_token { - config.password(storage_auth_token.clone()); - } - config - }) - .collect::>() + ( + spec.pageserver_connstr.clone(), + spec.storage_auth_token.clone(), + ) }; - let result = try_acquire_lsn_lease(tenant_id, timeline_id, lsn, &configs); + let result = + try_acquire_lsn_lease(&connstrings, auth.as_deref(), tenant_id, timeline_id, lsn); match result { Ok(Some(res)) => { return Ok(res); @@ -116,68 +110,104 @@ fn acquire_lsn_lease_with_retry( } } -/// Tries to acquire an LSN lease through PS page_service API. +/// Tries to acquire LSN leases on all Pageserver shards. 
fn try_acquire_lsn_lease( + connstrings: &str, + auth: Option<&str>, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn, - configs: &[postgres::Config], ) -> Result> { - fn get_valid_until( - config: &postgres::Config, - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, - lsn: Lsn, - ) -> Result> { - let mut client = config.connect(NoTls)?; - let cmd = format!("lease lsn {tenant_shard_id} {timeline_id} {lsn} "); - let res = client.simple_query(&cmd)?; - let msg = match res.first() { - Some(msg) => msg, - None => bail!("empty response"), - }; - let row = match msg { - SimpleQueryMessage::Row(row) => row, - _ => bail!("error parsing lsn lease response"), + let connstrings = connstrings.split(',').collect_vec(); + let shard_count = connstrings.len(); + let mut leases = Vec::new(); + + for (shard_number, &connstring) in connstrings.iter().enumerate() { + let tenant_shard_id = match shard_count { + 0 | 1 => TenantShardId::unsharded(tenant_id), + shard_count => TenantShardId { + tenant_id, + shard_number: ShardNumber(shard_number as u8), + shard_count: ShardCount::new(shard_count as u8), + }, }; - // Note: this will be None if a lease is explicitly not granted. - let valid_until_str = row.get("valid_until"); - - let valid_until = valid_until_str.map(|s| { - SystemTime::UNIX_EPOCH - .checked_add(Duration::from_millis(u128::from_str(s).unwrap() as u64)) - .expect("Time larger than max SystemTime could handle") - }); - Ok(valid_until) + let lease = match PageserverProtocol::from_connstring(connstring)? { + PageserverProtocol::Libpq => { + acquire_lsn_lease_libpq(connstring, auth, tenant_shard_id, timeline_id, lsn)? + } + PageserverProtocol::Grpc => { + acquire_lsn_lease_grpc(connstring, auth, tenant_shard_id, timeline_id, lsn)? + } + }; + leases.push(lease); } - let shard_count = configs.len(); + Ok(leases.into_iter().min().flatten()) +} - let valid_until = if shard_count > 1 { - configs - .iter() - .enumerate() - .map(|(shard_number, config)| { - let tenant_shard_id = TenantShardId { - tenant_id, - shard_count: ShardCount::new(shard_count as u8), - shard_number: ShardNumber(shard_number as u8), - }; - get_valid_until(config, tenant_shard_id, timeline_id, lsn) - }) - .collect::>>>()? - .into_iter() - .min() - .unwrap() - } else { - get_valid_until( - &configs[0], - TenantShardId::unsharded(tenant_id), - timeline_id, - lsn, - )? +/// Acquires an LSN lease on a single shard, using the libpq API. The connstring must use a +/// postgresql:// scheme. +fn acquire_lsn_lease_libpq( + connstring: &str, + auth: Option<&str>, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + lsn: Lsn, +) -> Result> { + let mut config = postgres::Config::from_str(connstring)?; + if let Some(auth) = auth { + config.password(auth); + } + let mut client = config.connect(NoTls)?; + let cmd = format!("lease lsn {tenant_shard_id} {timeline_id} {lsn} "); + let res = client.simple_query(&cmd)?; + let msg = match res.first() { + Some(msg) => msg, + None => bail!("empty response"), + }; + let row = match msg { + SimpleQueryMessage::Row(row) => row, + _ => bail!("error parsing lsn lease response"), }; + // Note: this will be None if a lease is explicitly not granted. + let valid_until_str = row.get("valid_until"); + + let valid_until = valid_until_str.map(|s| { + SystemTime::UNIX_EPOCH + .checked_add(Duration::from_millis(u128::from_str(s).unwrap() as u64)) + .expect("Time larger than max SystemTime could handle") + }); Ok(valid_until) } + +/// Acquires an LSN lease on a single shard, using the gRPC API. 
The connstring must use a +/// grpc:// scheme. +fn acquire_lsn_lease_grpc( + connstring: &str, + auth: Option<&str>, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + lsn: Lsn, +) -> Result> { + tokio::runtime::Handle::current().block_on(async move { + let mut client = page_api::Client::new( + connstring.to_string(), + tenant_shard_id.tenant_id, + timeline_id, + tenant_shard_id.to_index(), + auth.map(String::from), + None, + ) + .await?; + + let req = page_api::LeaseLsnRequest { lsn }; + match client.lease_lsn(req).await { + Ok(expires) => Ok(Some(expires)), + // Lease couldn't be acquired because the LSN has been garbage collected. + Err(err) if err.code() == tonic::Code::FailedPrecondition => Ok(None), + Err(err) => Err(err.into()), + } + }) +} diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index c818d07fef..de98d46a55 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -16,9 +16,9 @@ use std::time::Duration; use anyhow::{Context, Result, anyhow, bail}; use clap::Parser; use compute_api::requests::ComputeClaimsScope; -use compute_api::spec::ComputeMode; +use compute_api::spec::{ComputeMode, PageserverProtocol}; use control_plane::broker::StorageBroker; -use control_plane::endpoint::{ComputeControlPlane, EndpointTerminateMode, PageserverProtocol}; +use control_plane::endpoint::{ComputeControlPlane, EndpointTerminateMode}; use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_ADDR, EndpointStorage}; use control_plane::local_env; use control_plane::local_env::{ diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index e3faa082db..5ea55b28ef 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -56,8 +56,8 @@ use compute_api::responses::{ TlsConfig, }; use compute_api::spec::{ - Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent, - RemoteExtSpec, Role, + Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PageserverProtocol, + PgIdent, RemoteExtSpec, Role, }; use jsonwebtoken::jwk::{ AlgorithmParameters, CommonParameters, EllipticCurve, Jwk, JwkSet, KeyAlgorithm, KeyOperations, @@ -373,29 +373,6 @@ impl std::fmt::Display for EndpointTerminateMode { } } -/// Protocol used to connect to a Pageserver. -#[derive(Clone, Copy, Debug)] -pub enum PageserverProtocol { - Libpq, - Grpc, -} - -impl PageserverProtocol { - /// Returns the URL scheme for the protocol, used in connstrings. - pub fn scheme(&self) -> &'static str { - match self { - Self::Libpq => "postgresql", - Self::Grpc => "grpc", - } - } -} - -impl Display for PageserverProtocol { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.write_str(self.scheme()) - } -} - impl Endpoint { fn from_dir_entry(entry: std::fs::DirEntry, env: &LocalEnv) -> Result { if !entry.file_type()?.is_dir() { diff --git a/libs/compute_api/Cargo.toml b/libs/compute_api/Cargo.toml index 81b0cd19a1..83cb639f0a 100644 --- a/libs/compute_api/Cargo.toml +++ b/libs/compute_api/Cargo.toml @@ -12,6 +12,7 @@ jsonwebtoken.workspace = true serde.workspace = true serde_json.workspace = true regex.workspace = true +url.workspace = true utils = { path = "../utils" } remote_storage = { version = "0.1", path = "../remote_storage/" } diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 0e23b70265..508040c5e5 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -4,11 +4,14 @@ //! 
provide it by calling the compute_ctl's `/compute_ctl` endpoint, or //! compute_ctl can fetch it by calling the control plane's API. use std::collections::HashMap; +use std::fmt::Display; +use anyhow::anyhow; use indexmap::IndexMap; use regex::Regex; use remote_storage::RemotePath; use serde::{Deserialize, Serialize}; +use url::Url; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; @@ -429,6 +432,47 @@ pub struct JwksSettings { pub jwt_audience: Option, } +/// Protocol used to connect to a Pageserver. Parsed from the connstring scheme. +#[derive(Clone, Copy, Debug, Default)] +pub enum PageserverProtocol { + /// The original protocol based on libpq and COPY. Uses postgresql:// or postgres:// scheme. + #[default] + Libpq, + /// A newer, gRPC-based protocol. Uses grpc:// scheme. + Grpc, +} + +impl PageserverProtocol { + /// Parses the protocol from a connstring scheme. Defaults to Libpq if no scheme is given. + /// Errors if the connstring is an invalid URL. + pub fn from_connstring(connstring: &str) -> anyhow::Result { + let scheme = match Url::parse(connstring) { + Ok(url) => url.scheme().to_lowercase(), + Err(url::ParseError::RelativeUrlWithoutBase) => return Ok(Self::default()), + Err(err) => return Err(anyhow!("invalid connstring URL: {err}")), + }; + match scheme.as_str() { + "postgresql" | "postgres" => Ok(Self::Libpq), + "grpc" => Ok(Self::Grpc), + scheme => Err(anyhow!("invalid protocol scheme: {scheme}")), + } + } + + /// Returns the URL scheme for the protocol, for use in connstrings. + pub fn scheme(&self) -> &'static str { + match self { + Self::Libpq => "postgresql", + Self::Grpc => "grpc", + } + } +} + +impl Display for PageserverProtocol { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(self.scheme()) + } +} + #[cfg(test)] mod tests { use std::fs::File; diff --git a/pageserver/page_api/Cargo.toml b/pageserver/page_api/Cargo.toml index 42ee9b50e9..fbad8cf9d0 100644 --- a/pageserver/page_api/Cargo.toml +++ b/pageserver/page_api/Cargo.toml @@ -11,6 +11,7 @@ futures.workspace = true pageserver_api.workspace = true postgres_ffi_types.workspace = true prost.workspace = true +prost-types.workspace = true strum.workspace = true strum_macros.workspace = true thiserror.workspace = true diff --git a/pageserver/page_api/proto/page_service.proto b/pageserver/page_api/proto/page_service.proto index d06b2cfca5..1d6c230916 100644 --- a/pageserver/page_api/proto/page_service.proto +++ b/pageserver/page_api/proto/page_service.proto @@ -35,6 +35,8 @@ syntax = "proto3"; package page_api; +import "google/protobuf/timestamp.proto"; + service PageService { // Returns whether a relation exists. rpc CheckRelExists(CheckRelExistsRequest) returns (CheckRelExistsResponse); @@ -64,6 +66,10 @@ service PageService { // Fetches an SLRU segment. rpc GetSlruSegment (GetSlruSegmentRequest) returns (GetSlruSegmentResponse); + + // Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't garbage + // collect the LSN until the lease expires. Must be acquired on all relevant shards. + rpc LeaseLsn (LeaseLsnRequest) returns (LeaseLsnResponse); } // The LSN a request should read at. @@ -252,3 +258,17 @@ message GetSlruSegmentRequest { message GetSlruSegmentResponse { bytes segment = 1; } + +// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't garbage +// collect the LSN until the lease expires. Must be acquired on all relevant shards. +message LeaseLsnRequest { + // The LSN to lease. 
Can't be 0 or below the current GC cutoff. + uint64 lsn = 1; +} + +// Lease acquisition response. If the lease could not be granted because the LSN has already been +// garbage collected, a FailedPrecondition status will be returned instead. +message LeaseLsnResponse { + // The lease expiration time. + google.protobuf.Timestamp expires = 1; +} diff --git a/pageserver/page_api/src/client.rs b/pageserver/page_api/src/client.rs index 4b456787d2..65e41540b8 100644 --- a/pageserver/page_api/src/client.rs +++ b/pageserver/page_api/src/client.rs @@ -187,4 +187,17 @@ impl Client { let response = self.client.get_slru_segment(proto_req).await?; Ok(response.into_inner().try_into()?) } + + /// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't + /// garbage collect the LSN until the lease expires. Must be acquired on all relevant shards. + /// + /// Returns the lease expiration time, or a FailedPrecondition status if the lease could not be + /// acquired because the LSN has already been garbage collected. + pub async fn lease_lsn( + &mut self, + req: model::LeaseLsnRequest, + ) -> Result { + let req = proto::LeaseLsnRequest::from(req); + Ok(self.client.lease_lsn(req).await?.into_inner().try_into()?) + } } diff --git a/pageserver/page_api/src/model.rs b/pageserver/page_api/src/model.rs index 0493f79781..4497fc6fc7 100644 --- a/pageserver/page_api/src/model.rs +++ b/pageserver/page_api/src/model.rs @@ -16,6 +16,7 @@ //! stream combinators without dealing with errors, and avoids validating the same message twice. use std::fmt::Display; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; use bytes::Bytes; use postgres_ffi_types::Oid; @@ -703,3 +704,54 @@ impl From for proto::GetSlruSegmentResponse { // SlruKind is defined in pageserver_api::reltag. pub type SlruKind = pageserver_api::reltag::SlruKind; + +/// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't garbage +/// collect the LSN until the lease expires. +pub struct LeaseLsnRequest { + /// The LSN to lease. + pub lsn: Lsn, +} + +impl TryFrom for LeaseLsnRequest { + type Error = ProtocolError; + + fn try_from(pb: proto::LeaseLsnRequest) -> Result { + if pb.lsn == 0 { + return Err(ProtocolError::Missing("lsn")); + } + Ok(Self { lsn: Lsn(pb.lsn) }) + } +} + +impl From for proto::LeaseLsnRequest { + fn from(request: LeaseLsnRequest) -> Self { + Self { lsn: request.lsn.0 } + } +} + +/// Lease expiration time. If the lease could not be granted because the LSN has already been +/// garbage collected, a FailedPrecondition status will be returned instead. 
+pub type LeaseLsnResponse = SystemTime; + +impl TryFrom for LeaseLsnResponse { + type Error = ProtocolError; + + fn try_from(pb: proto::LeaseLsnResponse) -> Result { + let expires = pb.expires.ok_or(ProtocolError::Missing("expires"))?; + UNIX_EPOCH + .checked_add(Duration::new(expires.seconds as u64, expires.nanos as u32)) + .ok_or_else(|| ProtocolError::invalid("expires", expires)) + } +} + +impl From for proto::LeaseLsnResponse { + fn from(response: LeaseLsnResponse) -> Self { + let expires = response.duration_since(UNIX_EPOCH).unwrap_or_default(); + Self { + expires: Some(prost_types::Timestamp { + seconds: expires.as_secs() as i64, + nanos: expires.subsec_nanos() as i32, + }), + } + } +} diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index c04f6e2b47..1d824ac846 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -14,6 +14,7 @@ use std::{io, str}; use anyhow::{Context as _, bail}; use bytes::{Buf as _, BufMut as _, BytesMut}; +use chrono::Utc; use futures::future::BoxFuture; use futures::{FutureExt, Stream}; use itertools::Itertools; @@ -3760,6 +3761,36 @@ impl proto::PageService for GrpcPageServiceHandler { let resp: page_api::GetSlruSegmentResponse = resp.segment; Ok(tonic::Response::new(resp.into())) } + + #[instrument(skip_all, fields(lsn))] + async fn lease_lsn( + &self, + req: tonic::Request, + ) -> Result, tonic::Status> { + let timeline = self.get_request_timeline(&req).await?; + let ctx = self.ctx.with_scope_timeline(&timeline); + + // Validate and convert the request, and decorate the span. + let req: page_api::LeaseLsnRequest = req.into_inner().try_into()?; + + span_record!(lsn=%req.lsn); + + // Attempt to acquire a lease. Return FailedPrecondition if the lease could not be granted. + let lease_length = timeline.get_lsn_lease_length(); + let expires = match timeline.renew_lsn_lease(req.lsn, lease_length, &ctx) { + Ok(lease) => lease.valid_until, + Err(err) => return Err(tonic::Status::failed_precondition(format!("{err}"))), + }; + + // TODO: is this spammy? Move it compute-side? 
+ info!( + "acquired lease for {} until {}", + req.lsn, + chrono::DateTime::::from(expires).to_rfc3339() + ); + + Ok(tonic::Response::new(expires.into())) + } } /// gRPC middleware layer that handles observability concerns: diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 3a0806b3b2..143f4241f4 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -20,6 +20,7 @@ camino.workspace = true chrono.workspace = true clap.workspace = true clashmap.workspace = true +compute_api.workspace = true cron.workspace = true fail.workspace = true futures.workspace = true diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index 0b5569b3d6..4f0837548f 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -5,7 +5,8 @@ use std::sync::Arc; use std::time::Duration; use anyhow::Context; -use control_plane::endpoint::{ComputeControlPlane, EndpointStatus, PageserverProtocol}; +use compute_api::spec::PageserverProtocol; +use control_plane::endpoint::{ComputeControlPlane, EndpointStatus}; use control_plane::local_env::LocalEnv; use futures::StreamExt; use hyper::StatusCode; From 8e216a3a59f23b45db635eb9a97a76e71c4f4e3b Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Mon, 30 Jun 2025 18:09:50 +0400 Subject: [PATCH 10/19] storcon: notify cplane on safekeeper membership change (#12390) ## Problem We don't notify cplane about safekeeper membership change yet. Without the notification the compute needs to know all the safekeepers on the cluster to be able to speak to them. Change notifications will allow to avoid it. - Closes: https://github.com/neondatabase/neon/issues/12188 ## Summary of changes - Implement `notify_safekeepers` method in `ComputeHook` - Notify cplane about safekeepers in `safekeeper_migrate` handler. - Update the test to make sure notifications work. ## Out of scope - There is `cplane_notified_generation` field in `timelines` table in strocon's database. It's not needed now, so it's not updated in the PR. Probably we can remove it. - e2e tests to make sure it works with a production cplane --- control_plane/src/bin/neon_local.rs | 4 +- control_plane/src/endpoint.rs | 39 +- storage_controller/src/compute_hook.rs | 479 +++++++++++++----- storage_controller/src/reconciler.rs | 4 +- storage_controller/src/service.rs | 8 +- .../src/service/safekeeper_service.rs | 56 +- .../regress/test_safekeeper_migration.py | 22 +- 7 files changed, 456 insertions(+), 156 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index de98d46a55..3440d8979a 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -1649,7 +1649,9 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res // If --safekeepers argument is given, use only the listed // safekeeper nodes; otherwise all from the env. 
let safekeepers = parse_safekeepers(&args.safekeepers)?; - endpoint.reconfigure(pageservers, None, safekeepers).await?; + endpoint + .reconfigure(Some(pageservers), None, safekeepers, None) + .await?; } EndpointCmd::Stop(args) => { let endpoint_id = &args.endpoint_id; diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 5ea55b28ef..e6fe7d90a2 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -974,12 +974,11 @@ impl Endpoint { pub async fn reconfigure( &self, - pageservers: Vec<(PageserverProtocol, Host, u16)>, + pageservers: Option>, stripe_size: Option, safekeepers: Option>, + safekeeper_generation: Option, ) -> Result<()> { - anyhow::ensure!(!pageservers.is_empty(), "no pageservers provided"); - let (mut spec, compute_ctl_config) = { let config_path = self.endpoint_path().join("config.json"); let file = std::fs::File::open(config_path)?; @@ -991,16 +990,24 @@ impl Endpoint { let postgresql_conf = self.read_postgresql_conf()?; spec.cluster.postgresql_conf = Some(postgresql_conf); - let pageserver_connstr = Self::build_pageserver_connstr(&pageservers); - spec.pageserver_connstring = Some(pageserver_connstr); - if stripe_size.is_some() { - spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize); + // If pageservers are not specified, don't change them. + if let Some(pageservers) = pageservers { + anyhow::ensure!(!pageservers.is_empty(), "no pageservers provided"); + + let pageserver_connstr = Self::build_pageserver_connstr(&pageservers); + spec.pageserver_connstring = Some(pageserver_connstr); + if stripe_size.is_some() { + spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize); + } } // If safekeepers are not specified, don't change them. if let Some(safekeepers) = safekeepers { let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?; spec.safekeeper_connstrings = safekeeper_connstrings; + if let Some(g) = safekeeper_generation { + spec.safekeepers_generation = Some(g.into_inner()); + } } let client = reqwest::Client::builder() @@ -1038,6 +1045,24 @@ impl Endpoint { } } + pub async fn reconfigure_pageservers( + &self, + pageservers: Vec<(PageserverProtocol, Host, u16)>, + stripe_size: Option, + ) -> Result<()> { + self.reconfigure(Some(pageservers), stripe_size, None, None) + .await + } + + pub async fn reconfigure_safekeepers( + &self, + safekeepers: Vec, + generation: SafekeeperGeneration, + ) -> Result<()> { + self.reconfigure(None, None, Some(safekeepers), Some(generation)) + .await + } + pub async fn stop( &self, mode: EndpointTerminateMode, diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index 4f0837548f..ab37a207e4 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -14,11 +14,12 @@ use pageserver_api::config::DEFAULT_GRPC_LISTEN_PORT; use pageserver_api::controller_api::AvailabilityZone; use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId}; use postgres_connection::parse_host_port; +use safekeeper_api::membership::SafekeeperGeneration; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; use tracing::{Instrument, info_span}; use utils::backoff::{self}; -use utils::id::{NodeId, TenantId}; +use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; use crate::service::Config; @@ -36,7 +37,7 @@ struct UnshardedComputeHookTenant { preferred_az: Option, // Must hold this lock to send a notification. 
- send_lock: Arc>>, + send_lock: Arc>>, } struct ShardedComputeHookTenant { stripe_size: ShardStripeSize, @@ -49,7 +50,7 @@ struct ShardedComputeHookTenant { // Must hold this lock to send a notification. The contents represent // the last successfully sent notification, and are used to coalesce multiple // updates by only sending when there is a chance since our last successful send. - send_lock: Arc>>, + send_lock: Arc>>, } /// Represents our knowledge of the compute's state: we can update this when we get a @@ -57,9 +58,9 @@ struct ShardedComputeHookTenant { /// /// Should be wrapped in an Option<>, as we cannot always know the remote state. #[derive(PartialEq, Eq, Debug)] -struct ComputeRemoteState { +struct ComputeRemoteState { // The request body which was acked by the compute - request: ComputeHookNotifyRequest, + request: R, // Whether the cplane indicated that the state was applied to running computes, or just // persisted. In the Neon control plane, this is the difference between a 423 response (meaning @@ -67,6 +68,36 @@ struct ComputeRemoteState { applied: bool, } +type ComputeRemoteTenantState = ComputeRemoteState; +type ComputeRemoteTimelineState = ComputeRemoteState; + +/// The trait which define the handler-specific types and methods. +/// We have two implementations of this trait so far: +/// - [`ComputeHookTenant`] for tenant attach notifications ("/notify-attach") +/// - [`ComputeHookTimeline`] for safekeeper change notifications ("/notify-safekeepers") +trait ApiMethod { + /// Type of the key which identifies the resource. + /// It's either TenantId for tenant attach notifications, + /// or TenantTimelineId for safekeeper change notifications. + type Key: std::cmp::Eq + std::hash::Hash + Clone; + + type Request: serde::Serialize + std::fmt::Debug; + + const API_PATH: &'static str; + + fn maybe_send( + &self, + key: Self::Key, + lock: Option>>>, + ) -> MaybeSendResult; + + async fn notify_local( + env: &LocalEnv, + cplane: &ComputeControlPlane, + req: &Self::Request, + ) -> Result<(), NotifyError>; +} + enum ComputeHookTenant { Unsharded(UnshardedComputeHookTenant), Sharded(ShardedComputeHookTenant), @@ -97,7 +128,7 @@ impl ComputeHookTenant { } } - fn get_send_lock(&self) -> &Arc>> { + fn get_send_lock(&self) -> &Arc>> { match self { Self::Unsharded(unsharded_tenant) => &unsharded_tenant.send_lock, Self::Sharded(sharded_tenant) => &sharded_tenant.send_lock, @@ -191,19 +222,136 @@ impl ComputeHookTenant { } } +/// The state of a timeline we need to notify the compute about. +struct ComputeHookTimeline { + generation: SafekeeperGeneration, + safekeepers: Vec, + + send_lock: Arc>>, +} + +impl ComputeHookTimeline { + /// Construct a new ComputeHookTimeline with the given safekeepers and generation. + fn new(generation: SafekeeperGeneration, safekeepers: Vec) -> Self { + Self { + generation, + safekeepers, + send_lock: Arc::default(), + } + } + + /// Update the state with a new SafekeepersUpdate. + /// Noop if the update generation is not greater than the current generation. 
+ fn update(&mut self, sk_update: SafekeepersUpdate) { + if sk_update.generation > self.generation { + self.generation = sk_update.generation; + self.safekeepers = sk_update.safekeepers; + } + } +} + +impl ApiMethod for ComputeHookTimeline { + type Key = TenantTimelineId; + type Request = NotifySafekeepersRequest; + + const API_PATH: &'static str = "notify-safekeepers"; + + fn maybe_send( + &self, + ttid: TenantTimelineId, + lock: Option>>, + ) -> MaybeSendNotifySafekeepersResult { + let locked = match lock { + Some(already_locked) => already_locked, + None => { + // Lock order: this _must_ be only a try_lock, because we are called inside of the [`ComputeHook::timelines`] lock. + let Ok(locked) = self.send_lock.clone().try_lock_owned() else { + return MaybeSendResult::AwaitLock((ttid, self.send_lock.clone())); + }; + locked + } + }; + + if locked + .as_ref() + .is_some_and(|s| s.request.generation >= self.generation) + { + return MaybeSendResult::Noop; + } + + MaybeSendResult::Transmit(( + NotifySafekeepersRequest { + tenant_id: ttid.tenant_id, + timeline_id: ttid.timeline_id, + generation: self.generation, + safekeepers: self.safekeepers.clone(), + }, + locked, + )) + } + + async fn notify_local( + _env: &LocalEnv, + cplane: &ComputeControlPlane, + req: &NotifySafekeepersRequest, + ) -> Result<(), NotifyError> { + let NotifySafekeepersRequest { + tenant_id, + timeline_id, + generation, + safekeepers, + } = req; + + for (endpoint_name, endpoint) in &cplane.endpoints { + if endpoint.tenant_id == *tenant_id + && endpoint.timeline_id == *timeline_id + && endpoint.status() == EndpointStatus::Running + { + tracing::info!("Reconfiguring safekeepers for endpoint {endpoint_name}"); + + let safekeepers = safekeepers.iter().map(|sk| sk.id).collect::>(); + + endpoint + .reconfigure_safekeepers(safekeepers, *generation) + .await + .map_err(NotifyError::NeonLocal)?; + } + } + + Ok(()) + } +} + #[derive(Serialize, Deserialize, Debug, Eq, PartialEq)] -struct ComputeHookNotifyRequestShard { +struct NotifyAttachRequestShard { node_id: NodeId, shard_number: ShardNumber, } /// Request body that we send to the control plane to notify it of where a tenant is attached #[derive(Serialize, Deserialize, Debug, Eq, PartialEq)] -struct ComputeHookNotifyRequest { +struct NotifyAttachRequest { tenant_id: TenantId, preferred_az: Option, stripe_size: Option, - shards: Vec, + shards: Vec, +} + +#[derive(Serialize, Deserialize, Debug, Eq, PartialEq, Clone)] +pub(crate) struct SafekeeperInfo { + pub id: NodeId, + /// Hostname of the safekeeper. + /// It exists for better debuggability. Might be missing. + /// Should not be used for anything else. + pub hostname: Option, +} + +#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)] +struct NotifySafekeepersRequest { + tenant_id: TenantId, + timeline_id: TimelineId, + generation: SafekeeperGeneration, + safekeepers: Vec, } /// Error type for attempts to call into the control plane compute notification hook @@ -235,42 +383,50 @@ pub(crate) enum NotifyError { NeonLocal(anyhow::Error), } -enum MaybeSendResult { +enum MaybeSendResult { // Please send this request while holding the lock, and if you succeed then write // the request into the lock. 
Transmit( ( - ComputeHookNotifyRequest, - tokio::sync::OwnedMutexGuard>, + R, + tokio::sync::OwnedMutexGuard>>, ), ), // Something requires sending, but you must wait for a current sender then call again - AwaitLock(Arc>>), + AwaitLock((K, Arc>>>)), // Nothing requires sending Noop, } -impl ComputeHookTenant { +type MaybeSendNotifyAttachResult = MaybeSendResult; +type MaybeSendNotifySafekeepersResult = MaybeSendResult; + +impl ApiMethod for ComputeHookTenant { + type Key = TenantId; + type Request = NotifyAttachRequest; + + const API_PATH: &'static str = "notify-attach"; + fn maybe_send( &self, tenant_id: TenantId, - lock: Option>>, - ) -> MaybeSendResult { + lock: Option>>, + ) -> MaybeSendNotifyAttachResult { let locked = match lock { Some(already_locked) => already_locked, None => { - // Lock order: this _must_ be only a try_lock, because we are called inside of the [`ComputeHook::state`] lock. + // Lock order: this _must_ be only a try_lock, because we are called inside of the [`ComputeHook::tenants`] lock. let Ok(locked) = self.get_send_lock().clone().try_lock_owned() else { - return MaybeSendResult::AwaitLock(self.get_send_lock().clone()); + return MaybeSendResult::AwaitLock((tenant_id, self.get_send_lock().clone())); }; locked } }; let request = match self { - Self::Unsharded(unsharded_tenant) => Some(ComputeHookNotifyRequest { + Self::Unsharded(unsharded_tenant) => Some(NotifyAttachRequest { tenant_id, - shards: vec![ComputeHookNotifyRequestShard { + shards: vec![NotifyAttachRequestShard { shard_number: ShardNumber(0), node_id: unsharded_tenant.node_id, }], @@ -283,12 +439,12 @@ impl ComputeHookTenant { Self::Sharded(sharded_tenant) if sharded_tenant.shards.len() == sharded_tenant.shard_count.count() as usize => { - Some(ComputeHookNotifyRequest { + Some(NotifyAttachRequest { tenant_id, shards: sharded_tenant .shards .iter() - .map(|(shard_number, node_id)| ComputeHookNotifyRequestShard { + .map(|(shard_number, node_id)| NotifyAttachRequestShard { shard_number: *shard_number, node_id: *node_id, }) @@ -333,98 +489,22 @@ impl ComputeHookTenant { } } } -} -/// The compute hook is a destination for notifications about changes to tenant:pageserver -/// mapping. It aggregates updates for the shards in a tenant, and when appropriate reconfigures -/// the compute connection string. -pub(super) struct ComputeHook { - config: Config, - state: std::sync::Mutex>, - authorization_header: Option, - - // Concurrency limiter, so that we do not overload the cloud control plane when updating - // large numbers of tenants (e.g. when failing over after a node failure) - api_concurrency: tokio::sync::Semaphore, - - // This lock is only used in testing enviroments, to serialize calls into neon_lock - neon_local_lock: tokio::sync::Mutex<()>, - - // We share a client across all notifications to enable connection re-use etc when - // sending large numbers of notifications - client: reqwest::Client, -} - -/// Callers may give us a list of these when asking us to send a bulk batch -/// of notifications in the background. This is a 'notification' in the sense of -/// other code notifying us of a shard's status, rather than being the final notification -/// that we send upwards to the control plane for the whole tenant. 
-pub(crate) struct ShardUpdate<'a> { - pub(crate) tenant_shard_id: TenantShardId, - pub(crate) node_id: NodeId, - pub(crate) stripe_size: ShardStripeSize, - pub(crate) preferred_az: Option>, -} - -impl ComputeHook { - pub(super) fn new(config: Config) -> anyhow::Result { - let authorization_header = config - .control_plane_jwt_token - .clone() - .map(|jwt| format!("Bearer {jwt}")); - - let mut client = reqwest::ClientBuilder::new().timeout(NOTIFY_REQUEST_TIMEOUT); - for cert in &config.ssl_ca_certs { - client = client.add_root_certificate(cert.clone()); - } - let client = client - .build() - .context("Failed to build http client for compute hook")?; - - Ok(Self { - state: Default::default(), - config, - authorization_header, - neon_local_lock: Default::default(), - api_concurrency: tokio::sync::Semaphore::new(API_CONCURRENCY), - client, - }) - } - - /// For test environments: use neon_local's LocalEnv to update compute - async fn do_notify_local( - &self, - reconfigure_request: &ComputeHookNotifyRequest, + async fn notify_local( + env: &LocalEnv, + cplane: &ComputeControlPlane, + req: &NotifyAttachRequest, ) -> Result<(), NotifyError> { - // neon_local updates are not safe to call concurrently, use a lock to serialize - // all calls to this function - let _locked = self.neon_local_lock.lock().await; - - let Some(repo_dir) = self.config.neon_local_repo_dir.as_deref() else { - tracing::warn!( - "neon_local_repo_dir not set, likely a bug in neon_local; skipping compute update" - ); - return Ok(()); - }; - let env = match LocalEnv::load_config(repo_dir) { - Ok(e) => e, - Err(e) => { - tracing::warn!("Couldn't load neon_local config, skipping compute update ({e})"); - return Ok(()); - } - }; - let cplane = - ComputeControlPlane::load(env.clone()).expect("Error loading compute control plane"); - let ComputeHookNotifyRequest { + let NotifyAttachRequest { tenant_id, shards, stripe_size, preferred_az: _preferred_az, - } = reconfigure_request; + } = req; for (endpoint_name, endpoint) in &cplane.endpoints { if endpoint.tenant_id == *tenant_id && endpoint.status() == EndpointStatus::Running { - tracing::info!("Reconfiguring endpoint {endpoint_name}"); + tracing::info!("Reconfiguring pageservers for endpoint {endpoint_name}"); let pageservers = shards .iter() @@ -446,7 +526,7 @@ impl ComputeHook { .collect::>(); endpoint - .reconfigure(pageservers, *stripe_size, None) + .reconfigure_pageservers(pageservers, *stripe_size) .await .map_err(NotifyError::NeonLocal)?; } @@ -454,11 +534,102 @@ impl ComputeHook { Ok(()) } +} - async fn do_notify_iteration( +/// The compute hook is a destination for notifications about changes to tenant:pageserver +/// mapping. It aggregates updates for the shards in a tenant, and when appropriate reconfigures +/// the compute connection string. +pub(super) struct ComputeHook { + config: Config, + tenants: std::sync::Mutex>, + timelines: std::sync::Mutex>, + authorization_header: Option, + + // Concurrency limiter, so that we do not overload the cloud control plane when updating + // large numbers of tenants (e.g. 
when failing over after a node failure) + api_concurrency: tokio::sync::Semaphore, + + // This lock is only used in testing enviroments, to serialize calls into neon_local + neon_local_lock: tokio::sync::Mutex<()>, + + // We share a client across all notifications to enable connection re-use etc when + // sending large numbers of notifications + client: reqwest::Client, +} + +/// Callers may give us a list of these when asking us to send a bulk batch +/// of notifications in the background. This is a 'notification' in the sense of +/// other code notifying us of a shard's status, rather than being the final notification +/// that we send upwards to the control plane for the whole tenant. +pub(crate) struct ShardUpdate<'a> { + pub(crate) tenant_shard_id: TenantShardId, + pub(crate) node_id: NodeId, + pub(crate) stripe_size: ShardStripeSize, + pub(crate) preferred_az: Option>, +} + +pub(crate) struct SafekeepersUpdate { + pub(crate) tenant_id: TenantId, + pub(crate) timeline_id: TimelineId, + pub(crate) generation: SafekeeperGeneration, + pub(crate) safekeepers: Vec, +} + +impl ComputeHook { + pub(super) fn new(config: Config) -> anyhow::Result { + let authorization_header = config + .control_plane_jwt_token + .clone() + .map(|jwt| format!("Bearer {jwt}")); + + let mut client = reqwest::ClientBuilder::new().timeout(NOTIFY_REQUEST_TIMEOUT); + for cert in &config.ssl_ca_certs { + client = client.add_root_certificate(cert.clone()); + } + let client = client + .build() + .context("Failed to build http client for compute hook")?; + + Ok(Self { + tenants: Default::default(), + timelines: Default::default(), + config, + authorization_header, + neon_local_lock: Default::default(), + api_concurrency: tokio::sync::Semaphore::new(API_CONCURRENCY), + client, + }) + } + + /// For test environments: use neon_local's LocalEnv to update compute + async fn do_notify_local(&self, req: &M::Request) -> Result<(), NotifyError> { + // neon_local updates are not safe to call concurrently, use a lock to serialize + // all calls to this function + let _locked = self.neon_local_lock.lock().await; + + let Some(repo_dir) = self.config.neon_local_repo_dir.as_deref() else { + tracing::warn!( + "neon_local_repo_dir not set, likely a bug in neon_local; skipping compute update" + ); + return Ok(()); + }; + let env = match LocalEnv::load_config(repo_dir) { + Ok(e) => e, + Err(e) => { + tracing::warn!("Couldn't load neon_local config, skipping compute update ({e})"); + return Ok(()); + } + }; + let cplane = + ComputeControlPlane::load(env.clone()).expect("Error loading compute control plane"); + + M::notify_local(&env, &cplane, req).await + } + + async fn do_notify_iteration( &self, url: &String, - reconfigure_request: &ComputeHookNotifyRequest, + reconfigure_request: &Req, cancel: &CancellationToken, ) -> Result<(), NotifyError> { let req = self.client.request(reqwest::Method::PUT, url); @@ -480,9 +651,7 @@ impl ComputeHook { }; // Treat all 2xx responses as success - if response.status() >= reqwest::StatusCode::OK - && response.status() < reqwest::StatusCode::MULTIPLE_CHOICES - { + if response.status().is_success() { if response.status() != reqwest::StatusCode::OK { // Non-200 2xx response: it doesn't make sense to retry, but this is unexpected, so // log a warning. 
@@ -533,10 +702,10 @@ impl ComputeHook { } } - async fn do_notify( + async fn do_notify( &self, url: &String, - reconfigure_request: &ComputeHookNotifyRequest, + reconfigure_request: &R, cancel: &CancellationToken, ) -> Result<(), NotifyError> { // We hold these semaphore units across all retries, rather than only across each @@ -568,13 +737,13 @@ impl ComputeHook { } /// Synchronous phase: update the per-tenant state for the next intended notification - fn notify_prepare(&self, shard_update: ShardUpdate) -> MaybeSendResult { - let mut state_locked = self.state.lock().unwrap(); + fn notify_attach_prepare(&self, shard_update: ShardUpdate) -> MaybeSendNotifyAttachResult { + let mut tenants_locked = self.tenants.lock().unwrap(); use std::collections::hash_map::Entry; let tenant_shard_id = shard_update.tenant_shard_id; - let tenant = match state_locked.entry(tenant_shard_id.tenant_id) { + let tenant = match tenants_locked.entry(tenant_shard_id.tenant_id) { Entry::Vacant(e) => { let ShardUpdate { tenant_shard_id, @@ -598,10 +767,37 @@ impl ComputeHook { tenant.maybe_send(tenant_shard_id.tenant_id, None) } - async fn notify_execute( + fn notify_safekeepers_prepare( &self, - maybe_send_result: MaybeSendResult, - tenant_shard_id: TenantShardId, + safekeepers_update: SafekeepersUpdate, + ) -> MaybeSendNotifySafekeepersResult { + let mut timelines_locked = self.timelines.lock().unwrap(); + + let ttid = TenantTimelineId { + tenant_id: safekeepers_update.tenant_id, + timeline_id: safekeepers_update.timeline_id, + }; + + use std::collections::hash_map::Entry; + let timeline = match timelines_locked.entry(ttid) { + Entry::Vacant(e) => e.insert(ComputeHookTimeline::new( + safekeepers_update.generation, + safekeepers_update.safekeepers, + )), + Entry::Occupied(e) => { + let timeline = e.into_mut(); + timeline.update(safekeepers_update); + timeline + } + }; + + timeline.maybe_send(ttid, None) + } + + async fn notify_execute( + &self, + state: &std::sync::Mutex>, + maybe_send_result: MaybeSendResult, cancel: &CancellationToken, ) -> Result<(), NotifyError> { // Process result: we may get an update to send, or we may have to wait for a lock @@ -610,7 +806,7 @@ impl ComputeHook { MaybeSendResult::Noop => { return Ok(()); } - MaybeSendResult::AwaitLock(send_lock) => { + MaybeSendResult::AwaitLock((key, send_lock)) => { let send_locked = tokio::select! { guard = send_lock.lock_owned() => {guard}, _ = cancel.cancelled() => { @@ -621,11 +817,11 @@ impl ComputeHook { // Lock order: maybe_send is called within the `[Self::state]` lock, and takes the send lock, but here // we have acquired the send lock and take `[Self::state]` lock. This is safe because maybe_send only uses // try_lock. 
- let state_locked = self.state.lock().unwrap(); - let Some(tenant) = state_locked.get(&tenant_shard_id.tenant_id) else { + let state_locked = state.lock().unwrap(); + let Some(resource_state) = state_locked.get(&key) else { return Ok(()); }; - match tenant.maybe_send(tenant_shard_id.tenant_id, Some(send_locked)) { + match resource_state.maybe_send(key, Some(send_locked)) { MaybeSendResult::AwaitLock(_) => { unreachable!("We supplied lock guard") } @@ -644,14 +840,18 @@ impl ComputeHook { .control_plane_url .as_ref() .map(|control_plane_url| { - format!("{}/notify-attach", control_plane_url.trim_end_matches('/')) + format!( + "{}/{}", + control_plane_url.trim_end_matches('/'), + M::API_PATH + ) }); // We validate this at startup let notify_url = compute_hook_url.as_ref().unwrap(); self.do_notify(notify_url, &request, cancel).await } else { - self.do_notify_local(&request).await.map_err(|e| { + self.do_notify_local::(&request).await.map_err(|e| { // This path is for testing only, so munge the error into our prod-style error type. tracing::error!("neon_local notification hook failed: {e}"); NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR) @@ -687,7 +887,7 @@ impl ComputeHook { /// Infallible synchronous fire-and-forget version of notify(), that sends its results to /// a channel. Something should consume the channel and arrange to try notifying again /// if something failed. - pub(super) fn notify_background( + pub(super) fn notify_attach_background( self: &Arc, notifications: Vec, result_tx: tokio::sync::mpsc::Sender>, @@ -696,7 +896,7 @@ impl ComputeHook { let mut maybe_sends = Vec::new(); for shard_update in notifications { let tenant_shard_id = shard_update.tenant_shard_id; - let maybe_send_result = self.notify_prepare(shard_update); + let maybe_send_result = self.notify_attach_prepare(shard_update); maybe_sends.push((tenant_shard_id, maybe_send_result)) } @@ -715,10 +915,10 @@ impl ComputeHook { async move { this - .notify_execute(maybe_send_result, tenant_shard_id, &cancel) + .notify_execute(&this.tenants, maybe_send_result, &cancel) .await.map_err(|e| (tenant_shard_id, e)) }.instrument(info_span!( - "notify_background", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug() + "notify_attach_background", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug() )) }) .buffered(API_CONCURRENCY); @@ -761,14 +961,23 @@ impl ComputeHook { /// ensuring that they eventually call again to ensure that the compute is eventually notified of /// the proper pageserver nodes for a tenant. 
#[tracing::instrument(skip_all, fields(tenant_id=%shard_update.tenant_shard_id.tenant_id, shard_id=%shard_update.tenant_shard_id.shard_slug(), node_id))] - pub(super) async fn notify<'a>( + pub(super) async fn notify_attach<'a>( &self, shard_update: ShardUpdate<'a>, cancel: &CancellationToken, ) -> Result<(), NotifyError> { - let tenant_shard_id = shard_update.tenant_shard_id; - let maybe_send_result = self.notify_prepare(shard_update); - self.notify_execute(maybe_send_result, tenant_shard_id, cancel) + let maybe_send_result = self.notify_attach_prepare(shard_update); + self.notify_execute(&self.tenants, maybe_send_result, cancel) + .await + } + + pub(super) async fn notify_safekeepers( + &self, + safekeepers_update: SafekeepersUpdate, + cancel: &CancellationToken, + ) -> Result<(), NotifyError> { + let maybe_send_result = self.notify_safekeepers_prepare(safekeepers_update); + self.notify_execute(&self.timelines, maybe_send_result, cancel) .await } @@ -784,8 +993,8 @@ impl ComputeHook { ) { use std::collections::hash_map::Entry; - let mut state_locked = self.state.lock().unwrap(); - match state_locked.entry(tenant_shard_id.tenant_id) { + let mut tenants_locked = self.tenants.lock().unwrap(); + match tenants_locked.entry(tenant_shard_id.tenant_id) { Entry::Vacant(_) => { // This is a valid but niche case, where the tenant was previously attached // as a Secondary location and then detached, so has no previously notified diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 92844c9c7b..a2fba0fa56 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -65,7 +65,7 @@ pub(super) struct Reconciler { pub(crate) compute_hook: Arc, /// To avoid stalling if the cloud control plane is unavailable, we may proceed - /// past failures in [`ComputeHook::notify`], but we _must_ remember that we failed + /// past failures in [`ComputeHook::notify_attach`], but we _must_ remember that we failed /// so that we can set [`crate::tenant_shard::TenantShard::pending_compute_notification`] to ensure a later retry. pub(crate) compute_notify_failure: bool, @@ -1023,7 +1023,7 @@ impl Reconciler { if let Some(node) = &self.intent.attached { let result = self .compute_hook - .notify( + .notify_attach( compute_hook::ShardUpdate { tenant_shard_id: self.tenant_shard_id, node_id: node.get_id(), diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index e0b13c4e63..e4c494db8f 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -878,18 +878,18 @@ impl Service { // Emit compute hook notifications for all tenants which are already stably attached. Other tenants // will emit compute hook notifications when they reconcile. // - // Ordering: our calls to notify_background synchronously establish a relative order for these notifications vs. any later + // Ordering: our calls to notify_attach_background synchronously establish a relative order for these notifications vs. any later // calls into the ComputeHook for the same tenant: we can leave these to run to completion in the background and any later // calls will be correctly ordered wrt these. 
// - // Concurrency: we call notify_background for all tenants, which will create O(N) tokio tasks, but almost all of them + // Concurrency: we call notify_attach_background for all tenants, which will create O(N) tokio tasks, but almost all of them // will just wait on the ComputeHook::API_CONCURRENCY semaphore immediately, so very cheap until they get that semaphore // unit and start doing I/O. tracing::info!( "Sending {} compute notifications", compute_notifications.len() ); - self.compute_hook.notify_background( + self.compute_hook.notify_attach_background( compute_notifications, bg_compute_notify_result_tx.clone(), &self.cancel, @@ -6281,7 +6281,7 @@ impl Service { for (child_id, child_ps, stripe_size) in child_locations { if let Err(e) = self .compute_hook - .notify( + .notify_attach( compute_hook::ShardUpdate { tenant_shard_id: child_id, node_id: child_ps, diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index fc33a24198..cf48b007b2 100644 --- a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -4,6 +4,7 @@ use std::sync::Arc; use std::time::Duration; use super::safekeeper_reconciler::ScheduleRequest; +use crate::compute_hook; use crate::heartbeater::SafekeeperState; use crate::id_lock_map::trace_shared_lock; use crate::metrics; @@ -1198,7 +1199,11 @@ impl Service { // 4. Call PUT configuration on safekeepers from the current set, // delivering them joint_conf. - // TODO(diko): need to notify cplane with an updated set of safekeepers. + // Notify cplane/compute about the membership change BEFORE changing the membership on safekeepers. + // This way the compute will know about new safekeepers from joint_config before we require to + // collect a quorum from them. + self.cplane_notify_safekeepers(tenant_id, timeline_id, &joint_config) + .await?; let results = self .tenant_timeline_set_membership_quorum( @@ -1305,8 +1310,55 @@ impl Service { ) .await?; - // TODO(diko): need to notify cplane with an updated set of safekeepers. + // Notify cplane/compute about the membership change AFTER changing the membership on safekeepers. + // This way the compute will stop talking to excluded safekeepers only after we stop requiring to + // collect a quorum from them. + self.cplane_notify_safekeepers(tenant_id, timeline_id, &new_conf) + .await?; Ok(()) } + + /// Notify cplane about safekeeper membership change. + /// The cplane will receive a joint set of safekeepers as a safekeeper list. 
+ async fn cplane_notify_safekeepers( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + mconf: &membership::Configuration, + ) -> Result<(), ApiError> { + let mut safekeepers = Vec::new(); + let mut ids: HashSet<_> = HashSet::new(); + + for member in mconf + .members + .m + .iter() + .chain(mconf.new_members.iter().flat_map(|m| m.m.iter())) + { + if ids.insert(member.id) { + safekeepers.push(compute_hook::SafekeeperInfo { + id: member.id, + hostname: Some(member.host.clone()), + }); + } + } + + self.compute_hook + .notify_safekeepers( + compute_hook::SafekeepersUpdate { + tenant_id, + timeline_id, + generation: mconf.generation, + safekeepers, + }, + &self.cancel, + ) + .await + .map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!( + "failed to notify cplane about safekeeper membership change: {err}" + )) + }) + } } diff --git a/test_runner/regress/test_safekeeper_migration.py b/test_runner/regress/test_safekeeper_migration.py index f67b6afc95..057371175c 100644 --- a/test_runner/regress/test_safekeeper_migration.py +++ b/test_runner/regress/test_safekeeper_migration.py @@ -32,10 +32,13 @@ def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder): ) ep = env.endpoints.create("main", tenant_id=env.initial_tenant) - # We specify all safekeepers, so compute will connect to all of them. - # Only those from the current membership configuration will be used. - # TODO(diko): set only current safekeepers when cplane notify is implemented. - ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3]) + + mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline) + assert mconf["new_sk_set"] is None + assert len(mconf["sk_set"]) == 1 + assert mconf["generation"] == 1 + + ep.start(safekeeper_generation=1, safekeepers=mconf["sk_set"]) ep.safe_psql("CREATE EXTENSION neon_test_utils;") ep.safe_psql("CREATE TABLE t(a int)") @@ -58,7 +61,16 @@ def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder): assert ep.safe_psql("SELECT * FROM t") == [(i,) for i in range(1, 4)] + # 1 initial generation + 2 migrations on each loop iteration. + expected_gen = 1 + 2 * 3 + + mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline) + assert mconf["generation"] == expected_gen + + assert ep.safe_psql("SHOW neon.safekeepers")[0][0].startswith(f"g#{expected_gen}:") + + # Restart and check again to make sure data is persistent. ep.stop() - ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3]) + ep.start(safekeeper_generation=1, safekeepers=[3]) assert ep.safe_psql("SELECT * FROM t") == [(i,) for i in range(1, 4)] From 2e681e0ef8dce0ed5b4943361d6e2c2f63b13928 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 30 Jun 2025 23:36:15 +0200 Subject: [PATCH 11/19] detach_ancestor: delete the right layer when hardlink fails (#12397) If a hardlink operation inside `detach_ancestor` fails due to the layer already existing, we delete the layer to make sure the source is one we know about, and then retry. But we deleted the wrong file, namely, the one we wanted to use as the source of the hardlink. As a result, the follow up hard link operation failed. Our PR corrects this mistake. 
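
For illustration, here is a minimal sketch of the intended retry path (names adapted from `remote_copy`, error handling simplified): when the hard link fails because the target already exists, it is the destination `adoptee_path` that must be removed before retrying, never the `adopted_path` source.

```rust
use std::io::ErrorKind;
use std::path::Path;

fn hard_link_with_retry(adopted_path: &Path, adoptee_path: &Path) -> std::io::Result<()> {
    match std::fs::hard_link(adopted_path, adoptee_path) {
        Ok(()) => Ok(()),
        Err(e) if e.kind() == ErrorKind::AlreadyExists => {
            // Delete the orphan file occupying the destination, so that the layer
            // we end up with is hard-linked from a well-understood source ...
            std::fs::remove_file(adoptee_path)?;
            // ... then retry the link from the same, untouched source.
            std::fs::hard_link(adopted_path, adoptee_path)
        }
        Err(e) => Err(e),
    }
}
```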
--- pageserver/src/tenant/timeline/detach_ancestor.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index f47ce5408b..f20a1343df 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -885,7 +885,7 @@ async fn remote_copy( } tracing::info!("Deleting orphan layer file to make way for hard linking"); // Delete orphan layer file and try again, to ensure this layer has a well understood source - std::fs::remove_file(adopted_path) + std::fs::remove_file(&adoptee_path) .map_err(|e| Error::launder(e.into(), Error::Prepare))?; std::fs::hard_link(adopted_path, &adoptee_path) .map_err(|e| Error::launder(e.into(), Error::Prepare))?; From 5f3532970e769ea57ffb3fa8ee538b41ebea9b39 Mon Sep 17 00:00:00 2001 From: Suhas Thalanki <54014218+thesuhas@users.noreply.github.com> Date: Mon, 30 Jun 2025 18:12:37 -0400 Subject: [PATCH 12/19] [compute] fix: background worker that collects installed extension metrics now updates collection interval (#12277) ## Problem Previously, the background worker that collects the list of installed extensions across DBs had a timeout set to 1 hour. This cause a problem with computes that had a `suspend_timeout` > 1 hour as this collection was treated as activity, preventing compute shutdown. Issue: https://github.com/neondatabase/cloud/issues/30147 ## Summary of changes Passing the `suspend_timeout` as part of the `ComputeSpec` so that any updates to this are taken into account by the background worker and updates its collection interval. --- compute_tools/src/bin/compute_ctl.rs | 6 +- compute_tools/src/compute.rs | 72 +++++++++++++++++-- compute_tools/tests/cluster_spec.json | 3 +- control_plane/src/endpoint.rs | 1 + .../var/db/postgres/configs/config.json | 1 + libs/compute_api/src/spec.rs | 5 ++ libs/compute_api/tests/cluster_spec.json | 1 + 7 files changed, 82 insertions(+), 7 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index d7ff381f1b..db7746b8eb 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -36,6 +36,8 @@ use std::ffi::OsString; use std::fs::File; use std::process::exit; +use std::sync::Arc; +use std::sync::atomic::AtomicU64; use std::sync::mpsc; use std::thread; use std::time::Duration; @@ -190,7 +192,9 @@ fn main() -> Result<()> { cgroup: cli.cgroup, #[cfg(target_os = "linux")] vm_monitor_addr: cli.vm_monitor_addr, - installed_extensions_collection_interval: cli.installed_extensions_collection_interval, + installed_extensions_collection_interval: Arc::new(AtomicU64::new( + cli.installed_extensions_collection_interval, + )), }, config, )?; diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 381f2d45ba..9dcd4fc17c 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -25,7 +25,7 @@ use std::os::unix::fs::{PermissionsExt, symlink}; use std::path::Path; use std::process::{Command, Stdio}; use std::str::FromStr; -use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; use std::sync::{Arc, Condvar, Mutex, RwLock}; use std::time::{Duration, Instant}; use std::{env, fs}; @@ -70,6 +70,7 @@ pub static BUILD_TAG: Lazy = Lazy::new(|| { .unwrap_or(BUILD_TAG_DEFAULT) .to_string() }); +const DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL: u64 = 3600; /// Static configuration params that don't 
change after startup. These mostly /// come from the CLI args, or are derived from them. @@ -103,7 +104,7 @@ pub struct ComputeNodeParams { pub remote_ext_base_url: Option, /// Interval for installed extensions collection - pub installed_extensions_collection_interval: u64, + pub installed_extensions_collection_interval: Arc, } /// Compute node info shared across several `compute_ctl` threads. @@ -126,6 +127,9 @@ pub struct ComputeNode { // key: ext_archive_name, value: started download time, download_completed? pub ext_download_progress: RwLock, bool)>>, pub compute_ctl_config: ComputeCtlConfig, + + /// Handle to the extension stats collection task + extension_stats_task: Mutex>>, } // store some metrics about download size that might impact startup time @@ -428,6 +432,7 @@ impl ComputeNode { state_changed: Condvar::new(), ext_download_progress: RwLock::new(HashMap::new()), compute_ctl_config: config.compute_ctl_config, + extension_stats_task: Mutex::new(None), }) } @@ -515,6 +520,9 @@ impl ComputeNode { None }; + // Terminate the extension stats collection task + this.terminate_extension_stats_task(); + // Terminate the vm_monitor so it releases the file watcher on // /sys/fs/cgroup/neon-postgres. // Note: the vm-monitor only runs on linux because it requires cgroups. @@ -1671,6 +1679,8 @@ impl ComputeNode { tls_config = self.compute_ctl_config.tls.clone(); } + self.update_installed_extensions_collection_interval(&spec); + let max_concurrent_connections = self.max_service_connections(compute_state, &spec); // Merge-apply spec & changes to PostgreSQL state. @@ -1735,6 +1745,8 @@ impl ComputeNode { let tls_config = self.tls_config(&spec); + self.update_installed_extensions_collection_interval(&spec); + if let Some(ref pgbouncer_settings) = spec.pgbouncer_settings { info!("tuning pgbouncer"); @@ -2339,10 +2351,20 @@ LIMIT 100", } pub fn spawn_extension_stats_task(&self) { + // Cancel any existing task + if let Some(handle) = self.extension_stats_task.lock().unwrap().take() { + handle.abort(); + } + let conf = self.tokio_conn_conf.clone(); - let installed_extensions_collection_interval = - self.params.installed_extensions_collection_interval; - tokio::spawn(async move { + let atomic_interval = self.params.installed_extensions_collection_interval.clone(); + let mut installed_extensions_collection_interval = + 2 * atomic_interval.load(std::sync::atomic::Ordering::SeqCst); + info!( + "[NEON_EXT_SPAWN] Spawning background installed extensions worker with Timeout: {}", + installed_extensions_collection_interval + ); + let handle = tokio::spawn(async move { // An initial sleep is added to ensure that two collections don't happen at the same time. // The first collection happens during compute startup. 
tokio::time::sleep(tokio::time::Duration::from_secs( @@ -2355,8 +2377,48 @@ LIMIT 100", loop { interval.tick().await; let _ = installed_extensions(conf.clone()).await; + // Acquire a read lock on the compute spec and then update the interval if necessary + interval = tokio::time::interval(tokio::time::Duration::from_secs(std::cmp::max( + installed_extensions_collection_interval, + 2 * atomic_interval.load(std::sync::atomic::Ordering::SeqCst), + ))); + installed_extensions_collection_interval = interval.period().as_secs(); } }); + + // Store the new task handle + *self.extension_stats_task.lock().unwrap() = Some(handle); + } + + fn terminate_extension_stats_task(&self) { + if let Some(handle) = self.extension_stats_task.lock().unwrap().take() { + handle.abort(); + } + } + + fn update_installed_extensions_collection_interval(&self, spec: &ComputeSpec) { + // Update the interval for collecting installed extensions statistics + // If the value is -1, we never suspend so set the value to default collection. + // If the value is 0, it means default, we will just continue to use the default. + if spec.suspend_timeout_seconds == -1 || spec.suspend_timeout_seconds == 0 { + info!( + "[NEON_EXT_INT_UPD] Spec Timeout: {}, New Timeout: {}", + spec.suspend_timeout_seconds, DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL + ); + self.params.installed_extensions_collection_interval.store( + DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL, + std::sync::atomic::Ordering::SeqCst, + ); + } else { + info!( + "[NEON_EXT_INT_UPD] Spec Timeout: {}", + spec.suspend_timeout_seconds + ); + self.params.installed_extensions_collection_interval.store( + spec.suspend_timeout_seconds as u64, + std::sync::atomic::Ordering::SeqCst, + ); + } } } diff --git a/compute_tools/tests/cluster_spec.json b/compute_tools/tests/cluster_spec.json index 5655a94de4..439d80c057 100644 --- a/compute_tools/tests/cluster_spec.json +++ b/compute_tools/tests/cluster_spec.json @@ -3,7 +3,8 @@ "timestamp": "2021-05-23T18:25:43.511Z", "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8b", - + "suspend_timeout_seconds": 3600, + "cluster": { "cluster_id": "test-cluster-42", "name": "Zenith Test", diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index e6fe7d90a2..424101b9a4 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -780,6 +780,7 @@ impl Endpoint { endpoint_storage_addr: Some(endpoint_storage_addr), endpoint_storage_token: Some(endpoint_storage_token), autoprewarm: false, + suspend_timeout_seconds: -1, // Only used in neon_local. }; // this strange code is needed to support respec() in tests diff --git a/docker-compose/compute_wrapper/var/db/postgres/configs/config.json b/docker-compose/compute_wrapper/var/db/postgres/configs/config.json index 21caf3800c..60e232425b 100644 --- a/docker-compose/compute_wrapper/var/db/postgres/configs/config.json +++ b/docker-compose/compute_wrapper/var/db/postgres/configs/config.json @@ -4,6 +4,7 @@ "timestamp": "2022-10-12T18:00:00.000Z", "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c", + "suspend_timeout_seconds": -1, "cluster": { "cluster_id": "docker_compose", diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 508040c5e5..6b2caa9d3a 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -184,6 +184,11 @@ pub struct ComputeSpec { /// Download LFC state from endpoint_storage and pass it to Postgres on startup #[serde(default)] pub autoprewarm: bool, + + /// Suspend timeout in seconds. 
+ /// + /// We use this value to derive other values, such as the installed extensions metric. + pub suspend_timeout_seconds: i64, } /// Feature flag to signal `compute_ctl` to enable certain experimental functionality. diff --git a/libs/compute_api/tests/cluster_spec.json b/libs/compute_api/tests/cluster_spec.json index 2dd2aae015..94d7f1e081 100644 --- a/libs/compute_api/tests/cluster_spec.json +++ b/libs/compute_api/tests/cluster_spec.json @@ -3,6 +3,7 @@ "timestamp": "2021-05-23T18:25:43.511Z", "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8b", + "suspend_timeout_seconds": 3600, "cluster": { "cluster_id": "test-cluster-42", From daa402f35ae8230efb854d732c5f96be1605fb62 Mon Sep 17 00:00:00 2001 From: Trung Dinh Date: Tue, 1 Jul 2025 02:53:11 -0700 Subject: [PATCH 13/19] pageserver: Make ImageLayerWriter sync, infallible and lazy (#12403) ## Problem ## Summary of changes Make ImageLayerWriter sync, infallible and lazy. Address https://github.com/neondatabase/neon/issues/12389. All unit tests passed. --- .../storage_layer/batch_split_writer.rs | 72 +++++++++---------- pageserver/src/tenant/timeline/compaction.rs | 26 +++---- 2 files changed, 46 insertions(+), 52 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/batch_split_writer.rs b/pageserver/src/tenant/storage_layer/batch_split_writer.rs index 9c2a177f3a..7f0ca5b337 100644 --- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs +++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs @@ -182,7 +182,7 @@ impl BatchLayerWriter { /// An image writer that takes images and produces multiple image layers. #[must_use] pub struct SplitImageLayerWriter<'a> { - inner: ImageLayerWriter, + inner: Option, target_layer_size: u64, lsn: Lsn, conf: &'static PageServerConf, @@ -196,7 +196,7 @@ pub struct SplitImageLayerWriter<'a> { impl<'a> SplitImageLayerWriter<'a> { #[allow(clippy::too_many_arguments)] - pub async fn new( + pub fn new( conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, @@ -205,22 +205,10 @@ impl<'a> SplitImageLayerWriter<'a> { target_layer_size: u64, gate: &'a utils::sync::gate::Gate, cancel: CancellationToken, - ctx: &RequestContext, - ) -> anyhow::Result { - Ok(Self { + ) -> Self { + Self { target_layer_size, - // XXX make this lazy like in SplitDeltaLayerWriter? - inner: ImageLayerWriter::new( - conf, - timeline_id, - tenant_shard_id, - &(start_key..Key::MAX), - lsn, - gate, - cancel.clone(), - ctx, - ) - .await?, + inner: None, conf, timeline_id, tenant_shard_id, @@ -229,7 +217,7 @@ impl<'a> SplitImageLayerWriter<'a> { start_key, gate, cancel, - }) + } } pub async fn put_image( @@ -238,12 +226,31 @@ impl<'a> SplitImageLayerWriter<'a> { img: Bytes, ctx: &RequestContext, ) -> Result<(), PutError> { + if self.inner.is_none() { + self.inner = Some( + ImageLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + &(self.start_key..Key::MAX), + self.lsn, + self.gate, + self.cancel.clone(), + ctx, + ) + .await + .map_err(PutError::Other)?, + ); + } + + let inner = self.inner.as_mut().unwrap(); + // The current estimation is an upper bound of the space that the key/image could take // because we did not consider compression in this estimation. The resulting image layer // could be smaller than the target size. 
let addition_size_estimation = KEY_SIZE as u64 + img.len() as u64; - if self.inner.num_keys() >= 1 - && self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size + if inner.num_keys() >= 1 + && inner.estimated_size() + addition_size_estimation >= self.target_layer_size { let next_image_writer = ImageLayerWriter::new( self.conf, @@ -257,7 +264,7 @@ impl<'a> SplitImageLayerWriter<'a> { ) .await .map_err(PutError::Other)?; - let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer); + let prev_image_writer = std::mem::replace(inner, next_image_writer); self.batches.add_unfinished_image_writer( prev_image_writer, self.start_key..key, @@ -265,7 +272,7 @@ impl<'a> SplitImageLayerWriter<'a> { ); self.start_key = key; } - self.inner.put_image(key, img, ctx).await + inner.put_image(key, img, ctx).await } pub(crate) async fn finish_with_discard_fn( @@ -282,8 +289,10 @@ impl<'a> SplitImageLayerWriter<'a> { let Self { mut batches, inner, .. } = self; - if inner.num_keys() != 0 { - batches.add_unfinished_image_writer(inner, self.start_key..end_key, self.lsn); + if let Some(inner) = inner { + if inner.num_keys() != 0 { + batches.add_unfinished_image_writer(inner, self.start_key..end_key, self.lsn); + } } batches.finish_with_discard_fn(tline, ctx, discard_fn).await } @@ -498,10 +507,7 @@ mod tests { 4 * 1024 * 1024, &tline.gate, tline.cancel.clone(), - &ctx, - ) - .await - .unwrap(); + ); let mut delta_writer = SplitDeltaLayerWriter::new( tenant.conf, @@ -577,10 +583,7 @@ mod tests { 4 * 1024 * 1024, &tline.gate, tline.cancel.clone(), - &ctx, - ) - .await - .unwrap(); + ); let mut delta_writer = SplitDeltaLayerWriter::new( tenant.conf, tline.timeline_id, @@ -676,10 +679,7 @@ mod tests { 4 * 1024, &tline.gate, tline.cancel.clone(), - &ctx, - ) - .await - .unwrap(); + ); let mut delta_writer = SplitDeltaLayerWriter::new( tenant.conf, diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 02bc4f6bdf..04852fb721 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -3503,22 +3503,16 @@ impl Timeline { // Only create image layers when there is no ancestor branches. TODO: create covering image layer // when some condition meet. let mut image_layer_writer = if !has_data_below { - Some( - SplitImageLayerWriter::new( - self.conf, - self.timeline_id, - self.tenant_shard_id, - job_desc.compaction_key_range.start, - lowest_retain_lsn, - self.get_compaction_target_size(), - &self.gate, - self.cancel.clone(), - ctx, - ) - .await - .context("failed to create image layer writer") - .map_err(CompactionError::Other)?, - ) + Some(SplitImageLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + job_desc.compaction_key_range.start, + lowest_retain_lsn, + self.get_compaction_target_size(), + &self.gate, + self.cancel.clone(), + )) } else { None }; From d2d9946bab867e111434861e2c3facec0d62f417 Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Tue, 1 Jul 2025 16:47:16 +0400 Subject: [PATCH 14/19] tests: override safekeeper ports in storcon DB (#12410) ## Problem We persist safekeeper host/port in the storcon DB after https://github.com/neondatabase/neon/pull/11712, so the storcon fails to ping safekeepers in the compatibility tests, where we start the cluster from the snapshot. PR also adds some small code improvements related to the test failure. 
- Closes: https://github.com/neondatabase/neon/issues/12339 ## Summary of changes - Update safekeeper ports in the storcon DB when starting the neon from the dir (snapshot) - Fail the response on all not-success codes (e.g. 3xx). Should not happen, but just to be more safe. - Add `neon_previous/` to .gitignore to make it easier to run compat tests. - Add missing EXPORT to the instruction for running compat tests --- .gitignore | 1 + safekeeper/client/src/mgmt_api.rs | 2 +- test_runner/fixtures/neon_fixtures.py | 12 +++++++++--- test_runner/regress/test_compatibility.py | 2 ++ 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 70c7e96303..6574d7b9de 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ /tmp_check_cli __pycache__/ test_output/ +neon_previous/ .vscode .idea *.swp diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs index 2e46a7b529..b4bb193a4b 100644 --- a/safekeeper/client/src/mgmt_api.rs +++ b/safekeeper/client/src/mgmt_api.rs @@ -52,7 +52,7 @@ pub trait ResponseErrorMessageExt: Sized { impl ResponseErrorMessageExt for reqwest::Response { async fn error_from_body(self) -> Result { let status = self.status(); - if !(status.is_client_error() || status.is_server_error()) { + if status.is_success() { return Ok(self); } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 508e3d8dd2..2031ec132e 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -724,15 +724,21 @@ class NeonEnvBuilder: shutil.copytree(storcon_db_from_dir, storcon_db_to_dir, ignore=ignore_postgres_log) assert not (storcon_db_to_dir / "postgres.log").exists() + # NB: neon_local rewrites postgresql.conf on each start based on neon_local config. No need to patch it. - # However, in this new NeonEnv, the pageservers listen on different ports, and the storage controller - # will currently reject re-attach requests from them because the NodeMetadata isn't identical. + # However, in this new NeonEnv, the pageservers and safekeepers listen on different ports, and the storage + # controller will currently reject re-attach requests from them because the NodeMetadata isn't identical. # So, from_repo_dir patches up the the storcon database. 
patch_script_path = self.repo_dir / "storage_controller_db.startup.sql" assert not patch_script_path.exists() patch_script = "" + for ps in self.env.pageservers: - patch_script += f"UPDATE nodes SET listen_http_port={ps.service_port.http}, listen_pg_port={ps.service_port.pg} WHERE node_id = '{ps.id}';" + patch_script += f"UPDATE nodes SET listen_http_port={ps.service_port.http}, listen_pg_port={ps.service_port.pg} WHERE node_id = '{ps.id}';\n" + + for sk in self.env.safekeepers: + patch_script += f"UPDATE safekeepers SET http_port={sk.port.http}, port={sk.port.pg} WHERE id = '{sk.id}';\n" + patch_script_path.write_text(patch_script) # Update the config with info about tenants and timelines diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 16ab2bb359..a4d2bf8d9b 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -76,6 +76,7 @@ if TYPE_CHECKING: # export CHECK_ONDISK_DATA_COMPATIBILITY=true # export COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE} # export COMPATIBILITY_POSTGRES_DISTRIB_DIR=neon_previous/pg_install +# export COMPATIBILITY_SNAPSHOT_DIR=test_output/compatibility_snapshot_pgv${DEFAULT_PG_VERSION} # # # Build previous version of binaries and store them somewhere: # rm -rf pg_install target @@ -102,6 +103,7 @@ if TYPE_CHECKING: # export CHECK_ONDISK_DATA_COMPATIBILITY=true # export COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE} # export COMPATIBILITY_POSTGRES_DISTRIB_DIR=neon_previous/pg_install +# export COMPATIBILITY_SNAPSHOT_DIR=test_output/compatibility_snapshot_pgv${DEFAULT_PG_VERSION} # export NEON_BIN=target/${BUILD_TYPE} # export POSTGRES_DISTRIB_DIR=pg_install # From 6d73cfa6085b321348a1defeb1f29c76238f3ca6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lassi=20P=C3=B6l=C3=B6nen?= Date: Tue, 1 Jul 2025 15:53:46 +0300 Subject: [PATCH 15/19] Support audit syslog over TLS (#12124) Add support to transport syslogs over TLS. Since TLS params essentially require passing host and port separately, add a boolean flag to the configuration template and also use the same `action` format for plaintext logs. This allows seamless transition. The plaintext host:port is picked from `AUDIT_LOGGING_ENDPOINT` (as earlier) and from `AUDIT_LOGGING_TLS_ENDPOINT`. The TLS host:port is used when defined and non-empty. `remote_endpoint` is split separately to hostname and port as required by `omfwd` module. Also the address parsing and config content generation are split to more testable functions with basic tests added. 
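
As a rough illustration of the selection rule (the patch's real `parse_audit_syslog_address` goes through `url::Url` and validates hostnames; this hypothetical helper only shows the precedence and the host/port split, ignoring IPv6 brackets):

```rust
/// A non-empty TLS endpoint wins over the plaintext one; the chosen host:port
/// string is split because rsyslog's omfwd module needs host and port separately.
fn pick_syslog_target(plain: &str, tls: &str) -> Option<(String, u16, bool)> {
    let (endpoint, use_tls) = if !tls.is_empty() {
        (tls, true)
    } else if !plain.is_empty() {
        (plain, false)
    } else {
        // Neither AUDIT_LOGGING_ENDPOINT nor AUDIT_LOGGING_TLS_ENDPOINT is set.
        return None;
    };
    let (host, port) = endpoint.rsplit_once(':')?;
    Some((host.to_string(), port.parse().ok()?, use_tls))
}
```

For example, `pick_syslog_target("plaintext.host.tld:5555", "tls.host.tld:5556")` yields `("tls.host.tld", 5556, true)`, matching the precedence exercised in the new unit tests.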
--- Cargo.lock | 7 + compute/compute-node.Dockerfile | 2 +- compute_tools/Cargo.toml | 1 + compute_tools/src/compute.rs | 14 +- .../compute_audit_rsyslog_template.conf | 36 ++- compute_tools/src/rsyslog.rs | 224 +++++++++++++++++- 6 files changed, 270 insertions(+), 14 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e640e62909..4c9cfa97e1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1305,6 +1305,7 @@ dependencies = [ "fail", "flate2", "futures", + "hostname-validator", "http 1.1.0", "indexmap 2.9.0", "itertools 0.10.5", @@ -2771,6 +2772,12 @@ dependencies = [ "windows", ] +[[package]] +name = "hostname-validator" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f558a64ac9af88b5ba400d99b579451af0d39c6d360980045b91aac966d705e2" + [[package]] name = "http" version = "0.2.9" diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index bce2a28b8b..111e64d5d1 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1983,7 +1983,7 @@ RUN apt update && \ locales \ lsof \ procps \ - rsyslog \ + rsyslog-gnutls \ screen \ tcpdump \ $VERSION_INSTALLS && \ diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 0a071c1ad1..1a03022d89 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -27,6 +27,7 @@ fail.workspace = true flate2.workspace = true futures.workspace = true http.workspace = true +hostname-validator = "1.1" indexmap.workspace = true itertools.workspace = true jsonwebtoken.workspace = true diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 9dcd4fc17c..fae76579d8 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -759,10 +759,15 @@ impl ComputeNode { // Configure and start rsyslog for compliance audit logging match pspec.spec.audit_log_level { ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => { - let remote_endpoint = + let remote_tls_endpoint = + std::env::var("AUDIT_LOGGING_TLS_ENDPOINT").unwrap_or("".to_string()); + let remote_plain_endpoint = std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string()); - if remote_endpoint.is_empty() { - anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty"); + + if remote_plain_endpoint.is_empty() && remote_tls_endpoint.is_empty() { + anyhow::bail!( + "AUDIT_LOGGING_ENDPOINT and AUDIT_LOGGING_TLS_ENDPOINT are both empty" + ); } let log_directory_path = Path::new(&self.params.pgdata).join("log"); @@ -778,7 +783,8 @@ impl ComputeNode { log_directory_path.clone(), endpoint_id, project_id, - &remote_endpoint, + &remote_plain_endpoint, + &remote_tls_endpoint, )?; // Launch a background task to clean up the audit logs diff --git a/compute_tools/src/config_template/compute_audit_rsyslog_template.conf b/compute_tools/src/config_template/compute_audit_rsyslog_template.conf index 48b1a6f5c3..f072f652cf 100644 --- a/compute_tools/src/config_template/compute_audit_rsyslog_template.conf +++ b/compute_tools/src/config_template/compute_audit_rsyslog_template.conf @@ -10,7 +10,13 @@ input(type="imfile" File="{log_directory}/*.log" startmsg.regex="^[[:digit:]]{{4}}-[[:digit:]]{{2}}-[[:digit:]]{{2}} [[:digit:]]{{2}}:[[:digit:]]{{2}}:[[:digit:]]{{2}}.[[:digit:]]{{3}} GMT,") # the directory to store rsyslog state files -global(workDirectory="/var/log/rsyslog") +global( + workDirectory="/var/log/rsyslog" + DefaultNetstreamDriverCAFile="/etc/ssl/certs/ca-certificates.crt" +) + +# Whether the remote syslog receiver uses tls +set $.remote_syslog_tls = 
"{remote_syslog_tls}"; # Construct json, endpoint_id and project_id as additional metadata set $.json_log!endpoint_id = "{endpoint_id}"; @@ -21,5 +27,29 @@ set $.json_log!msg = $msg; template(name="PgAuditLog" type="string" string="<%PRI%>1 %TIMESTAMP:::date-rfc3339% %HOSTNAME% - - - - %$.json_log%") -# Forward to remote syslog receiver (@@:;format -local5.info @@{remote_endpoint};PgAuditLog +# Forward to remote syslog receiver (over TLS) +if ( $syslogtag == 'pgaudit_log' ) then {{ + if ( $.remote_syslog_tls == 'true' ) then {{ + action(type="omfwd" target="{remote_syslog_host}" port="{remote_syslog_port}" protocol="tcp" + template="PgAuditLog" + queue.type="linkedList" + queue.size="1000" + action.ResumeRetryCount="10" + StreamDriver="gtls" + StreamDriverMode="1" + StreamDriverAuthMode="x509/name" + StreamDriverPermittedPeers="{remote_syslog_host}" + StreamDriver.CheckExtendedKeyPurpose="on" + StreamDriver.PermitExpiredCerts="off" + ) + stop + }} else {{ + action(type="omfwd" target="{remote_syslog_host}" port="{remote_syslog_port}" protocol="tcp" + template="PgAuditLog" + queue.type="linkedList" + queue.size="1000" + action.ResumeRetryCount="10" + ) + stop + }} +}} diff --git a/compute_tools/src/rsyslog.rs b/compute_tools/src/rsyslog.rs index 3bc2e72b19..3ced0a5654 100644 --- a/compute_tools/src/rsyslog.rs +++ b/compute_tools/src/rsyslog.rs @@ -4,8 +4,10 @@ use std::path::Path; use std::process::Command; use std::time::Duration; use std::{fs::OpenOptions, io::Write}; +use url::{Host, Url}; use anyhow::{Context, Result, anyhow}; +use hostname_validator; use tracing::{error, info, instrument, warn}; const POSTGRES_LOGS_CONF_PATH: &str = "/etc/rsyslog.d/postgres_logs.conf"; @@ -82,18 +84,84 @@ fn restart_rsyslog() -> Result<()> { Ok(()) } +fn parse_audit_syslog_address( + remote_plain_endpoint: &str, + remote_tls_endpoint: &str, +) -> Result<(String, u16, String)> { + let tls; + let remote_endpoint = if !remote_tls_endpoint.is_empty() { + tls = "true".to_string(); + remote_tls_endpoint + } else { + tls = "false".to_string(); + remote_plain_endpoint + }; + // Urlify the remote_endpoint, so parsing can be done with url::Url. 
+ let url_str = format!("http://{remote_endpoint}"); + let url = Url::parse(&url_str).map_err(|err| { + anyhow!("Error parsing {remote_endpoint}, expected host:port, got {err:?}") + })?; + + let is_valid = url.scheme() == "http" + && url.path() == "/" + && url.query().is_none() + && url.fragment().is_none() + && url.username() == "" + && url.password().is_none(); + + if !is_valid { + return Err(anyhow!( + "Invalid address format {remote_endpoint}, expected host:port" + )); + } + let host = match url.host() { + Some(Host::Domain(h)) if hostname_validator::is_valid(h) => h.to_string(), + Some(Host::Ipv4(ip4)) => ip4.to_string(), + Some(Host::Ipv6(ip6)) => ip6.to_string(), + _ => return Err(anyhow!("Invalid host")), + }; + let port = url + .port() + .ok_or_else(|| anyhow!("Invalid port in {remote_endpoint}"))?; + + Ok((host, port, tls)) +} + +fn generate_audit_rsyslog_config( + log_directory: String, + endpoint_id: &str, + project_id: &str, + remote_syslog_host: &str, + remote_syslog_port: u16, + remote_syslog_tls: &str, +) -> String { + format!( + include_str!("config_template/compute_audit_rsyslog_template.conf"), + log_directory = log_directory, + endpoint_id = endpoint_id, + project_id = project_id, + remote_syslog_host = remote_syslog_host, + remote_syslog_port = remote_syslog_port, + remote_syslog_tls = remote_syslog_tls + ) +} + pub fn configure_audit_rsyslog( log_directory: String, endpoint_id: &str, project_id: &str, remote_endpoint: &str, + remote_tls_endpoint: &str, ) -> Result<()> { - let config_content: String = format!( - include_str!("config_template/compute_audit_rsyslog_template.conf"), - log_directory = log_directory, - endpoint_id = endpoint_id, - project_id = project_id, - remote_endpoint = remote_endpoint + let (remote_syslog_host, remote_syslog_port, remote_syslog_tls) = + parse_audit_syslog_address(remote_endpoint, remote_tls_endpoint).unwrap(); + let config_content = generate_audit_rsyslog_config( + log_directory, + endpoint_id, + project_id, + &remote_syslog_host, + remote_syslog_port, + &remote_syslog_tls, ); info!("rsyslog config_content: {}", config_content); @@ -258,6 +326,8 @@ pub fn launch_pgaudit_gc(log_directory: String) { mod tests { use crate::rsyslog::PostgresLogsRsyslogConfig; + use super::{generate_audit_rsyslog_config, parse_audit_syslog_address}; + #[test] fn test_postgres_logs_config() { { @@ -287,4 +357,146 @@ mod tests { assert!(res.is_err()); } } + + #[test] + fn test_parse_audit_syslog_address() { + { + // host:port format (plaintext) + let parsed = parse_audit_syslog_address("collector.host.tld:5555", ""); + assert!(parsed.is_ok()); + assert_eq!( + parsed.unwrap(), + ( + String::from("collector.host.tld"), + 5555, + String::from("false") + ) + ); + } + + { + // host:port format with ipv4 ip address (plaintext) + let parsed = parse_audit_syslog_address("10.0.0.1:5555", ""); + assert!(parsed.is_ok()); + assert_eq!( + parsed.unwrap(), + (String::from("10.0.0.1"), 5555, String::from("false")) + ); + } + + { + // host:port format with ipv6 ip address (plaintext) + let parsed = + parse_audit_syslog_address("[7e60:82ed:cb2e:d617:f904:f395:aaca:e252]:5555", ""); + assert_eq!( + parsed.unwrap(), + ( + String::from("7e60:82ed:cb2e:d617:f904:f395:aaca:e252"), + 5555, + String::from("false") + ) + ); + } + + { + // Only TLS host:port defined + let parsed = parse_audit_syslog_address("", "tls.host.tld:5556"); + assert_eq!( + parsed.unwrap(), + (String::from("tls.host.tld"), 5556, String::from("true")) + ); + } + + { + // tls host should take precedence, when 
both defined + let parsed = parse_audit_syslog_address("plaintext.host.tld:5555", "tls.host.tld:5556"); + assert_eq!( + parsed.unwrap(), + (String::from("tls.host.tld"), 5556, String::from("true")) + ); + } + + { + // host without port (plaintext) + let parsed = parse_audit_syslog_address("collector.host.tld", ""); + assert!(parsed.is_err()); + } + + { + // port without host + let parsed = parse_audit_syslog_address(":5555", ""); + assert!(parsed.is_err()); + } + + { + // valid host with invalid port + let parsed = parse_audit_syslog_address("collector.host.tld:90001", ""); + assert!(parsed.is_err()); + } + + { + // invalid hostname with valid port + let parsed = parse_audit_syslog_address("-collector.host.tld:5555", ""); + assert!(parsed.is_err()); + } + + { + // parse error + let parsed = parse_audit_syslog_address("collector.host.tld:::5555", ""); + assert!(parsed.is_err()); + } + } + + #[test] + fn test_generate_audit_rsyslog_config() { + { + // plaintext version + let log_directory = "/tmp/log".to_string(); + let endpoint_id = "ep-test-endpoint-id"; + let project_id = "test-project-id"; + let remote_syslog_host = "collector.host.tld"; + let remote_syslog_port = 5555; + let remote_syslog_tls = "false"; + + let conf_str = generate_audit_rsyslog_config( + log_directory, + endpoint_id, + project_id, + remote_syslog_host, + remote_syslog_port, + remote_syslog_tls, + ); + + assert!(conf_str.contains(r#"set $.remote_syslog_tls = "false";"#)); + assert!(conf_str.contains(r#"type="omfwd""#)); + assert!(conf_str.contains(r#"target="collector.host.tld""#)); + assert!(conf_str.contains(r#"port="5555""#)); + assert!(conf_str.contains(r#"StreamDriverPermittedPeers="collector.host.tld""#)); + } + + { + // TLS version + let log_directory = "/tmp/log".to_string(); + let endpoint_id = "ep-test-endpoint-id"; + let project_id = "test-project-id"; + let remote_syslog_host = "collector.host.tld"; + let remote_syslog_port = 5556; + let remote_syslog_tls = "true"; + + let conf_str = generate_audit_rsyslog_config( + log_directory, + endpoint_id, + project_id, + remote_syslog_host, + remote_syslog_port, + remote_syslog_tls, + ); + + assert!(conf_str.contains(r#"set $.remote_syslog_tls = "true";"#)); + assert!(conf_str.contains(r#"type="omfwd""#)); + assert!(conf_str.contains(r#"target="collector.host.tld""#)); + assert!(conf_str.contains(r#"port="5556""#)); + assert!(conf_str.contains(r#"StreamDriverPermittedPeers="collector.host.tld""#)); + } + } } From 4932963bac31bac1f76ec6b39002e39c98292047 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 1 Jul 2025 14:03:34 +0100 Subject: [PATCH 16/19] [proxy]: dont log user errors from postgres (#12412) ## Problem #8843 User initiated sql queries are being classified as "postgres" errors, whereas they're really user errors. ## Summary of changes Classify user-initiated postgres errors as user errors if they are related to a sql query that we ran on their behalf. Do not log those errors. 
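
The rule boils down to: an error that carries a `DbError` came from Postgres executing the user's SQL, so it is classified as a user error and its details are only emitted at DEBUG; anything else means we could not reach or talk to compute. A minimal sketch with the proxy's types stubbed out (`has_db_error` stands in for `as_db_error().is_some()`):

```rust
#[derive(Debug, PartialEq)]
enum ErrorKind {
    User,    // bad SQL from the user: forward it, but don't log the details
    Compute, // we failed to reach or talk to compute: worth logging
}

fn classify_user_query_error(has_db_error: bool) -> ErrorKind {
    if has_db_error {
        ErrorKind::User
    } else {
        ErrorKind::Compute
    }
}
```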
--- proxy/src/error.rs | 10 ------ proxy/src/serverless/backend.rs | 10 +++++- proxy/src/serverless/sql_over_http.rs | 48 ++++++++++++++++++++++----- 3 files changed, 49 insertions(+), 19 deletions(-) diff --git a/proxy/src/error.rs b/proxy/src/error.rs index aa02b211d9..e880d63075 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -78,16 +78,6 @@ pub(crate) trait ReportableError: fmt::Display + Send + 'static { fn get_error_kind(&self) -> ErrorKind; } -impl ReportableError for postgres_client::error::Error { - fn get_error_kind(&self) -> ErrorKind { - if self.as_db_error().is_some() { - ErrorKind::Postgres - } else { - ErrorKind::Compute - } - } -} - /// Flattens `Result>` into `Result`. pub fn flatten_err(r: Result, JoinError>) -> anyhow::Result { r.context("join error").and_then(|x| x) diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 26269d0a6e..7708342ae3 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -404,7 +404,15 @@ impl ReportableError for HttpConnError { fn get_error_kind(&self) -> ErrorKind { match self { HttpConnError::ConnectionClosedAbruptly(_) => ErrorKind::Compute, - HttpConnError::PostgresConnectionError(p) => p.get_error_kind(), + HttpConnError::PostgresConnectionError(p) => { + if p.as_db_error().is_some() { + // postgres rejected the connection + ErrorKind::Postgres + } else { + // couldn't even reach postgres + ErrorKind::Compute + } + } HttpConnError::LocalProxyConnectionError(_) => ErrorKind::Compute, HttpConnError::ComputeCtl(_) => ErrorKind::Service, HttpConnError::JwtPayloadError(_) => ErrorKind::User, diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index b2eb801f5c..5d5e7bf83e 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -22,7 +22,7 @@ use serde_json::Value; use serde_json::value::RawValue; use tokio::time::{self, Instant}; use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info}; +use tracing::{Level, debug, error, info}; use typed_json::json; use url::Url; use uuid::Uuid; @@ -390,12 +390,35 @@ pub(crate) async fn handle( let line = get(db_error, |db| db.line().map(|l| l.to_string())); let routine = get(db_error, |db| db.routine()); - tracing::info!( - kind=error_kind.to_metric_label(), - error=%e, - msg=message, - "forwarding error to user" - ); + match &e { + SqlOverHttpError::Postgres(e) + if e.as_db_error().is_some() && error_kind == ErrorKind::User => + { + // this error contains too much info, and it's not an error we care about. + if tracing::enabled!(Level::DEBUG) { + tracing::debug!( + kind=error_kind.to_metric_label(), + error=%e, + msg=message, + "forwarding error to user" + ); + } else { + tracing::info!( + kind = error_kind.to_metric_label(), + error = "bad query", + "forwarding error to user" + ); + } + } + _ => { + tracing::info!( + kind=error_kind.to_metric_label(), + error=%e, + msg=message, + "forwarding error to user" + ); + } + } json_response( e.get_http_status_code(), @@ -460,7 +483,15 @@ impl ReportableError for SqlOverHttpError { SqlOverHttpError::ConnInfo(e) => e.get_error_kind(), SqlOverHttpError::ResponseTooLarge(_) => ErrorKind::User, SqlOverHttpError::InvalidIsolationLevel => ErrorKind::User, - SqlOverHttpError::Postgres(p) => p.get_error_kind(), + // customer initiated SQL errors. + SqlOverHttpError::Postgres(p) => { + if p.as_db_error().is_some() { + ErrorKind::User + } else { + ErrorKind::Compute + } + } + // proxy initiated SQL errors. 
SqlOverHttpError::InternalPostgres(p) => { if p.as_db_error().is_some() { ErrorKind::Service @@ -468,6 +499,7 @@ impl ReportableError for SqlOverHttpError { ErrorKind::Compute } } + // postgres returned a bad row format that we couldn't parse. SqlOverHttpError::JsonConversion(_) => ErrorKind::Postgres, SqlOverHttpError::Cancelled(c) => c.get_error_kind(), } From 5d499f62cd31bdf884844176f682365a972e91a6 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 2 Jul 2025 11:43:09 +0200 Subject: [PATCH 17/19] fix(tests): periodic and immediate gc is effectively a no-op in tests The introduction of the default lease deadline[^1] feature makes it so that after PS restart, `.timeline_gc()` calls in Python tests are no-ops for 10 minute after pageserver startup: the `gc_iteration()` bails with `Skipping GC because lsn lease deadline is not reached`. I did some impact analysis in the following PR. About 30 Python tests are affected: - https://github.com/neondatabase/neon/pull/12411 Rust tests that don't explicitly enable periodic GC or invoke GC manually are unaffected because we disable periodic GC by default in the `TenantHarness`'s tenant config. Two tests explicitly did `start_paused=true` + `tokio::time::advance()`, but it would add cognitive and code bloat to each existing and future test case that uses TenantHarness if we take that route. So, this PR disables the default lease deadline feature in all tests. refs - [^1]: PR that introduced default lease deadline: https://github.com/neondatabase/neon/pull/9055/files - fixes https://databricks.atlassian.net/browse/LKB-92 --- pageserver/src/tenant.rs | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index fcb18e8553..e5caf0dcb7 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3101,7 +3101,19 @@ impl TenantShard { return Ok(GcResult::default()); } - if conf.is_gc_blocked_by_lsn_lease_deadline() { + // Skip GC if we're within lease deadline. + // + // Rust & Python tests set single-digit second gc_period and/or + // do immediate gc via mgmt API. The lease deadline is hard-coded to + // 10min, which would make most if not all tests skip GC here. + // Rust tests could in theory tokio::time::advance, but Python + // tests have no such options. + // Since lease deadline is a crutch we hopefully eventually replace + // with durable leases, take a shortcut here and skip lease deadline check + // for all tests. + // Cf https://databricks.atlassian.net/browse/LKB-92?focusedCommentId=6722329 + let ignore_lease_deadline = cfg!(test) || cfg!(feature = "testing"); + if !ignore_lease_deadline && conf.is_gc_blocked_by_lsn_lease_deadline() { info!("Skipping GC because lsn lease deadline is not reached"); return Ok(GcResult::default()); } @@ -6668,17 +6680,13 @@ mod tests { tline.freeze_and_flush().await.map_err(|e| e.into()) } - #[tokio::test(start_paused = true)] + #[tokio::test] async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> { let (tenant, ctx) = TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data") .await? .load() .await; - // Advance to the lsn lease deadline so that GC is not blocked by - // initial transition into AttachedSingle. 
- tokio::time::advance(tenant.get_lsn_lease_length()).await; - tokio::time::resume(); let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; @@ -9377,17 +9385,13 @@ mod tests { Ok(()) } - #[tokio::test(start_paused = true)] + #[tokio::test] async fn test_lsn_lease() -> anyhow::Result<()> { let (tenant, ctx) = TenantHarness::create("test_lsn_lease") .await .unwrap() .load() .await; - // Advance to the lsn lease deadline so that GC is not blocked by - // initial transition into AttachedSingle. - tokio::time::advance(tenant.get_lsn_lease_length()).await; - tokio::time::resume(); let key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let end_lsn = Lsn(0x100); From 8143270c4fa6d6e5eefb5f98984a390841460aa8 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 2 Jul 2025 14:18:23 +0200 Subject: [PATCH 18/19] fix test_readonly_node_gc: it (among other things) tests the lease deadline --- libs/pageserver_api/src/models.rs | 1 + pageserver/src/tenant.rs | 64 +++++++++++++++++++++-- pageserver/src/tenant/mgr.rs | 9 +++- pageserver/src/tenant/tasks.rs | 1 + test_runner/fixtures/pageserver/http.py | 3 +- test_runner/regress/test_readonly_node.py | 8 +-- 6 files changed, 76 insertions(+), 10 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 16545364c1..5f36e8d2b7 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1735,6 +1735,7 @@ pub enum DownloadRemoteLayersTaskState { #[derive(Debug, Serialize, Deserialize)] pub struct TimelineGcRequest { pub gc_horizon: Option<u64>, + pub ignore_lease_deadline: Option<bool>, } #[derive(Debug, Clone, Serialize, Deserialize)] diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index e5caf0dcb7..012710a0ed 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1145,6 +1145,22 @@ pub(crate) enum LoadConfigError { NotFound(Utf8PathBuf), } +pub(crate) enum IgnoreLeaseDeadline { + Default, + No, + Yes, +} + +impl From<Option<bool>> for IgnoreLeaseDeadline { + fn from(value: Option<bool>) -> Self { + match value { + None => IgnoreLeaseDeadline::Default, + Some(false) => IgnoreLeaseDeadline::No, + Some(true) => IgnoreLeaseDeadline::Yes, + } + } +} + impl TenantShard { /// Yet another helper for timeline initialization. /// @@ -3076,6 +3092,7 @@ impl TenantShard { target_timeline_id: Option<TimelineId>, horizon: u64, pitr: Duration, + ignore_lease_deadline: IgnoreLeaseDeadline, cancel: &CancellationToken, ctx: &RequestContext, ) -> Result { @@ -3112,7 +3129,11 @@ impl TenantShard { // with durable leases, take a shortcut here and skip lease deadline check // for all tests.
// Cf https://databricks.atlassian.net/browse/LKB-92?focusedCommentId=6722329 - let ignore_lease_deadline = cfg!(test) || cfg!(feature = "testing"); + let ignore_lease_deadline = match ignore_lease_deadline { + IgnoreLeaseDeadline::Default => cfg!(test) || cfg!(feature = "testing"), + IgnoreLeaseDeadline::No => false, + IgnoreLeaseDeadline::Yes => true, + }; if !ignore_lease_deadline && conf.is_gc_blocked_by_lsn_lease_deadline() { info!("Skipping GC because lsn lease deadline is not reached"); return Ok(GcResult::default()); @@ -6701,6 +6722,7 @@ mod tests { Some(TIMELINE_ID), 0x10, Duration::ZERO, + IgnoreLeaseDeadline::Default, &CancellationToken::new(), &ctx, ) @@ -6814,6 +6836,7 @@ mod tests { Some(TIMELINE_ID), 0x10, Duration::ZERO, + IgnoreLeaseDeadline::Default, &CancellationToken::new(), &ctx, ) @@ -6874,6 +6897,7 @@ mod tests { Some(TIMELINE_ID), 0x10, Duration::ZERO, + IgnoreLeaseDeadline::Default, &CancellationToken::new(), &ctx, ) @@ -6908,6 +6932,7 @@ mod tests { Some(TIMELINE_ID), 0x10, Duration::ZERO, + IgnoreLeaseDeadline::Default, &CancellationToken::new(), &ctx, ) @@ -7189,7 +7214,14 @@ mod tests { // this doesn't really need to use the timeline_id target, but it is closer to what it // originally was. let res = tenant - .gc_iteration(Some(timeline.timeline_id), 0, Duration::ZERO, &cancel, ctx) + .gc_iteration( + Some(timeline.timeline_id), + 0, + Duration::ZERO, + IgnoreLeaseDeadline::Default, + &cancel, + ctx, + ) .await?; assert_eq!(res.layers_removed, 0, "this never removes anything"); @@ -7807,7 +7839,14 @@ mod tests { // Perform a cycle of flush, and GC tline.freeze_and_flush().await?; tenant - .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) + .gc_iteration( + Some(tline.timeline_id), + 0, + Duration::ZERO, + IgnoreLeaseDeadline::Default, + &cancel, + &ctx, + ) .await?; } @@ -7898,7 +7937,14 @@ mod tests { tline.freeze_and_flush().await?; tline.compact(&cancel, EnumSet::default(), &ctx).await?; tenant - .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) + .gc_iteration( + Some(tline.timeline_id), + 0, + Duration::ZERO, + IgnoreLeaseDeadline::Default, + &cancel, + &ctx, + ) .await?; } @@ -8234,7 +8280,14 @@ mod tests { ) .await?; tenant - .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) + .gc_iteration( + Some(tline.timeline_id), + 0, + Duration::ZERO, + IgnoreLeaseDeadline::Default, + &cancel, + &ctx, + ) .await?; } } @@ -9455,6 +9508,7 @@ mod tests { Some(TIMELINE_ID), 0, Duration::ZERO, + IgnoreLeaseDeadline::Default, &CancellationToken::new(), &ctx, ) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 95f5c60170..81cd6f4212 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -2365,7 +2365,14 @@ impl TenantManager { #[allow(unused_mut)] let mut result = tenant - .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx) + .gc_iteration( + Some(timeline_id), + gc_horizon, + pitr, + crate::tenant::IgnoreLeaseDeadline::from(gc_req.ignore_lease_deadline), + &cancel, + &ctx, + ) .await; // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it // better once the types support it. 
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 954dd38bb4..e4890efdb0 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -406,6 +406,7 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { None, gc_horizon, tenant.get_pitr_interval(), + crate::tenant::IgnoreLeaseDeadline::Default, &cancel, &ctx, )) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index d9037f2d08..d60fc34fa3 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -695,6 +695,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, gc_horizon: int | None, + ignore_lease_deadline: bool | None = None, ) -> dict[str, Any]: """ Unlike most handlers, this will wait for the layers to be actually @@ -707,7 +708,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): ) res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc", - json={"gc_horizon": gc_horizon}, + json={"gc_horizon": gc_horizon, "ignore_lease_deadline": ignore_lease_deadline}, ) log.info(f"Got GC request response code: {res.status_code}") self.verbose_error(res) diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index ee934a900d..98877ad7f9 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -201,11 +201,13 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): for shard, ps in tenant_get_shards(env, env.initial_tenant): client = ps.http_client() layers_guarded_before_gc = get_layers_protected_by_lease( - client, shard, env.initial_timeline, lease_lsn=lsn + client, shard, env.initial_timeline, lease_lsn=lease_lsn + ) + gc_result = client.timeline_gc( + shard, env.initial_timeline, 0, ignore_lease_deadline=False ) - gc_result = client.timeline_gc(shard, env.initial_timeline, 0) layers_guarded_after_gc = get_layers_protected_by_lease( - client, shard, env.initial_timeline, lease_lsn=lsn + client, shard, env.initial_timeline, lease_lsn=lease_lsn ) # Note: cannot assert on `layers_removed` here because it could be layers From eb267de255e245cceb5bbcbfb3cdf09a4cea708a Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 2 Jul 2025 14:18:23 +0200 Subject: [PATCH 19/19] use initial lease period based on config This change would make sense as a standalone commit. 
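As a reading aid for the diff that follows: the change stops hard-coding the initial lease deadline via `LsnLease::DEFAULT_LENGTH` and instead derives it from the effective tenant configuration, i.e. the per-tenant `lsn_lease_length` override when present, otherwise the pageserver-wide default. A minimal sketch of that fallback pattern, using simplified stand-in types (the `TenantConfig`/`PageServerConf` structs and field layout here are illustrative; only the override-with-fallback logic mirrors the diff):

```rust
use std::time::{Duration, Instant};

/// Illustrative stand-ins for the pageserver's config types.
struct TenantConfig {
    /// Per-tenant override; `None` means "use the pageserver-wide default".
    lsn_lease_length: Option<Duration>,
}

struct PageServerConf {
    default_lsn_lease_length: Duration,
}

/// Effective lease length: the tenant override, falling back to the global
/// default. Mirrors what `get_lsn_lease_length_impl` does in the diff below.
fn lsn_lease_length(conf: &PageServerConf, tenant_conf: &TenantConfig) -> Duration {
    tenant_conf
        .lsn_lease_length
        .unwrap_or(conf.default_lsn_lease_length)
}

fn main() {
    let conf = PageServerConf {
        default_lsn_lease_length: Duration::from_secs(600),
    };
    // A test environment can shrink the lease length per tenant ...
    let tenant_conf = TenantConfig {
        lsn_lease_length: Some(Duration::from_secs(1)),
    };
    // ... and the initial GC-blocking deadline set on attach is then derived
    // from the effective config rather than a hard-coded 10-minute constant.
    let lsn_lease_deadline = Instant::now() + lsn_lease_length(&conf, &tenant_conf);
    println!("lease deadline: {lsn_lease_deadline:?}");
}
```

With the deadline derived from the effective config, an environment that configures a short `lsn_lease_length` no longer has to wait out the hard-coded 10-minute default before GC can run after an attach in `AttachedSingle` mode.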
--- pageserver/src/tenant.rs | 24 ++++++++++++++++-------- pageserver/src/tenant/mgr.rs | 10 +++++----- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 012710a0ed..39d0fcd852 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -34,7 +34,7 @@ use once_cell::sync::Lazy; pub use pageserver_api::models::TenantState; use pageserver_api::models::{self, RelSizeMigration}; use pageserver_api::models::{ - CompactInfoResponse, LsnLease, TimelineArchivalState, TimelineState, TopTenantShardItem, + CompactInfoResponse, TimelineArchivalState, TimelineState, TopTenantShardItem, WalRedoManagerStatus, }; use pageserver_api::shard::{ShardIdentity, ShardStripeSize, TenantShardId}; @@ -180,6 +180,7 @@ pub(super) struct AttachedTenantConf { impl AttachedTenantConf { fn new( + conf: &'static PageServerConf, tenant_conf: pageserver_api::models::TenantConfig, location: AttachedLocationConfig, ) -> Self { @@ -191,9 +192,7 @@ impl AttachedTenantConf { let lsn_lease_deadline = if location.attach_mode == AttachmentMode::Single { Some( tokio::time::Instant::now() - + tenant_conf - .lsn_lease_length - .unwrap_or(LsnLease::DEFAULT_LENGTH), + + TenantShard::get_lsn_lease_length_impl(conf, &tenant_conf), ) } else { // We don't use `lsn_lease_deadline` to delay GC in AttachedMulti and AttachedStale @@ -208,10 +207,13 @@ impl AttachedTenantConf { } } - fn try_from(location_conf: LocationConf) -> anyhow::Result<Self> { + fn try_from( + conf: &'static PageServerConf, + location_conf: LocationConf, + ) -> anyhow::Result<Self> { match &location_conf.mode { LocationMode::Attached(attach_conf) => { - Ok(Self::new(location_conf.tenant_conf, *attach_conf)) + Ok(Self::new(conf, location_conf.tenant_conf, *attach_conf)) } LocationMode::Secondary(_) => { anyhow::bail!( @@ -4234,10 +4236,16 @@ impl TenantShard { } pub fn get_lsn_lease_length(&self) -> Duration { - let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); + Self::get_lsn_lease_length_impl(self.conf, &self.tenant_conf.load().tenant_conf) + } + + pub fn get_lsn_lease_length_impl( + conf: &'static PageServerConf, + tenant_conf: &pageserver_api::models::TenantConfig, + ) -> Duration { tenant_conf .lsn_lease_length - .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length) + .unwrap_or(conf.default_tenant_conf.lsn_lease_length) } pub fn get_timeline_offloading_enabled(&self) -> bool { diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 81cd6f4212..2888fdeac0 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -664,7 +664,7 @@ pub async fn init_tenant_mgr( tenant_shard_id, &tenant_dir_path, resources.clone(), - AttachedTenantConf::new(location_conf.tenant_conf, attached_conf), + AttachedTenantConf::new(conf, location_conf.tenant_conf, attached_conf), shard_identity, Some(init_order.clone()), SpawnMode::Lazy, @@ -842,7 +842,7 @@ impl TenantManager { // take our fast path and just provide the updated configuration // to the tenant. tenant.set_new_location_config( - AttachedTenantConf::try_from(new_location_config.clone()) + AttachedTenantConf::try_from(self.conf, new_location_config.clone()) .map_err(UpsertLocationError::BadRequest)?, ); @@ -1046,7 +1046,7 @@ impl TenantManager { // Testing hack: if we are configured with no control plane, then drop the generation // from upserts. This enables creating generation-less tenants even though neon_local // always uses generations when calling the location conf API.
- let attached_conf = AttachedTenantConf::try_from(new_location_config) + let attached_conf = AttachedTenantConf::try_from(self.conf, new_location_config) .map_err(UpsertLocationError::BadRequest)?; let tenant = tenant_spawn( @@ -1250,7 +1250,7 @@ impl TenantManager { tenant_shard_id, &tenant_path, self.resources.clone(), - AttachedTenantConf::try_from(config)?, + AttachedTenantConf::try_from(self.conf, config)?, shard_identity, None, SpawnMode::Eager, @@ -2131,7 +2131,7 @@ impl TenantManager { tenant_shard_id, &tenant_path, self.resources.clone(), - AttachedTenantConf::try_from(config).map_err(Error::DetachReparent)?, + AttachedTenantConf::try_from(self.conf, config).map_err(Error::DetachReparent)?, shard_identity, None, SpawnMode::Eager,
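Taken together, patches 17 and 18 leave the lsn-lease deadline check with three behaviours: test builds (`cfg(test)` or the `testing` feature) skip it by default, a GC request can force it back on via the new `ignore_lease_deadline` field (which `test_readonly_node_gc` now sets to `false`), and production keeps the check. This is the condensed sketch referenced from the message of patch 17. The `IgnoreLeaseDeadline` enum and its `From` impl mirror the diff in patch 18; the `gc_blocked` helper and its `blocked_by_deadline` parameter are illustrative stand-ins for the surrounding `gc_iteration` code and `conf.is_gc_blocked_by_lsn_lease_deadline()`:

```rust
/// The tri-state from patch 18. `Default` resolves differently in test builds
/// than in production; `No`/`Yes` are explicit overrides coming from the
/// `ignore_lease_deadline` field of the GC request.
enum IgnoreLeaseDeadline {
    Default,
    No,
    Yes,
}

impl From<Option<bool>> for IgnoreLeaseDeadline {
    fn from(value: Option<bool>) -> Self {
        match value {
            None => IgnoreLeaseDeadline::Default,
            Some(false) => IgnoreLeaseDeadline::No,
            Some(true) => IgnoreLeaseDeadline::Yes,
        }
    }
}

/// Should this GC iteration bail because the lsn lease deadline has not been
/// reached yet? `blocked_by_deadline` stands in for
/// `conf.is_gc_blocked_by_lsn_lease_deadline()`.
fn gc_blocked(ignore: IgnoreLeaseDeadline, blocked_by_deadline: bool) -> bool {
    let ignore_lease_deadline = match ignore {
        // Patch 17: test builds skip the deadline unless told otherwise.
        IgnoreLeaseDeadline::Default => cfg!(test) || cfg!(feature = "testing"),
        IgnoreLeaseDeadline::No => false,
        IgnoreLeaseDeadline::Yes => true,
    };
    !ignore_lease_deadline && blocked_by_deadline
}

fn main() {
    // An explicit `ignore_lease_deadline=false` restores the check even in
    // builds where the default skips it; test_readonly_node_gc relies on this.
    assert!(gc_blocked(IgnoreLeaseDeadline::from(Some(false)), true));
    // An explicit `true` skips the check regardless of build flags.
    assert!(!gc_blocked(IgnoreLeaseDeadline::from(Some(true)), true));
    // With no override, the outcome depends on how the binary was built.
    let default_skips = cfg!(test) || cfg!(feature = "testing");
    assert_eq!(gc_blocked(IgnoreLeaseDeadline::from(None), true), !default_skips);
    println!("ok");
}
```

Modelling the override as a tri-state rather than a plain `bool` keeps the request field optional: callers of the `do_gc` endpoint that never send `ignore_lease_deadline` get the build-dependent default, while the one test that deliberately exercises the deadline can opt back in.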