From db24ba95d1991a90c50d25f322eb07435cf31936 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 12 Jun 2025 18:15:02 +0100 Subject: [PATCH] pagserver: always persist shard identity (#12217) ## Problem The location config (which includes the stripe size) is stored on pageserver disk. For unsharded tenants we [do not include the shard identity in the serialized description](https://github.com/neondatabase/neon/blob/ad88ec9257b84d6633be86c5a970709146331ad4/pageserver/src/tenant/config.rs#L64-L66). When the pageserver restarts, it reads that configuration and will use the stripe size from there and rely on storcon input from reattach for generation and mode. The default deserialization is ShardIdentity::unsharded. This has the new default stripe size of 2048. Hence, for unsharded tenants we can be running with a stripe size different from that the one in the storcon observed state. This is not a problem until we shard split without specifying a stripe size (i.e. manual splits via the UI or storcon_cli). When that happens the new shards will use the 2048 stripe size until storcon realises and switches them back. At that point it's too late, since we've ingested data with the wrong stripe sizes. ## Summary of changes Ideally, we would always have the full shard identity on disk. To achieve this over two releases we do: 1. Always persist the shard identity in the location config on the PS. 2. Storage controller includes the stripe size to use in the re attach response. After the first release, we will start persisting correct stripe sizes for any tenant shard that the storage controller explicitly sends a location_conf. After the second release, the re-attach change kicks in and we'll persist the shard identity for all shards. --- libs/pageserver_api/src/upcall_api.rs | 6 +++- pageserver/src/tenant/config.rs | 11 ++++++-- pageserver/src/tenant/mgr.rs | 40 ++++++++++++++++----------- storage_controller/src/service.rs | 2 ++ 4 files changed, 40 insertions(+), 19 deletions(-) diff --git a/libs/pageserver_api/src/upcall_api.rs b/libs/pageserver_api/src/upcall_api.rs index 4dce5f7817..e2de02eea0 100644 --- a/libs/pageserver_api/src/upcall_api.rs +++ b/libs/pageserver_api/src/upcall_api.rs @@ -9,7 +9,7 @@ use utils::id::{NodeId, TimelineId}; use crate::controller_api::NodeRegisterRequest; use crate::models::{LocationConfigMode, ShardImportStatus}; -use crate::shard::TenantShardId; +use crate::shard::{ShardStripeSize, TenantShardId}; /// Upcall message sent by the pageserver to the configured `control_plane_api` on /// startup. @@ -36,6 +36,10 @@ pub struct ReAttachResponseTenant { /// Default value only for backward compat: this field should be set #[serde(default = "default_mode")] pub mode: LocationConfigMode, + + // Default value only for backward compat: this field should be set + #[serde(default = "ShardStripeSize::default")] + pub stripe_size: ShardStripeSize, } #[derive(Serialize, Deserialize)] pub struct ReAttachResponse { diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index bf82fc8df8..030b43a020 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -61,8 +61,8 @@ pub(crate) struct LocationConf { /// The detailed shard identity. This structure is already scoped within /// a TenantShardId, but we need the full ShardIdentity to enable calculating /// key->shard mappings. + // TODO(vlad): Remove this default once all configs have a shard identity on disk. #[serde(default = "ShardIdentity::unsharded")] - #[serde(skip_serializing_if = "ShardIdentity::is_unsharded")] pub(crate) shard: ShardIdentity, /// The pan-cluster tenant configuration, the same on all locations @@ -149,7 +149,12 @@ impl LocationConf { /// For use when attaching/re-attaching: update the generation stored in this /// structure. If we were in a secondary state, promote to attached (posession /// of a fresh generation implies this). - pub(crate) fn attach_in_generation(&mut self, mode: AttachmentMode, generation: Generation) { + pub(crate) fn attach_in_generation( + &mut self, + mode: AttachmentMode, + generation: Generation, + stripe_size: ShardStripeSize, + ) { match &mut self.mode { LocationMode::Attached(attach_conf) => { attach_conf.generation = generation; @@ -163,6 +168,8 @@ impl LocationConf { }) } } + + self.shard.stripe_size = stripe_size; } pub(crate) fn try_from(conf: &'_ models::LocationConfig) -> anyhow::Result { diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 766f846827..76937dd959 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -129,7 +129,7 @@ pub(crate) enum ShardSelector { /// /// This represents the subset of a LocationConfig that we receive during re-attach. pub(crate) enum TenantStartupMode { - Attached((AttachmentMode, Generation)), + Attached((AttachmentMode, Generation, ShardStripeSize)), Secondary, } @@ -143,15 +143,21 @@ impl TenantStartupMode { match (rart.mode, rart.r#gen) { (LocationConfigMode::Detached, _) => None, (LocationConfigMode::Secondary, _) => Some(Self::Secondary), - (LocationConfigMode::AttachedMulti, Some(g)) => { - Some(Self::Attached((AttachmentMode::Multi, Generation::new(g)))) - } - (LocationConfigMode::AttachedSingle, Some(g)) => { - Some(Self::Attached((AttachmentMode::Single, Generation::new(g)))) - } - (LocationConfigMode::AttachedStale, Some(g)) => { - Some(Self::Attached((AttachmentMode::Stale, Generation::new(g)))) - } + (LocationConfigMode::AttachedMulti, Some(g)) => Some(Self::Attached(( + AttachmentMode::Multi, + Generation::new(g), + rart.stripe_size, + ))), + (LocationConfigMode::AttachedSingle, Some(g)) => Some(Self::Attached(( + AttachmentMode::Single, + Generation::new(g), + rart.stripe_size, + ))), + (LocationConfigMode::AttachedStale, Some(g)) => Some(Self::Attached(( + AttachmentMode::Stale, + Generation::new(g), + rart.stripe_size, + ))), _ => { tracing::warn!( "Received invalid re-attach state for tenant {}: {rart:?}", @@ -319,9 +325,11 @@ fn emergency_generations( Some(( *tid, match &lc.mode { - LocationMode::Attached(alc) => { - TenantStartupMode::Attached((alc.attach_mode, alc.generation)) - } + LocationMode::Attached(alc) => TenantStartupMode::Attached(( + alc.attach_mode, + alc.generation, + ShardStripeSize::default(), + )), LocationMode::Secondary(_) => TenantStartupMode::Secondary, }, )) @@ -365,7 +373,7 @@ async fn init_load_generations( .iter() .flat_map(|(id, start_mode)| { match start_mode { - TenantStartupMode::Attached((_mode, generation)) => Some(generation), + TenantStartupMode::Attached((_mode, generation, _stripe_size)) => Some(generation), TenantStartupMode::Secondary => None, } .map(|gen_| (*id, *gen_)) @@ -585,7 +593,7 @@ pub async fn init_tenant_mgr( location_conf.mode = LocationMode::Secondary(DEFAULT_SECONDARY_CONF); } } - Some(TenantStartupMode::Attached((attach_mode, generation))) => { + Some(TenantStartupMode::Attached((attach_mode, generation, stripe_size))) => { let old_gen_higher = match &location_conf.mode { LocationMode::Attached(AttachedLocationConfig { generation: old_generation, @@ -609,7 +617,7 @@ pub async fn init_tenant_mgr( // local disk content: demote to secondary rather than detaching. location_conf.mode = LocationMode::Secondary(DEFAULT_SECONDARY_CONF); } else { - location_conf.attach_in_generation(*attach_mode, *generation); + location_conf.attach_in_generation(*attach_mode, *generation, *stripe_size); } } } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 06318a01b5..5e59d776ab 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2267,6 +2267,7 @@ impl Service { // fail, and start from scratch, so it doesn't make sense for us to try and preserve // the stale/multi states at this point. mode: LocationConfigMode::AttachedSingle, + stripe_size: shard.shard.stripe_size, }); shard.generation = std::cmp::max(shard.generation, Some(new_gen)); @@ -2300,6 +2301,7 @@ impl Service { id: *tenant_shard_id, r#gen: None, mode: LocationConfigMode::Secondary, + stripe_size: shard.shard.stripe_size, }); // We must not update observed, because we have no guarantee that our