From 59cd7f9a50babd5e42027de0f5b327484beb54e1 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 24 Jul 2025 17:16:37 -0400 Subject: [PATCH] move to a separate key instead of reusing dbdir Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/key.rs | 9 ++ libs/pageserver_api/src/models.rs | 4 +- pageserver/src/pgdatadir_mapping.rs | 125 +++++++++++------- pageserver/src/tenant.rs | 3 +- .../src/tenant/timeline/import_pgdata/flow.rs | 1 - pageserver/src/walingest.rs | 2 +- 6 files changed, 90 insertions(+), 54 deletions(-) diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 4e8fabfa72..6277f8b775 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -519,6 +519,15 @@ pub fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key { } } +pub const REL_DIR_MIGRATION_KEY: Key = Key { + field1: REL_DIR_KEY_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, +}; + #[inline(always)] pub fn rel_tag_sparse_key(spcnode: Oid, dbnode: Oid, relnode: Oid, forknum: u8) -> Key { Key { diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 9a2478ae16..67d4685183 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -945,7 +945,9 @@ impl TenantConfig { patch .basebackup_cache_enabled .apply(&mut basebackup_cache_enabled); - patch.rel_size_v1_access_disabled.apply(&mut rel_size_v1_access_disabled); + patch + .rel_size_v1_access_disabled + .apply(&mut rel_size_v1_access_disabled); Ok(Self { checkpoint_distance, diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index e77b86b6cc..e513f452bd 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -16,10 +16,10 @@ use anyhow::Context; use bytes::{Buf, Bytes, BytesMut}; use enum_map::Enum; use pageserver_api::key::{ - AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, CompactKey, DBDIR_KEY, Key, RelDirExists, - TWOPHASEDIR_KEY, dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, - rel_size_to_key, rel_tag_sparse_key, rel_tag_sparse_key_range, relmap_file_key, - repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key, + AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, CompactKey, DBDIR_KEY, Key, + REL_DIR_MIGRATION_KEY, RelDirExists, TWOPHASEDIR_KEY, dbdir_key_range, rel_block_to_key, + rel_dir_to_key, rel_key_range, rel_size_to_key, rel_tag_sparse_key, rel_tag_sparse_key_range, + relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key, slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range, }; use pageserver_api::keyspace::{KeySpaceRandomAccum, SparseKeySpace}; @@ -691,7 +691,10 @@ impl Timeline { return Ok(false); } - let v2_status = dbdir.get_persistent_rel_size_v2_status(); + let migration_history = version.sparse_get(self, REL_DIR_MIGRATION_KEY, ctx).await?; + let migration_history = RelDirMigrationHistory::from_bytes(migration_history) + .context("failed to deserialize rel dir migration history")?; + let v2_status = migration_history.status; match v2_status { RelSizeMigration::Legacy => { @@ -834,8 +837,10 @@ impl Timeline { version: Version<'_>, ctx: &RequestContext, ) -> Result, PageReconstructError> { - let dbdir = DbDirectory::des(&version.get(self, DBDIR_KEY, ctx).await?)?; - let v2_status = dbdir.get_persistent_rel_size_v2_status(); + let reldir_migration_history = version.sparse_get(self, REL_DIR_MIGRATION_KEY, ctx).await?; + let reldir_migration_history = RelDirMigrationHistory::from_bytes(reldir_migration_history) + .context("failed to deserialize rel dir migration history")?; + let v2_status = reldir_migration_history.status; match v2_status { RelSizeMigration::Legacy => { @@ -1802,7 +1807,6 @@ impl DatadirModification<'_> { pub fn init_empty(&mut self) -> anyhow::Result<()> { let buf = DbDirectory::ser(&DbDirectory { dbdirs: HashMap::new(), - rel_dir_migration_status: None, })?; self.pending_directory_entries .push((DirectoryKind::Db, MetricsUpdate::Set(0))); @@ -2086,12 +2090,12 @@ impl DatadirModification<'_> { /// field. pub(crate) fn maybe_enable_rel_size_v2( &mut self, - dbdir: &DbDirectory, + migration_history: &RelDirMigrationHistory, is_create: bool, ) -> anyhow::Result { // TODO: define the behavior of the tenant-level config flag and use feature flag to enable this feature let expected_status = self.tline.get_rel_size_v2_expected_state(); - let persistent_status = dbdir.get_persistent_rel_size_v2_status(); + let persistent_status = migration_history.status.clone(); // Only initialize the v2 keyspace on new relation creation. No initialization // during `timeline_create` (TODO: fix this, we should allow, but currently it @@ -2166,9 +2170,14 @@ impl DatadirModification<'_> { let buf = self.get(DBDIR_KEY, ctx).await?; let mut dbdir = DbDirectory::des(&buf)?; + let reldir_migration_history = self.sparse_get(REL_DIR_MIGRATION_KEY, ctx).await?; + let reldir_migration_history = RelDirMigrationHistory::from_bytes(reldir_migration_history) + .context("failed to deserialize rel dir migration history") + .map_err(WalIngestErrorKind::RelSizeV2Error)?; + let v2_mode = self - .maybe_enable_rel_size_v2(&dbdir, false) - .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?; + .maybe_enable_rel_size_v2(&reldir_migration_history, false) + .map_err(WalIngestErrorKind::RelSizeV2Error)?; let r = dbdir.dbdirs.insert((spcnode, dbnode), true); if r.is_none() || r == Some(false) { @@ -2406,9 +2415,14 @@ impl DatadirModification<'_> { // It's possible that this is the first rel for this db in this // tablespace. Create the reldir entry for it if so. let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?; + let reldir_migration_history = self.sparse_get(REL_DIR_MIGRATION_KEY, ctx).await?; + let mut reldir_migration_history = + RelDirMigrationHistory::from_bytes(reldir_migration_history) + .context("failed to deserialize rel dir migration history") + .map_err(WalIngestErrorKind::RelSizeV2Error)?; let mut is_dbdir_dirty = false; - let mut is_reldirv2_index_part_dirty = false; + let mut is_reldirv2_status_dirty = false; let dbdir_exists = if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) { @@ -2425,8 +2439,8 @@ impl DatadirModification<'_> { }; let mut v2_mode = self - .maybe_enable_rel_size_v2(&dbdir, true) - .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?; + .maybe_enable_rel_size_v2(&reldir_migration_history, true) + .map_err(WalIngestErrorKind::RelSizeV2Error)?; if v2_mode.initialize { if let Err(e) = self.initialize_rel_size_v2_keyspace(ctx, &dbdir).await { @@ -2434,20 +2448,16 @@ impl DatadirModification<'_> { // TODO: circuit breaker so that it won't retry forever } else { v2_mode.current_status = RelSizeMigration::Migrating; - let migration_history = dbdir.rel_dir_migration_status.get_or_insert_default(); - migration_history.status = Some(RelSizeMigration::Migrating); - migration_history.v2_enabled_at = Some(self.lsn); - is_dbdir_dirty = true; - is_reldirv2_index_part_dirty = true; + reldir_migration_history.status = RelSizeMigration::Migrating; + reldir_migration_history.v2_enabled_at = Some(self.lsn); + is_reldirv2_status_dirty = true; } } if v2_mode.disable_v1 { v2_mode.current_status = RelSizeMigration::Migrated; - let migration_history = dbdir.rel_dir_migration_status.get_or_insert_default(); - migration_history.status = Some(RelSizeMigration::Migrated); - migration_history.v1_disabled_at = Some(self.lsn); - is_dbdir_dirty = true; - is_reldirv2_index_part_dirty = true; + reldir_migration_history.status = RelSizeMigration::Migrated; + reldir_migration_history.v1_disabled_at = Some(self.lsn); + is_reldirv2_status_dirty = true; } if is_dbdir_dirty { @@ -2455,13 +2465,22 @@ impl DatadirModification<'_> { self.put(DBDIR_KEY, Value::Image(buf.into())); } - if is_reldirv2_index_part_dirty { + if is_reldirv2_status_dirty { self.tline .update_rel_size_v2_status( - dbdir.get_persistent_rel_size_v2_status(), - dbdir.get_persistent_rel_size_v2_migrated_at(), + reldir_migration_history.status.clone(), + reldir_migration_history.v1_disabled_at, ) - .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?; + .map_err(WalIngestErrorKind::RelSizeV2Error)?; + self.put( + REL_DIR_MIGRATION_KEY, + Value::Image( + reldir_migration_history + .encode() + .context("failed to serialize rel dir migration history") + .map_err(WalIngestErrorKind::RelSizeV2Error)?, + ), + ); } if v2_mode.current_status != RelSizeMigration::Migrated { @@ -2623,10 +2642,13 @@ impl DatadirModification<'_> { drop_relations: HashMap<(u32, u32), Vec>, ctx: &RequestContext, ) -> Result<(), WalIngestError> { - let dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?; + let reldir_migration_history = self.sparse_get(REL_DIR_MIGRATION_KEY, ctx).await?; + let reldir_migration_history = RelDirMigrationHistory::from_bytes(reldir_migration_history) + .context("failed to deserialize rel dir migration history") + .map_err(WalIngestErrorKind::RelSizeV2Error)?; let v2_mode = self - .maybe_enable_rel_size_v2(&dbdir, false) - .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?; + .maybe_enable_rel_size_v2(&reldir_migration_history, false) + .map_err(WalIngestErrorKind::RelSizeV2Error)?; match v2_mode.current_status { RelSizeMigration::Legacy => { self.put_rel_drop_v1(drop_relations, ctx).await?; @@ -3200,32 +3222,35 @@ impl Version<'_> { //--- Metadata structs stored in key-value pairs in the repository. #[derive(Debug, Clone, Serialize, Deserialize, Default)] -pub(crate) struct RelSizeMigrationHistory { - pub(crate) status: Option, +pub(crate) struct RelDirMigrationHistory { + pub(crate) status: RelSizeMigration, pub(crate) v2_enabled_at: Option, pub(crate) v1_disabled_at: Option, } +impl RelDirMigrationHistory { + pub(crate) fn from_bytes(bytes: Option) -> Result { + match bytes { + Some(bytes) => { + if bytes.is_empty() { + return Ok(Self::default()); + } + let history = serde_json::from_slice(&bytes)?; + Ok(history) + } + None => Ok(Self::default()), + } + } + + pub(crate) fn encode(&self) -> Result { + serde_json::to_vec(self).map(Bytes::from) + } +} + #[derive(Debug, Serialize, Deserialize)] pub(crate) struct DbDirectory { // (spcnode, dbnode) -> (do relmapper and PG_VERSION files exist) pub(crate) dbdirs: HashMap<(Oid, Oid), bool>, - pub(crate) rel_dir_migration_status: Option, -} - -impl DbDirectory { - pub(crate) fn get_persistent_rel_size_v2_status(&self) -> RelSizeMigration { - self.rel_dir_migration_status - .as_ref() - .and_then(|x| x.status.clone()) - .unwrap_or(RelSizeMigration::Legacy) - } - - pub(crate) fn get_persistent_rel_size_v2_migrated_at(&self) -> Option { - self.rel_dir_migration_status - .as_ref() - .and_then(|x| x.v1_disabled_at) - } } // The format of TwoPhaseDirectory changed in PostgreSQL v17, because the filenames of diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 4a22f799ce..c11850fbd0 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -5090,7 +5090,8 @@ impl TenantShard { src_timeline.pg_version, ); - let (rel_size_v2_status, rel_size_migrated_at) = src_timeline.get_rel_size_v2_cached_status(); + let (rel_size_v2_status, rel_size_migrated_at) = + src_timeline.get_rel_size_v2_cached_status(); let (uninitialized_timeline, _timeline_ctx) = self .prepare_new_timeline( dst_id, diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs index c9eefca2a5..d471e9fc69 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs @@ -177,7 +177,6 @@ impl Planner { .iter() .map(|db| ((db.spcnode, db.dboid), true)) .collect(), - rel_dir_migration_status: None, })?); self.tasks .push(ImportSingleKeyTask::new(DBDIR_KEY, dbdir_buf).into()); diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 3acf98b020..6a7f126daf 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -135,7 +135,7 @@ pub enum WalIngestErrorKind { #[error(transparent)] EncodeAuxFileError(anyhow::Error), #[error(transparent)] - MaybeRelSizeV2Error(anyhow::Error), + RelSizeV2Error(anyhow::Error), #[error("timeline shutting down")] Cancelled,