diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 843ead807d..887bfc2c76 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -571,6 +571,11 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'basebackup_cache_enabled' as bool")?, + rel_size_v1_access_disabled: settings + .remove("rel_size_v1_access_disabled") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'rel_size_v1_access_disabled' as bool")?, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 2a8d05f51e..35f33621d7 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -651,6 +651,9 @@ pub struct TenantConfigToml { // FIXME: Remove skip_serializing_if when the feature is stable. #[serde(skip_serializing_if = "std::ops::Not::not")] pub basebackup_cache_enabled: bool, + + #[serde(skip_serializing_if = "std::ops::Not::not")] + pub rel_size_v1_access_disabled: bool, } pub mod defaults { @@ -959,6 +962,7 @@ impl Default for TenantConfigToml { sampling_ratio: None, relsize_snapshot_cache_capacity: DEFAULT_RELSIZE_SNAPSHOT_CACHE_CAPACITY, basebackup_cache_enabled: false, + rel_size_v1_access_disabled: false, } } } diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 230c1f46ea..9a2478ae16 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -646,6 +646,8 @@ pub struct TenantConfigPatch { pub relsize_snapshot_cache_capacity: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub basebackup_cache_enabled: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub rel_size_v1_access_disabled: FieldPatch, } /// Like [`crate::config::TenantConfigToml`], but preserves the information @@ -783,6 +785,9 @@ pub struct TenantConfig { #[serde(skip_serializing_if = "Option::is_none")] pub basebackup_cache_enabled: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + pub rel_size_v1_access_disabled: Option, } impl TenantConfig { @@ -830,6 +835,7 @@ impl TenantConfig { mut sampling_ratio, mut relsize_snapshot_cache_capacity, mut basebackup_cache_enabled, + mut rel_size_v1_access_disabled, } = self; patch.checkpoint_distance.apply(&mut checkpoint_distance); @@ -939,6 +945,7 @@ impl TenantConfig { patch .basebackup_cache_enabled .apply(&mut basebackup_cache_enabled); + patch.rel_size_v1_access_disabled.apply(&mut rel_size_v1_access_disabled); Ok(Self { checkpoint_distance, @@ -980,6 +987,7 @@ impl TenantConfig { sampling_ratio, relsize_snapshot_cache_capacity, basebackup_cache_enabled, + rel_size_v1_access_disabled, }) } @@ -1094,6 +1102,9 @@ impl TenantConfig { basebackup_cache_enabled: self .basebackup_cache_enabled .unwrap_or(global_conf.basebackup_cache_enabled), + rel_size_v1_access_disabled: self + .rel_size_v1_access_disabled + .unwrap_or(global_conf.rel_size_v1_access_disabled), } } } @@ -1526,12 +1537,15 @@ pub struct OffloadedTimelineInfo { pub archived_at: chrono::DateTime, } -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +/// The state of the rel size migration. This is persisted in the DbDir key and index part. Do not change without considering +/// compatibility. +#[derive(Default, Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] #[serde(rename_all = "camelCase")] pub enum RelSizeMigration { /// The tenant is using the old rel_size format. /// Note that this enum is persisted as `Option` in the index part, so /// `None` is the same as `Some(RelSizeMigration::Legacy)`. + #[default] Legacy, /// The tenant is migrating to the new rel_size format. Both old and new rel_size format are /// persisted in the storage. The read path will read both formats and validate them. diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 669eeffa32..8eb98759a1 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -484,7 +484,7 @@ async fn build_timeline_info_common( *timeline.get_applied_gc_cutoff_lsn(), ); - let (rel_size_migration, rel_size_migrated_at) = timeline.get_rel_size_v2_status(); + let (rel_size_migration, rel_size_migrated_at) = timeline.get_rel_size_v2_cached_status(); let info = TimelineInfo { tenant_id: timeline.tenant_shard_id, diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index fd54b3e31f..e77b86b6cc 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -685,12 +685,13 @@ impl Timeline { // then check if the database was already initialized. // get_rel_exists can be called before dbdir is created. let buf = version.get(self, DBDIR_KEY, ctx).await?; - let dbdirs = DbDirectory::des(&buf)?.dbdirs; + let dbdir = DbDirectory::des(&buf)?; + let dbdirs = &dbdir.dbdirs; if !dbdirs.contains_key(&(tag.spcnode, tag.dbnode)) { return Ok(false); } - let (v2_status, migrated_lsn) = self.get_rel_size_v2_status(); + let v2_status = dbdir.get_persistent_rel_size_v2_status(); match v2_status { RelSizeMigration::Legacy => { @@ -699,15 +700,6 @@ impl Timeline { .await?; Ok(v1_exists) } - RelSizeMigration::Migrating | RelSizeMigration::Migrated - if version.get_lsn() < migrated_lsn.unwrap_or(Lsn(0)) => - { - // For requests below the migrated LSN, we still use the v1 read path. - let v1_exists = self - .get_rel_exists_in_reldir_v1(tag, version, deserialized_reldir_v1, ctx) - .await?; - Ok(v1_exists) - } RelSizeMigration::Migrating => { let v1_exists = self .get_rel_exists_in_reldir_v1(tag, version, deserialized_reldir_v1, ctx) @@ -720,7 +712,7 @@ impl Timeline { "inconsistent v1/v2 reldir keyspace for rel {}: v1_exists={}, v2_exists={}", tag, v1_exists, - v2_exists + v2_exists, ); } Err(e) if e.is_cancel() => { @@ -789,6 +781,10 @@ impl Timeline { key )) })?; + if key.field6 == 0 { + // The key that determines the current migration status, skip it. + continue; + } if key.field6 != 1 { return Err(PageReconstructError::Other(anyhow::anyhow!( "invalid reldir key: field6 != 1, {}", @@ -838,20 +834,14 @@ impl Timeline { version: Version<'_>, ctx: &RequestContext, ) -> Result, PageReconstructError> { - let (v2_status, migrated_lsn) = self.get_rel_size_v2_status(); + let dbdir = DbDirectory::des(&version.get(self, DBDIR_KEY, ctx).await?)?; + let v2_status = dbdir.get_persistent_rel_size_v2_status(); match v2_status { RelSizeMigration::Legacy => { let rels_v1 = self.list_rels_v1(spcnode, dbnode, version, ctx).await?; Ok(rels_v1) } - RelSizeMigration::Migrating | RelSizeMigration::Migrated - if version.get_lsn() < migrated_lsn.unwrap_or(Lsn(0)) => - { - // For requests below the migrated LSN, we still use the v1 read path. - let rels_v1 = self.list_rels_v1(spcnode, dbnode, version, ctx).await?; - Ok(rels_v1) - } RelSizeMigration::Migrating => { let rels_v1 = self.list_rels_v1(spcnode, dbnode, version, ctx).await?; let rels_v2_res = self.list_rels_v2(spcnode, dbnode, version, ctx).await; @@ -1730,6 +1720,8 @@ pub struct RelDirMode { current_status: RelSizeMigration, // Whether we should initialize the v2 keyspace or not. initialize: bool, + // Whether we should disable v1 access starting this LSNor not + disable_v1: bool, } impl DatadirModification<'_> { @@ -1810,6 +1802,7 @@ impl DatadirModification<'_> { pub fn init_empty(&mut self) -> anyhow::Result<()> { let buf = DbDirectory::ser(&DbDirectory { dbdirs: HashMap::new(), + rel_dir_migration_status: None, })?; self.pending_directory_entries .push((DirectoryKind::Db, MetricsUpdate::Set(0))); @@ -2091,44 +2084,71 @@ impl DatadirModification<'_> { /// /// As this function is only used on the write path, we do not need to read the migrated_at /// field. - pub fn maybe_enable_rel_size_v2(&mut self, is_create: bool) -> anyhow::Result { + pub(crate) fn maybe_enable_rel_size_v2( + &mut self, + dbdir: &DbDirectory, + is_create: bool, + ) -> anyhow::Result { // TODO: define the behavior of the tenant-level config flag and use feature flag to enable this feature + let expected_status = self.tline.get_rel_size_v2_expected_state(); + let persistent_status = dbdir.get_persistent_rel_size_v2_status(); - let (status, _) = self.tline.get_rel_size_v2_status(); - let config = self.tline.get_rel_size_v2_enabled(); - match (config, status) { - (false, RelSizeMigration::Legacy) => { + // Only initialize the v2 keyspace on new relation creation. No initialization + // during `timeline_create` (TODO: fix this, we should allow, but currently it + // hits consistency issues). + let can_update = is_create && !self.is_importing_pgdata; + + match (expected_status, persistent_status) { + (RelSizeMigration::Legacy, RelSizeMigration::Legacy) => { // tenant config didn't enable it and we didn't write any reldir_v2 key yet Ok(RelDirMode { current_status: RelSizeMigration::Legacy, initialize: false, + disable_v1: false, }) } - (false, status @ RelSizeMigration::Migrating | status @ RelSizeMigration::Migrated) => { - // index_part already persisted that the timeline has enabled rel_size_v2 + ( + RelSizeMigration::Legacy, + current_status @ RelSizeMigration::Migrating + | current_status @ RelSizeMigration::Migrated, + ) => { + // already persisted that the timeline has enabled rel_size_v2, cannot rollback Ok(RelDirMode { - current_status: status, + current_status, initialize: false, + disable_v1: false, }) } - (true, RelSizeMigration::Legacy) => { + ( + expected_status @ RelSizeMigration::Migrating + | expected_status @ RelSizeMigration::Migrated, + RelSizeMigration::Legacy, + ) => { // The first time we enable it, we need to persist it in `index_part.json` // The caller should update the reldir status once the initialization is done. - // - // Only initialize the v2 keyspace on new relation creation. No initialization - // during `timeline_create` (TODO: fix this, we should allow, but currently it - // hits consistency issues). + Ok(RelDirMode { current_status: RelSizeMigration::Legacy, - initialize: is_create && !self.is_importing_pgdata, + initialize: can_update, + disable_v1: can_update && expected_status == RelSizeMigration::Migrated, }) } - (true, status @ RelSizeMigration::Migrating | status @ RelSizeMigration::Migrated) => { - // index_part already persisted that the timeline has enabled rel_size_v2 - // and we don't need to do anything + (RelSizeMigration::Migrating, current_status @ RelSizeMigration::Migrating) + | (RelSizeMigration::Migrated, current_status @ RelSizeMigration::Migrated) + | (RelSizeMigration::Migrating, current_status @ RelSizeMigration::Migrated) => { + // Keep the current state Ok(RelDirMode { - current_status: status, + current_status, initialize: false, + disable_v1: false, + }) + } + (RelSizeMigration::Migrated, RelSizeMigration::Migrating) => { + // Switch to v2-only mode + Ok(RelDirMode { + current_status: RelSizeMigration::Migrating, + initialize: false, + disable_v1: can_update, }) } } @@ -2142,14 +2162,14 @@ impl DatadirModification<'_> { img: Bytes, ctx: &RequestContext, ) -> Result<(), WalIngestError> { - let v2_mode = self - .maybe_enable_rel_size_v2(false) - .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?; - // Add it to the directory (if it doesn't exist already) let buf = self.get(DBDIR_KEY, ctx).await?; let mut dbdir = DbDirectory::des(&buf)?; + let v2_mode = self + .maybe_enable_rel_size_v2(&dbdir, false) + .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?; + let r = dbdir.dbdirs.insert((spcnode, dbnode), true); if r.is_none() || r == Some(false) { // The dbdir entry didn't exist, or it contained a @@ -2296,15 +2316,6 @@ impl DatadirModification<'_> { sparse_rel_dir_key, Value::Image(RelDirExists::Exists.encode()), ); - tracing::info!( - "migrated rel_size_v2: {}", - RelTag { - spcnode, - dbnode, - relnode, - forknum - } - ); rel_cnt += 1; } } @@ -2313,9 +2324,6 @@ impl DatadirModification<'_> { self.lsn, rel_cnt ); - self.tline - .update_rel_size_v2_status(RelSizeMigration::Migrating, Some(self.lsn)) - .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?; Ok::<_, WalIngestError>(()) } @@ -2399,23 +2407,25 @@ impl DatadirModification<'_> { // tablespace. Create the reldir entry for it if so. let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?; + let mut is_dbdir_dirty = false; + let mut is_reldirv2_index_part_dirty = false; + let dbdir_exists = if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) { // Didn't exist. Update dbdir e.insert(false); - let buf = DbDirectory::ser(&dbdir)?; self.pending_directory_entries.push(( DirectoryKind::Db, MetricsUpdate::Set(dbdir.dbdirs.len() as u64), )); - self.put(DBDIR_KEY, Value::Image(buf.into())); + is_dbdir_dirty = true; false } else { true }; let mut v2_mode = self - .maybe_enable_rel_size_v2(true) + .maybe_enable_rel_size_v2(&dbdir, true) .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?; if v2_mode.initialize { @@ -2424,8 +2434,35 @@ impl DatadirModification<'_> { // TODO: circuit breaker so that it won't retry forever } else { v2_mode.current_status = RelSizeMigration::Migrating; + let migration_history = dbdir.rel_dir_migration_status.get_or_insert_default(); + migration_history.status = Some(RelSizeMigration::Migrating); + migration_history.v2_enabled_at = Some(self.lsn); + is_dbdir_dirty = true; + is_reldirv2_index_part_dirty = true; } } + if v2_mode.disable_v1 { + v2_mode.current_status = RelSizeMigration::Migrated; + let migration_history = dbdir.rel_dir_migration_status.get_or_insert_default(); + migration_history.status = Some(RelSizeMigration::Migrated); + migration_history.v1_disabled_at = Some(self.lsn); + is_dbdir_dirty = true; + is_reldirv2_index_part_dirty = true; + } + + if is_dbdir_dirty { + let buf = DbDirectory::ser(&dbdir)?; + self.put(DBDIR_KEY, Value::Image(buf.into())); + } + + if is_reldirv2_index_part_dirty { + self.tline + .update_rel_size_v2_status( + dbdir.get_persistent_rel_size_v2_status(), + dbdir.get_persistent_rel_size_v2_migrated_at(), + ) + .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?; + } if v2_mode.current_status != RelSizeMigration::Migrated { self.put_rel_creation_v1(rel, dbdir_exists, ctx).await?; @@ -2586,8 +2623,9 @@ impl DatadirModification<'_> { drop_relations: HashMap<(u32, u32), Vec>, ctx: &RequestContext, ) -> Result<(), WalIngestError> { + let dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?; let v2_mode = self - .maybe_enable_rel_size_v2(false) + .maybe_enable_rel_size_v2(&dbdir, false) .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?; match v2_mode.current_status { RelSizeMigration::Legacy => { @@ -3161,10 +3199,33 @@ impl Version<'_> { //--- Metadata structs stored in key-value pairs in the repository. +#[derive(Debug, Clone, Serialize, Deserialize, Default)] +pub(crate) struct RelSizeMigrationHistory { + pub(crate) status: Option, + pub(crate) v2_enabled_at: Option, + pub(crate) v1_disabled_at: Option, +} + #[derive(Debug, Serialize, Deserialize)] pub(crate) struct DbDirectory { // (spcnode, dbnode) -> (do relmapper and PG_VERSION files exist) pub(crate) dbdirs: HashMap<(Oid, Oid), bool>, + pub(crate) rel_dir_migration_status: Option, +} + +impl DbDirectory { + pub(crate) fn get_persistent_rel_size_v2_status(&self) -> RelSizeMigration { + self.rel_dir_migration_status + .as_ref() + .and_then(|x| x.status.clone()) + .unwrap_or(RelSizeMigration::Legacy) + } + + pub(crate) fn get_persistent_rel_size_v2_migrated_at(&self) -> Option { + self.rel_dir_migration_status + .as_ref() + .and_then(|x| x.v1_disabled_at) + } } // The format of TwoPhaseDirectory changed in PostgreSQL v17, because the filenames of diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 91b717a2e9..4a22f799ce 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -5090,7 +5090,7 @@ impl TenantShard { src_timeline.pg_version, ); - let (rel_size_v2_status, rel_size_migrated_at) = src_timeline.get_rel_size_v2_status(); + let (rel_size_v2_status, rel_size_migrated_at) = src_timeline.get_rel_size_v2_cached_status(); let (uninitialized_timeline, _timeline_ctx) = self .prepare_new_timeline( dst_id, diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 9531e7f650..dfdfb0d835 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -83,6 +83,7 @@ pub struct IndexPart { #[serde(skip_serializing_if = "Option::is_none", default)] pub(crate) last_aux_file_policy: Option, + /// Deprecated: the field is not used anymore and the source of truth is now stored in the dbdir key. #[serde(skip_serializing_if = "Option::is_none", default)] pub(crate) rel_size_migration: Option, @@ -115,8 +116,7 @@ pub struct IndexPart { #[serde(skip_serializing_if = "Option::is_none", default)] pub(crate) marked_invisible_at: Option, - /// The LSN at which we started the rel size migration. Accesses below this LSN should be - /// processed with the v1 read path. Usually this LSN should be set together with `rel_size_migration`. + /// Deprecated: the field is not used anymore and the source of truth is now stored in the dbdir key. #[serde(skip_serializing_if = "Option::is_none", default)] pub(crate) rel_size_migrated_at: Option, } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2c70c5cfa5..8930b70799 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -441,7 +441,7 @@ pub struct Timeline { /// heatmap on demand. heatmap_layers_downloader: Mutex>, - pub(crate) rel_size_v2_status: ArcSwap<(Option, Option)>, + pub(crate) rel_size_v2_cached_status: ArcSwap<(Option, Option)>, wait_lsn_log_slow: tokio::sync::Semaphore, @@ -2883,19 +2883,33 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } - /// Returns `true` if the rel_size_v2 config is enabled. NOTE: the write path and read path - /// should look at `get_rel_size_v2_status()` to get the actual status of the timeline. It is - /// possible that the index part persists the state while the config doesn't get persisted. - pub(crate) fn get_rel_size_v2_enabled(&self) -> bool { + /// Returns the expected state of the rel size migration. The actual state is persisted in the + /// DbDir key. + /// + /// The expected state is the state that the tenant config expects. + pub(crate) fn get_rel_size_v2_expected_state(&self) -> RelSizeMigration { let tenant_conf = self.tenant_conf.load(); - tenant_conf + let v2_enabled = tenant_conf .tenant_conf .rel_size_v2_enabled - .unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled) + .unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled); + let v1_access_disabled = tenant_conf + .tenant_conf + .rel_size_v1_access_disabled + .unwrap_or(self.conf.default_tenant_conf.rel_size_v1_access_disabled); + + match (v2_enabled, v1_access_disabled) { + (true, false) => RelSizeMigration::Migrating, + (true, true) => RelSizeMigration::Migrated, + (false, true) => RelSizeMigration::Legacy, // This should never happen + (false, false) => RelSizeMigration::Legacy, + } } - pub(crate) fn get_rel_size_v2_status(&self) -> (RelSizeMigration, Option) { - let (status, migrated_at) = self.rel_size_v2_status.load().as_ref().clone(); + /// DO NOT use this API in the read/write path to determine the rel size migration status. The source of truth is the dbdir key. + /// This API is only used for the timeline info struct to get the latest cached status. + pub(crate) fn get_rel_size_v2_cached_status(&self) -> (RelSizeMigration, Option) { + let (status, migrated_at) = self.rel_size_v2_cached_status.load().as_ref().clone(); (status.unwrap_or(RelSizeMigration::Legacy), migrated_at) } @@ -3336,7 +3350,7 @@ impl Timeline { heatmap_layers_downloader: Mutex::new(None), - rel_size_v2_status: ArcSwap::from_pointee(( + rel_size_v2_cached_status: ArcSwap::from_pointee(( rel_size_v2_status, rel_size_migrated_at, )), @@ -3429,10 +3443,11 @@ impl Timeline { rel_size_v2_status: RelSizeMigration, rel_size_migrated_at: Option, ) -> anyhow::Result<()> { - self.rel_size_v2_status.store(Arc::new(( + self.rel_size_v2_cached_status.store(Arc::new(( Some(rel_size_v2_status.clone()), rel_size_migrated_at, ))); + // The index_part upload is not used as source of truth anymore, but we still need to upload it to make it work across branches. self.remote_client .schedule_index_upload_for_rel_size_v2_status_update( rel_size_v2_status, diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs index d471e9fc69..c9eefca2a5 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs @@ -177,6 +177,7 @@ impl Planner { .iter() .map(|db| ((db.spcnode, db.dboid), true)) .collect(), + rel_dir_migration_status: None, })?); self.tasks .push(ImportSingleKeyTask::new(DBDIR_KEY, dbdir_buf).into());