From f6b18915647f47ffaf95093fe9137c088da7e4a9 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Mon, 28 Jul 2025 16:58:18 -0400 Subject: [PATCH] feat(pageserver): enable reldirv2 by default in regress tests, try 2 Signed-off-by: Alex Chi Z --- control_plane/src/pageserver.rs | 5 + libs/pageserver_api/src/config.rs | 4 + libs/pageserver_api/src/models.rs | 18 ++- pageserver/src/pgdatadir_mapping.rs | 137 ++++++++++++------ .../tenant/remote_timeline_client/index.rs | 4 +- pageserver/src/tenant/timeline.rs | 25 +++- pageserver/src/walingest.rs | 2 +- test_runner/fixtures/neon_fixtures.py | 4 +- .../performance/test_perf_many_relations.py | 11 +- .../regress/test_attach_tenant_config.py | 3 +- test_runner/regress/test_compatibility.py | 10 +- test_runner/regress/test_pg_regress.py | 5 +- 12 files changed, 161 insertions(+), 67 deletions(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 843ead807d..887bfc2c76 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -571,6 +571,11 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'basebackup_cache_enabled' as bool")?, + rel_size_v1_access_disabled: settings + .remove("rel_size_v1_access_disabled") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'rel_size_v1_access_disabled' as bool")?, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 2a8d05f51e..35f33621d7 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -651,6 +651,9 @@ pub struct TenantConfigToml { // FIXME: Remove skip_serializing_if when the feature is stable. #[serde(skip_serializing_if = "std::ops::Not::not")] pub basebackup_cache_enabled: bool, + + #[serde(skip_serializing_if = "std::ops::Not::not")] + pub rel_size_v1_access_disabled: bool, } pub mod defaults { @@ -959,6 +962,7 @@ impl Default for TenantConfigToml { sampling_ratio: None, relsize_snapshot_cache_capacity: DEFAULT_RELSIZE_SNAPSHOT_CACHE_CAPACITY, basebackup_cache_enabled: false, + rel_size_v1_access_disabled: false, } } } diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 230c1f46ea..67d4685183 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -646,6 +646,8 @@ pub struct TenantConfigPatch { pub relsize_snapshot_cache_capacity: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub basebackup_cache_enabled: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub rel_size_v1_access_disabled: FieldPatch, } /// Like [`crate::config::TenantConfigToml`], but preserves the information @@ -783,6 +785,9 @@ pub struct TenantConfig { #[serde(skip_serializing_if = "Option::is_none")] pub basebackup_cache_enabled: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + pub rel_size_v1_access_disabled: Option, } impl TenantConfig { @@ -830,6 +835,7 @@ impl TenantConfig { mut sampling_ratio, mut relsize_snapshot_cache_capacity, mut basebackup_cache_enabled, + mut rel_size_v1_access_disabled, } = self; patch.checkpoint_distance.apply(&mut checkpoint_distance); @@ -939,6 +945,9 @@ impl TenantConfig { patch .basebackup_cache_enabled .apply(&mut basebackup_cache_enabled); + patch + .rel_size_v1_access_disabled + .apply(&mut rel_size_v1_access_disabled); Ok(Self { checkpoint_distance, @@ -980,6 +989,7 @@ impl TenantConfig { sampling_ratio, relsize_snapshot_cache_capacity, basebackup_cache_enabled, + rel_size_v1_access_disabled, }) } @@ -1094,6 +1104,9 @@ impl TenantConfig { basebackup_cache_enabled: self .basebackup_cache_enabled .unwrap_or(global_conf.basebackup_cache_enabled), + rel_size_v1_access_disabled: self + .rel_size_v1_access_disabled + .unwrap_or(global_conf.rel_size_v1_access_disabled), } } } @@ -1526,12 +1539,15 @@ pub struct OffloadedTimelineInfo { pub archived_at: chrono::DateTime, } -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +/// The state of the rel size migration. This is persisted in the DbDir key and index part. Do not change without considering +/// compatibility. +#[derive(Default, Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] #[serde(rename_all = "camelCase")] pub enum RelSizeMigration { /// The tenant is using the old rel_size format. /// Note that this enum is persisted as `Option` in the index part, so /// `None` is the same as `Some(RelSizeMigration::Legacy)`. + #[default] Legacy, /// The tenant is migrating to the new rel_size format. Both old and new rel_size format are /// persisted in the storage. The read path will read both formats and validate them. diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index cedf77fb37..a45d560aaf 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -720,9 +720,12 @@ impl Timeline { "inconsistent v1/v2 reldir keyspace for rel {}: v1_exists={}, v2_exists={}", tag, v1_exists, - v2_exists + v2_exists, ); } + Err(e) if e.is_cancel() => { + // Cancellation errors are fine to ignore, do not log. + } Err(e) => { tracing::warn!("failed to get rel exists in v2: {e}"); } @@ -811,11 +814,11 @@ impl Timeline { forknum: key.field5, }; if val == RelDirExists::Removed { - debug_assert!(!rels.contains(&tag), "removed reltag in v2"); + debug_assert!(!rels.contains(&tag), "removed reltag in v2: {tag}"); continue; } let did_not_contain = rels.insert(tag); - debug_assert!(did_not_contain, "duplicate reltag in v2"); + debug_assert!(did_not_contain, "duplicate reltag in v2: {tag}"); } Ok(rels) } @@ -863,6 +866,9 @@ impl Timeline { rels_v2.len() ); } + Err(e) if e.is_cancel() => { + // Cancellation errors are fine to ignore, do not log. + } Err(e) => { tracing::warn!("failed to list rels in v2: {e}"); } @@ -1724,6 +1730,8 @@ pub struct RelDirMode { current_status: RelSizeMigration, // Whether we should initialize the v2 keyspace or not. initialize: bool, + // Whether we should disable v1 access starting this LSNor not + disable_v1: bool, } impl DatadirModification<'_> { @@ -2085,44 +2093,82 @@ impl DatadirModification<'_> { /// /// As this function is only used on the write path, we do not need to read the migrated_at /// field. - pub fn maybe_enable_rel_size_v2(&mut self, is_create: bool) -> anyhow::Result { + pub(crate) fn maybe_enable_rel_size_v2( + &mut self, + lsn: Lsn, + is_create: bool, + ) -> anyhow::Result { // TODO: define the behavior of the tenant-level config flag and use feature flag to enable this feature + let expected_status = self.tline.get_rel_size_v2_expected_state(); + let (persistent_status, migrated_at) = self.tline.get_rel_size_v2_status(); - let (status, _) = self.tline.get_rel_size_v2_status(); - let config = self.tline.get_rel_size_v2_enabled(); - match (config, status) { - (false, RelSizeMigration::Legacy) => { + // migrated_at LSN means the LSN where we "enable_v2" (but not "disable_v1") + if let Some(migrated_at) = migrated_at + && lsn <= migrated_at + && persistent_status == RelSizeMigration::Migrating + { + // Revert the status to legacy if the write head LSN is before the migrating LSN. + self.tline + .update_rel_size_v2_status(RelSizeMigration::Legacy, None)?; + } + // TODO: what if the migration is at "migrated" state but we need to revert? + + // Only initialize the v2 keyspace on new relation creation. No initialization + // during `timeline_create` (TODO: fix this, we should allow, but currently it + // hits expected consistency issues; need to ignore warnings). + let can_update = is_create && !self.is_importing_pgdata; + + match (expected_status, persistent_status) { + (RelSizeMigration::Legacy, RelSizeMigration::Legacy) => { // tenant config didn't enable it and we didn't write any reldir_v2 key yet Ok(RelDirMode { current_status: RelSizeMigration::Legacy, initialize: false, + disable_v1: false, }) } - (false, status @ RelSizeMigration::Migrating | status @ RelSizeMigration::Migrated) => { - // index_part already persisted that the timeline has enabled rel_size_v2 + ( + RelSizeMigration::Legacy, + current_status @ RelSizeMigration::Migrating + | current_status @ RelSizeMigration::Migrated, + ) => { + // already persisted that the timeline has enabled rel_size_v2, cannot rollback Ok(RelDirMode { - current_status: status, + current_status, initialize: false, + disable_v1: false, }) } - (true, RelSizeMigration::Legacy) => { + ( + expected_status @ RelSizeMigration::Migrating + | expected_status @ RelSizeMigration::Migrated, + RelSizeMigration::Legacy, + ) => { // The first time we enable it, we need to persist it in `index_part.json` // The caller should update the reldir status once the initialization is done. - // - // Only initialize the v2 keyspace on new relation creation. No initialization - // during `timeline_create` (TODO: fix this, we should allow, but currently it - // hits consistency issues). + Ok(RelDirMode { current_status: RelSizeMigration::Legacy, - initialize: is_create && !self.is_importing_pgdata, + initialize: can_update, + disable_v1: can_update && expected_status == RelSizeMigration::Migrated, }) } - (true, status @ RelSizeMigration::Migrating | status @ RelSizeMigration::Migrated) => { - // index_part already persisted that the timeline has enabled rel_size_v2 - // and we don't need to do anything + (RelSizeMigration::Migrating, current_status @ RelSizeMigration::Migrating) + | (RelSizeMigration::Migrated, current_status @ RelSizeMigration::Migrated) + | (RelSizeMigration::Migrating, current_status @ RelSizeMigration::Migrated) => { + // Keep the current state Ok(RelDirMode { - current_status: status, + current_status, initialize: false, + disable_v1: false, + }) + } + (RelSizeMigration::Migrated, RelSizeMigration::Migrating) => { + // Switch to v2-only mode + Ok(RelDirMode { + current_status: RelSizeMigration::Migrating, + initialize: false, + disable_v1: can_update, }) } } @@ -2137,8 +2183,8 @@ impl DatadirModification<'_> { ctx: &RequestContext, ) -> Result<(), WalIngestError> { let v2_mode = self - .maybe_enable_rel_size_v2(false) - .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?; + .maybe_enable_rel_size_v2(self.lsn, false) + .map_err(WalIngestErrorKind::RelSizeV2Error)?; // Add it to the directory (if it doesn't exist already) let buf = self.get(DBDIR_KEY, ctx).await?; @@ -2290,15 +2336,6 @@ impl DatadirModification<'_> { sparse_rel_dir_key, Value::Image(RelDirExists::Exists.encode()), ); - tracing::info!( - "migrated rel_size_v2: {}", - RelTag { - spcnode, - dbnode, - relnode, - forknum - } - ); rel_cnt += 1; } } @@ -2307,9 +2344,6 @@ impl DatadirModification<'_> { self.lsn, rel_cnt ); - self.tline - .update_rel_size_v2_status(RelSizeMigration::Migrating, Some(self.lsn)) - .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?; Ok::<_, WalIngestError>(()) } @@ -2392,25 +2426,25 @@ impl DatadirModification<'_> { // It's possible that this is the first rel for this db in this // tablespace. Create the reldir entry for it if so. let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?; + let mut is_dbdir_dirty = false; let dbdir_exists = if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) { // Didn't exist. Update dbdir e.insert(false); - let buf = DbDirectory::ser(&dbdir)?; self.pending_directory_entries.push(( DirectoryKind::Db, MetricsUpdate::Set(dbdir.dbdirs.len() as u64), )); - self.put(DBDIR_KEY, Value::Image(buf.into())); + is_dbdir_dirty = true; false } else { true }; let mut v2_mode = self - .maybe_enable_rel_size_v2(true) - .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?; + .maybe_enable_rel_size_v2(self.lsn, true) + .map_err(WalIngestErrorKind::RelSizeV2Error)?; if v2_mode.initialize { if let Err(e) = self.initialize_rel_size_v2_keyspace(ctx, &dbdir).await { @@ -2418,9 +2452,24 @@ impl DatadirModification<'_> { // TODO: circuit breaker so that it won't retry forever } else { v2_mode.current_status = RelSizeMigration::Migrating; + self.tline + .update_rel_size_v2_status(RelSizeMigration::Migrating, Some(self.lsn)) + .map_err(WalIngestErrorKind::RelSizeV2Error)?; } } + if v2_mode.disable_v1 { + v2_mode.current_status = RelSizeMigration::Migrated; + self.tline + .update_rel_size_v2_status(RelSizeMigration::Migrated, Some(self.lsn)) + .map_err(WalIngestErrorKind::RelSizeV2Error)?; + } + + if is_dbdir_dirty { + let buf = DbDirectory::ser(&dbdir)?; + self.put(DBDIR_KEY, Value::Image(buf.into())); + } + if v2_mode.current_status != RelSizeMigration::Migrated { self.put_rel_creation_v1(rel, dbdir_exists, ctx).await?; } @@ -2581,8 +2630,8 @@ impl DatadirModification<'_> { ctx: &RequestContext, ) -> Result<(), WalIngestError> { let v2_mode = self - .maybe_enable_rel_size_v2(false) - .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?; + .maybe_enable_rel_size_v2(self.lsn, false) + .map_err(WalIngestErrorKind::RelSizeV2Error)?; match v2_mode.current_status { RelSizeMigration::Legacy => { self.put_rel_drop_v1(drop_relations, ctx).await?; @@ -2600,6 +2649,12 @@ impl DatadirModification<'_> { ); } } + Err(WalIngestError { + kind: WalIngestErrorKind::Cancelled, + .. + }) => { + // Cancellation errors are fine to ignore, do not log. + } Err(e) => { tracing::warn!("error dropping rels: {}", e); } diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 9531e7f650..dfdfb0d835 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -83,6 +83,7 @@ pub struct IndexPart { #[serde(skip_serializing_if = "Option::is_none", default)] pub(crate) last_aux_file_policy: Option, + /// Deprecated: the field is not used anymore and the source of truth is now stored in the dbdir key. #[serde(skip_serializing_if = "Option::is_none", default)] pub(crate) rel_size_migration: Option, @@ -115,8 +116,7 @@ pub struct IndexPart { #[serde(skip_serializing_if = "Option::is_none", default)] pub(crate) marked_invisible_at: Option, - /// The LSN at which we started the rel size migration. Accesses below this LSN should be - /// processed with the v1 read path. Usually this LSN should be set together with `rel_size_migration`. + /// Deprecated: the field is not used anymore and the source of truth is now stored in the dbdir key. #[serde(skip_serializing_if = "Option::is_none", default)] pub(crate) rel_size_migrated_at: Option, } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2c70c5cfa5..ce0ae18ff9 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2883,15 +2883,27 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } - /// Returns `true` if the rel_size_v2 config is enabled. NOTE: the write path and read path - /// should look at `get_rel_size_v2_status()` to get the actual status of the timeline. It is - /// possible that the index part persists the state while the config doesn't get persisted. - pub(crate) fn get_rel_size_v2_enabled(&self) -> bool { + /// Returns the expected state of the rel size migration. The actual state is persisted in the + /// DbDir key. + /// + /// The expected state is the state that the tenant config expects. + pub(crate) fn get_rel_size_v2_expected_state(&self) -> RelSizeMigration { let tenant_conf = self.tenant_conf.load(); - tenant_conf + let v2_enabled = tenant_conf .tenant_conf .rel_size_v2_enabled - .unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled) + .unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled); + let v1_access_disabled = tenant_conf + .tenant_conf + .rel_size_v1_access_disabled + .unwrap_or(self.conf.default_tenant_conf.rel_size_v1_access_disabled); + + match (v2_enabled, v1_access_disabled) { + (true, false) => RelSizeMigration::Migrating, + (true, true) => RelSizeMigration::Migrated, + (false, true) => RelSizeMigration::Legacy, // This should never happen + (false, false) => RelSizeMigration::Legacy, + } } pub(crate) fn get_rel_size_v2_status(&self) -> (RelSizeMigration, Option) { @@ -3433,6 +3445,7 @@ impl Timeline { Some(rel_size_v2_status.clone()), rel_size_migrated_at, ))); + // The index_part upload is not used as source of truth anymore, but we still need to upload it to make it work across branches. self.remote_client .schedule_index_upload_for_rel_size_v2_status_update( rel_size_v2_status, diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 3acf98b020..6a7f126daf 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -135,7 +135,7 @@ pub enum WalIngestErrorKind { #[error(transparent)] EncodeAuxFileError(anyhow::Error), #[error(transparent)] - MaybeRelSizeV2Error(anyhow::Error), + RelSizeV2Error(anyhow::Error), #[error("timeline shutting down")] Cancelled, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 7f59547c73..6630c2716c 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1312,8 +1312,8 @@ class NeonEnv: ) tenant_config = ps_cfg.setdefault("tenant_config", {}) - # This feature is pending rollout. - # tenant_config["rel_size_v2_enabled"] = True + # Enable relsize_v2 by default in tests. + tenant_config["rel_size_v2_enabled"] = True # Test authors tend to forget about the default 10min initial lease deadline # when writing tests, which turns their immediate gc requests via mgmt API diff --git a/test_runner/performance/test_perf_many_relations.py b/test_runner/performance/test_perf_many_relations.py index 9204a6a740..87316b28e5 100644 --- a/test_runner/performance/test_perf_many_relations.py +++ b/test_runner/performance/test_perf_many_relations.py @@ -80,7 +80,12 @@ def test_perf_simple_many_relations_reldir( """ Test creating many relations in a single database. """ - env = neon_env_builder.init_start(initial_tenant_conf={"rel_size_v2_enabled": reldir != "v1"}) + env = neon_env_builder.init_start( + initial_tenant_conf={ + "rel_size_v2_enabled": reldir != "v1", + "rel_size_v1_access_disabled": reldir == "v2", + } + ) ep = env.endpoints.create_start( "main", config_lines=[ @@ -108,10 +113,6 @@ def test_perf_simple_many_relations_reldir( == "migrating" ) elif reldir == "v2": - # only read/write to the v2 keyspace - env.pageserver.http_client().timeline_patch_index_part( - env.initial_tenant, env.initial_timeline, {"rel_size_migration": "migrated"} - ) assert ( env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[ "rel_size_migration" diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index eaaa3014a5..3a90a72a98 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -183,7 +183,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "lsn_lease_length": "1m", "lsn_lease_length_for_ts": "5s", "timeline_offloading": False, - "rel_size_v2_enabled": True, + "rel_size_v2_enabled": False, "relsize_snapshot_cache_capacity": 10000, "gc_compaction_enabled": False, "gc_compaction_verification": False, @@ -194,6 +194,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "numerator": 0, "denominator": 10, }, + "rel_size_v1_access_disabled": True, } vps_http = env.storage_controller.pageserver_api() diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 635a040800..539bbcaab8 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -581,12 +581,10 @@ def test_historic_storage_formats( # This dataset was created at a time where we decided to migrate to v2 reldir by simply disabling writes to v1 # and starting writing to v2. This was too risky and we have reworked the migration plan. Therefore, we should # opt in full relv2 mode for this dataset. - for timeline in timelines: - env.pageserver.http_client().timeline_patch_index_part( - dataset.tenant_id, - timeline["timeline_id"], - {"force_index_update": True, "rel_size_migration": "migrated"}, - ) + env.pageserver.http_client().patch_tenant_config( + dataset.tenant_id, + {"rel_size_v1_access_disabled": True, "rel_size_v2_enabled": True}, + ) # Import tenant does not create the timeline on safekeepers, # because it is a debug handler and the timeline may have already been diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index cc7f736239..c2a7c02f2f 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -123,9 +123,10 @@ def post_checks(env: NeonEnv, test_output_dir: Path, db_name: str, endpoint: End def patch_tenant_conf(tenant_conf: dict[str, Any], reldir_type: str) -> dict[str, Any]: tenant_conf = tenant_conf.copy() if reldir_type == "v2": - tenant_conf["rel_size_v2_enabled"] = "true" + tenant_conf["rel_size_v2_enabled"] = True + tenant_conf["rel_size_v1_access_disabled"] = True else: - tenant_conf["rel_size_v2_enabled"] = "false" + tenant_conf["rel_size_v2_enabled"] = False return tenant_conf