From 6d0976dad5517531a2163ddd67e3d1e1b9cd9756 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 3 Mar 2025 16:05:43 -0500 Subject: [PATCH] feat(pageserver): persist reldir v2 migration status (#10980) ## Problem part of https://github.com/neondatabase/neon/issues/9516 ## Summary of changes Similar to the aux v2 migration, we persist the relv2 migration status into index_part, so that even if the config item is set to false, we will still read from the v2 storage to avoid loss of data. Note that only the two variants `None` and `Some(RelSizeMigration::Migrating)` are used for now. We don't have full migration implemented so it will never be set to `RelSizeMigration::Migrated`. --------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/models.rs | 19 ++++++++ pageserver/src/http/routes.rs | 1 + pageserver/src/pgdatadir_mapping.rs | 47 +++++++++++++++++-- pageserver/src/tenant.rs | 6 ++- .../src/tenant/remote_timeline_client.rs | 19 +++++++- .../tenant/remote_timeline_client/index.rs | 16 +------ pageserver/src/tenant/timeline.rs | 28 ++++++++++- pageserver/src/tenant/timeline/delete.rs | 1 + .../performance/test_perf_many_relations.py | 7 +++ test_runner/regress/test_pg_regress.py | 15 ++++++ test_runner/regress/test_relations.py | 44 ++++++++++++++++- 11 files changed, 178 insertions(+), 25 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index ea565e7769..fabfe28aa2 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1165,6 +1165,21 @@ pub struct OffloadedTimelineInfo { pub archived_at: chrono::DateTime, } +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub enum RelSizeMigration { + /// The tenant is using the old rel_size format. + /// Note that this enum is persisted as `Option` in the index part, so + /// `None` is the same as `Some(RelSizeMigration::Legacy)`. 
+ Legacy, + /// The tenant is migrating to the new rel_size format. Both old and new rel_size format are + /// persisted in the index part. The read path will read both formats and merge them. + Migrating, + /// The tenant has migrated to the new rel_size format. Only the new rel_size format is persisted + /// in the index part, and the read path will not read the old format. + Migrated, +} + /// This represents the output of the "timeline_detail" and "timeline_list" API calls. #[derive(Debug, Serialize, Deserialize, Clone)] pub struct TimelineInfo { @@ -1243,7 +1258,11 @@ pub struct TimelineInfo { // Forward compatibility: a previous version of the pageserver will receive a JSON. serde::Deserialize does // not deny unknown fields by default so it's safe to set the field to some value, though it won't be // read. + /// Whether the timeline is archived. pub is_archived: Option, + + /// The status of the rel_size migration. + pub rel_size_migration: Option, } #[derive(Debug, Clone, Serialize, Deserialize)] diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index a3ee31d6e6..cd79aa6680 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -481,6 +481,7 @@ async fn build_timeline_info_common( state, is_archived: Some(is_archived), + rel_size_migration: Some(timeline.get_rel_size_v2_status()), walreceiver_status, }; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index c10dfb4542..8aa96dd672 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -21,6 +21,7 @@ use pageserver_api::key::{ slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range, }; use pageserver_api::keyspace::SparseKeySpace; +use pageserver_api::models::RelSizeMigration; use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use pageserver_api::shard::ShardIdentity; @@ -492,7 +493,9 @@ impl 
Timeline { // Otherwise, read the old reldir keyspace. // TODO: if IndexPart::rel_size_migration is `Migrated`, we only need to read from v2. - if self.get_rel_size_v2_enabled() { + if let RelSizeMigration::Migrated | RelSizeMigration::Migrating = + self.get_rel_size_v2_status() + { // fetch directory listing (new) let key = rel_tag_sparse_key(tag.spcnode, tag.dbnode, tag.relnode, tag.forknum); let buf = RelDirExists::decode_option(version.sparse_get(self, key, ctx).await?) @@ -544,7 +547,7 @@ impl Timeline { forknum: *forknum, })); - if !self.get_rel_size_v2_enabled() { + if let RelSizeMigration::Legacy = self.get_rel_size_v2_status() { return Ok(rels_v1); } @@ -1720,6 +1723,35 @@ impl DatadirModification<'_> { Ok(()) } + /// Returns `true` if the rel_size_v2 write path is enabled. If it is the first time that + /// we enable it, we also need to persist it in `index_part.json`. + pub fn maybe_enable_rel_size_v2(&mut self) -> anyhow::Result { + let status = self.tline.get_rel_size_v2_status(); + let config = self.tline.get_rel_size_v2_enabled(); + match (config, status) { + (false, RelSizeMigration::Legacy) => { + // tenant config didn't enable it and we didn't write any reldir_v2 key yet + Ok(false) + } + (false, RelSizeMigration::Migrating | RelSizeMigration::Migrated) => { + // index_part already persisted that the timeline has enabled rel_size_v2 + Ok(true) + } + (true, RelSizeMigration::Legacy) => { + // The first time we enable it, we need to persist it in `index_part.json` + self.tline + .update_rel_size_v2_status(RelSizeMigration::Migrating)?; + tracing::info!("enabled rel_size_v2"); + Ok(true) + } + (true, RelSizeMigration::Migrating | RelSizeMigration::Migrated) => { + // index_part already persisted that the timeline has enabled rel_size_v2 + // and we don't need to do anything + Ok(true) + } + } + } + /// Store a relmapper file (pg_filenode.map) in the repository pub async fn put_relmap_file( &mut self, @@ -1728,6 +1760,8 @@ impl DatadirModification<'_> 
{ img: Bytes, ctx: &RequestContext, ) -> anyhow::Result<()> { + let v2_enabled = self.maybe_enable_rel_size_v2()?; + // Add it to the directory (if it doesn't exist already) let buf = self.get(DBDIR_KEY, ctx).await?; let mut dbdir = DbDirectory::des(&buf)?; @@ -1748,7 +1782,7 @@ impl DatadirModification<'_> { })?; self.pending_directory_entries .push((DirectoryKind::Rel, MetricsUpdate::Set(0))); - if self.tline.get_rel_size_v2_enabled() { + if v2_enabled { self.pending_directory_entries .push((DirectoryKind::RelV2, MetricsUpdate::Set(0))); } @@ -1905,7 +1939,9 @@ impl DatadirModification<'_> { return Err(RelationError::AlreadyExists); } - if self.tline.get_rel_size_v2_enabled() { + let v2_enabled = self.maybe_enable_rel_size_v2()?; + + if v2_enabled { let sparse_rel_dir_key = rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum); // check if the rel_dir_key exists in v2 @@ -2031,6 +2067,7 @@ impl DatadirModification<'_> { drop_relations: HashMap<(u32, u32), Vec>, ctx: &RequestContext, ) -> anyhow::Result<()> { + let v2_enabled = self.maybe_enable_rel_size_v2()?; for ((spc_node, db_node), rel_tags) in drop_relations { let dir_key = rel_dir_to_key(spc_node, db_node); let buf = self.get(dir_key, ctx).await?; @@ -2043,7 +2080,7 @@ impl DatadirModification<'_> { .push((DirectoryKind::Rel, MetricsUpdate::Sub(1))); dirty = true; true - } else if self.tline.get_rel_size_v2_enabled() { + } else if v2_enabled { // The rel is not found in the old reldir key, so we need to check the new sparse keyspace. // Note that a relation can only exist in one of the two keyspaces (guaranteed by the ingestion // logic). 
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 776e523c2e..fee007b2d7 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -31,8 +31,8 @@ use futures::StreamExt; use futures::stream::FuturesUnordered; use itertools::Itertools as _; use once_cell::sync::Lazy; -use pageserver_api::models; pub use pageserver_api::models::TenantState; +use pageserver_api::models::{self, RelSizeMigration}; use pageserver_api::models::{ CompactInfoResponse, LsnLease, TimelineArchivalState, TimelineState, TopTenantShardItem, WalRedoManagerStatus, @@ -1123,6 +1123,7 @@ impl Tenant { CreateTimelineCause::Load, idempotency.clone(), index_part.gc_compaction.clone(), + index_part.rel_size_migration.clone(), )?; let disk_consistent_lsn = timeline.get_disk_consistent_lsn(); anyhow::ensure!( @@ -4128,6 +4129,7 @@ impl Tenant { cause: CreateTimelineCause, create_idempotency: CreateTimelineIdempotency, gc_compaction_state: Option, + rel_size_v2_status: Option, ) -> anyhow::Result> { let state = match cause { CreateTimelineCause::Load => { @@ -4160,6 +4162,7 @@ impl Tenant { self.attach_wal_lag_cooldown.clone(), create_idempotency, gc_compaction_state, + rel_size_v2_status, self.cancel.child_token(), ); @@ -5231,6 +5234,7 @@ impl Tenant { CreateTimelineCause::Load, create_guard.idempotency.clone(), None, + None, ) .context("Failed to create timeline data structure")?; diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 4ba5844fea..2ca482ca43 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -194,7 +194,7 @@ pub(crate) use download::{ }; use index::GcCompactionState; pub(crate) use index::LayerFileMetadata; -use pageserver_api::models::TimelineArchivalState; +use pageserver_api::models::{RelSizeMigration, TimelineArchivalState}; use pageserver_api::shard::{ShardIndex, TenantShardId}; use regex::Regex; use remote_storage::{ @@ 
-900,7 +900,7 @@ impl RemoteTimelineClient { Ok(()) } - /// Launch an index-file upload operation in the background, setting `import_pgdata` field. + /// Launch an index-file upload operation in the background, setting `gc_compaction_state` field. pub(crate) fn schedule_index_upload_for_gc_compaction_state_update( self: &Arc, gc_compaction_state: GcCompactionState, @@ -912,6 +912,21 @@ impl RemoteTimelineClient { Ok(()) } + /// Launch an index-file upload operation in the background, setting `rel_size_v2_status` field. + pub(crate) fn schedule_index_upload_for_rel_size_v2_status_update( + self: &Arc, + rel_size_v2_status: RelSizeMigration, + ) -> anyhow::Result<()> { + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + upload_queue.dirty.rel_size_migration = Some(rel_size_v2_status); + // TODO: allow this operation to bypass the validation check because we might upload the index part + // with no layers but the flag updated. For now, we just modify the index part in memory and the next + // upload will include the flag. + // self.schedule_index_upload(upload_queue); + Ok(()) + } + /// /// Launch an index-file upload operation in the background, if necessary. 
/// diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index ceaed58bbd..16c38be907 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -7,6 +7,7 @@ use std::collections::HashMap; use chrono::NaiveDateTime; use pageserver_api::models::AuxFilePolicy; +use pageserver_api::models::RelSizeMigration; use pageserver_api::shard::ShardIndex; use serde::{Deserialize, Serialize}; use utils::id::TimelineId; @@ -117,21 +118,6 @@ pub struct GcCompactionState { pub(crate) last_completed_lsn: Lsn, } -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -#[serde(rename_all = "camelCase")] -pub enum RelSizeMigration { - /// The tenant is using the old rel_size format. - /// Note that this enum is persisted as `Option` in the index part, so - /// `None` is the same as `Some(RelSizeMigration::Legacy)`. - Legacy, - /// The tenant is migrating to the new rel_size format. Both old and new rel_size format are - /// persisted in the index part. The read path will read both formats and merge them. - Migrating, - /// The tenant has migrated to the new rel_size format. Only the new rel_size format is persisted - /// in the index part, and the read path will not read the old format. - Migrated, -} - impl IndexPart { /// When adding or modifying any parts of `IndexPart`, increment the version so that it can be /// used to understand later versions. 
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 17dbcee74e..7ed7910732 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -46,7 +46,7 @@ use pageserver_api::keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPart use pageserver_api::models::{ CompactKeyRange, CompactLsnRange, CompactionAlgorithm, CompactionAlgorithmSettings, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, - InMemoryLayerInfo, LayerMapInfo, LsnLease, PageTraceEvent, TimelineState, + InMemoryLayerInfo, LayerMapInfo, LsnLease, PageTraceEvent, RelSizeMigration, TimelineState, }; use pageserver_api::reltag::{BlockNumber, RelTag}; use pageserver_api::shard::{ShardIdentity, ShardIndex, ShardNumber, TenantShardId}; @@ -436,6 +436,8 @@ pub struct Timeline { /// May host a background Tokio task which downloads all the layers from the current /// heatmap on demand. heatmap_layers_downloader: Mutex>, + + pub(crate) rel_size_v2_status: ArcSwapOption, } pub(crate) enum PreviousHeatmap { @@ -2368,6 +2370,9 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } + /// Returns `true` if the rel_size_v2 config is enabled. NOTE: the write path and read path + /// should look at `get_rel_size_v2_status()` to get the actual status of the timeline. It is + /// possible that the index part persists the state while the config doesn't get persisted. 
pub(crate) fn get_rel_size_v2_enabled(&self) -> bool { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2376,6 +2381,14 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled) } + pub(crate) fn get_rel_size_v2_status(&self) -> RelSizeMigration { + self.rel_size_v2_status + .load() + .as_ref() + .map(|s| s.as_ref().clone()) + .unwrap_or(RelSizeMigration::Legacy) + } + fn get_compaction_upper_limit(&self) -> usize { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2636,6 +2649,7 @@ impl Timeline { attach_wal_lag_cooldown: Arc>, create_idempotency: crate::tenant::CreateTimelineIdempotency, gc_compaction_state: Option, + rel_size_v2_status: Option, cancel: CancellationToken, ) -> Arc { let disk_consistent_lsn = metadata.disk_consistent_lsn(); @@ -2794,6 +2808,8 @@ impl Timeline { previous_heatmap: ArcSwapOption::from_pointee(previous_heatmap), heatmap_layers_downloader: Mutex::new(None), + + rel_size_v2_status: ArcSwapOption::from_pointee(rel_size_v2_status), }; result.repartition_threshold = @@ -2870,6 +2886,16 @@ impl Timeline { .schedule_index_upload_for_gc_compaction_state_update(gc_compaction_state) } + pub(crate) fn update_rel_size_v2_status( + &self, + rel_size_v2_status: RelSizeMigration, + ) -> anyhow::Result<()> { + self.rel_size_v2_status + .store(Some(Arc::new(rel_size_v2_status.clone()))); + self.remote_client + .schedule_index_upload_for_rel_size_v2_status_update(rel_size_v2_status) + } + pub(crate) fn get_gc_compaction_state(&self) -> Option { self.gc_compaction_state.load_full().as_ref().clone() } diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 7cdc69e55f..c9666bb4e1 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -306,6 +306,7 @@ impl DeleteTimelineFlow { CreateTimelineCause::Delete, crate::tenant::CreateTimelineIdempotency::FailWithConflict, // doesn't matter what we put here None, // doesn't 
matter what we put here + None, // doesn't matter what we put here ) .context("create_timeline_struct")?; diff --git a/test_runner/performance/test_perf_many_relations.py b/test_runner/performance/test_perf_many_relations.py index 2570c55f6c..e2f0a79018 100644 --- a/test_runner/performance/test_perf_many_relations.py +++ b/test_runner/performance/test_perf_many_relations.py @@ -83,6 +83,13 @@ def test_perf_simple_many_relations_reldir_v2( ], ) + assert ( + env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[ + "rel_size_migration" + ] + != "legacy" + ) + n = 100000 step = 5000 # Create many relations diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 6a76ad5ca8..df243c13f1 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -358,6 +358,21 @@ def test_tx_abort_with_many_relations( ], ) + if reldir_type == "v1": + assert ( + env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[ + "rel_size_migration" + ] + == "legacy" + ) + else: + assert ( + env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[ + "rel_size_migration" + ] + != "legacy" + ) + # How many relations: this number is tuned to be long enough to take tens of seconds # if the rollback code path is buggy, tripping the test's timeout. 
if reldir_type == "v1": diff --git a/test_runner/regress/test_relations.py b/test_runner/regress/test_relations.py index 3e29c92a96..07eacfc775 100644 --- a/test_runner/regress/test_relations.py +++ b/test_runner/regress/test_relations.py @@ -19,6 +19,17 @@ def test_pageserver_reldir_v2( endpoint.safe_psql("CREATE TABLE foo1 (id INTEGER PRIMARY KEY, val text)") endpoint.safe_psql("CREATE TABLE foo2 (id INTEGER PRIMARY KEY, val text)") + assert ( + env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[ + "rel_size_migration" + ] + == "legacy" + ) + + # Ensure the pageserver accepts the table creation SQLs before the migration. In theory, we can also do + # a "wait_flush_lsn" here, but it's easier to just do a restart. + env.pageserver.restart() + # Switch to v2 env.pageserver.http_client().update_tenant_config( env.initial_tenant, @@ -27,6 +38,13 @@ def test_pageserver_reldir_v2( }, ) + assert ( + env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[ + "rel_size_migration" + ] + == "legacy" + ) + # Check if both relations are still accessible endpoint.safe_psql("SELECT * FROM foo1") endpoint.safe_psql("SELECT * FROM foo2") @@ -41,12 +59,14 @@ def test_pageserver_reldir_v2( # Create a relation in v2 endpoint.safe_psql("CREATE TABLE foo3 (id INTEGER PRIMARY KEY, val text)") + endpoint.safe_psql("CREATE TABLE foo4 (id INTEGER PRIMARY KEY, val text)") # Delete a relation in v1 endpoint.safe_psql("DROP TABLE foo1") # Check if both relations are still accessible endpoint.safe_psql("SELECT * FROM foo2") endpoint.safe_psql("SELECT * FROM foo3") + endpoint.safe_psql("SELECT * FROM foo4") # Restart the endpoint endpoint.stop() @@ -57,7 +77,7 @@ def test_pageserver_reldir_v2( endpoint.safe_psql("DROP TABLE IF EXISTS foo1") endpoint.safe_psql("SELECT * FROM foo2") endpoint.safe_psql("SELECT * FROM foo3") - + endpoint.safe_psql("SELECT * FROM foo4") endpoint.safe_psql("DROP TABLE foo3") endpoint.stop() 
endpoint.start() @@ -66,3 +86,25 @@ def test_pageserver_reldir_v2( endpoint.safe_psql("DROP TABLE IF EXISTS foo1") endpoint.safe_psql("SELECT * FROM foo2") endpoint.safe_psql("DROP TABLE IF EXISTS foo3") + endpoint.safe_psql("SELECT * FROM foo4") + + # Set the config to false to emulate the case where the config is not persisted when the tenant gets detached/attached. + env.pageserver.http_client().update_tenant_config( + env.initial_tenant, + { + "rel_size_v2_enabled": False, + }, + ) + + # Check if the relation is still accessible + endpoint.safe_psql("SELECT * FROM foo2") + endpoint.safe_psql("SELECT * FROM foo4") + + env.pageserver.restart() + + assert ( + env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[ + "rel_size_migration" + ] + == "migrating" + )