feat(pageserver): persist reldir v2 migration status (#10980)

## Problem

part of https://github.com/neondatabase/neon/issues/9516

## Summary of changes

Similar to the aux v2 migration, we persist the relv2 migration status
into index_part, so that even the config item is set to false, we will
still read from the v2 storage to avoid loss of data.

Note that only the two variants `None` and
`Some(RelSizeMigration::Migrating)` are used for now. We don't have full
migration implemented so it will never be set to
`RelSizeMigration::Migrated`.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
This commit is contained in:
Alex Chi Z.
2025-03-03 16:05:43 -05:00
committed by GitHub
parent dbf9a80261
commit 6d0976dad5
11 changed files with 178 additions and 25 deletions

View File

@@ -1165,6 +1165,21 @@ pub struct OffloadedTimelineInfo {
pub archived_at: chrono::DateTime<chrono::Utc>,
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub enum RelSizeMigration {
/// The tenant is using the old rel_size format.
/// Note that this enum is persisted as `Option<RelSizeMigration>` in the index part, so
/// `None` is the same as `Some(RelSizeMigration::Legacy)`.
Legacy,
/// The tenant is migrating to the new rel_size format. Both old and new rel_size format are
/// persisted in the index part. The read path will read both formats and merge them.
Migrating,
/// The tenant has migrated to the new rel_size format. Only the new rel_size format is persisted
/// in the index part, and the read path will not read the old format.
Migrated,
}
/// This represents the output of the "timeline_detail" and "timeline_list" API calls.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TimelineInfo {
@@ -1243,7 +1258,11 @@ pub struct TimelineInfo {
// Forward compatibility: a previous version of the pageserver will receive a JSON. serde::Deserialize does
// not deny unknown fields by default so it's safe to set the field to some value, though it won't be
// read.
/// Whether the timeline is archived.
pub is_archived: Option<bool>,
/// The status of the rel_size migration.
pub rel_size_migration: Option<RelSizeMigration>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]

View File

@@ -481,6 +481,7 @@ async fn build_timeline_info_common(
state,
is_archived: Some(is_archived),
rel_size_migration: Some(timeline.get_rel_size_v2_status()),
walreceiver_status,
};

View File

@@ -21,6 +21,7 @@ use pageserver_api::key::{
slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
};
use pageserver_api::keyspace::SparseKeySpace;
use pageserver_api::models::RelSizeMigration;
use pageserver_api::record::NeonWalRecord;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use pageserver_api::shard::ShardIdentity;
@@ -492,7 +493,9 @@ impl Timeline {
// Otherwise, read the old reldir keyspace.
// TODO: if IndexPart::rel_size_migration is `Migrated`, we only need to read from v2.
if self.get_rel_size_v2_enabled() {
if let RelSizeMigration::Migrated | RelSizeMigration::Migrating =
self.get_rel_size_v2_status()
{
// fetch directory listing (new)
let key = rel_tag_sparse_key(tag.spcnode, tag.dbnode, tag.relnode, tag.forknum);
let buf = RelDirExists::decode_option(version.sparse_get(self, key, ctx).await?)
@@ -544,7 +547,7 @@ impl Timeline {
forknum: *forknum,
}));
if !self.get_rel_size_v2_enabled() {
if let RelSizeMigration::Legacy = self.get_rel_size_v2_status() {
return Ok(rels_v1);
}
@@ -1720,6 +1723,35 @@ impl DatadirModification<'_> {
Ok(())
}
/// Returns `true` if the rel_size_v2 write path is enabled. If it is the first time that
/// we enable it, we also need to persist it in `index_part.json`.
pub fn maybe_enable_rel_size_v2(&mut self) -> anyhow::Result<bool> {
let status = self.tline.get_rel_size_v2_status();
let config = self.tline.get_rel_size_v2_enabled();
match (config, status) {
(false, RelSizeMigration::Legacy) => {
// tenant config didn't enable it and we didn't write any reldir_v2 key yet
Ok(false)
}
(false, RelSizeMigration::Migrating | RelSizeMigration::Migrated) => {
// index_part already persisted that the timeline has enabled rel_size_v2
Ok(true)
}
(true, RelSizeMigration::Legacy) => {
// The first time we enable it, we need to persist it in `index_part.json`
self.tline
.update_rel_size_v2_status(RelSizeMigration::Migrating)?;
tracing::info!("enabled rel_size_v2");
Ok(true)
}
(true, RelSizeMigration::Migrating | RelSizeMigration::Migrated) => {
// index_part already persisted that the timeline has enabled rel_size_v2
// and we don't need to do anything
Ok(true)
}
}
}
/// Store a relmapper file (pg_filenode.map) in the repository
pub async fn put_relmap_file(
&mut self,
@@ -1728,6 +1760,8 @@ impl DatadirModification<'_> {
img: Bytes,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let v2_enabled = self.maybe_enable_rel_size_v2()?;
// Add it to the directory (if it doesn't exist already)
let buf = self.get(DBDIR_KEY, ctx).await?;
let mut dbdir = DbDirectory::des(&buf)?;
@@ -1748,7 +1782,7 @@ impl DatadirModification<'_> {
})?;
self.pending_directory_entries
.push((DirectoryKind::Rel, MetricsUpdate::Set(0)));
if self.tline.get_rel_size_v2_enabled() {
if v2_enabled {
self.pending_directory_entries
.push((DirectoryKind::RelV2, MetricsUpdate::Set(0)));
}
@@ -1905,7 +1939,9 @@ impl DatadirModification<'_> {
return Err(RelationError::AlreadyExists);
}
if self.tline.get_rel_size_v2_enabled() {
let v2_enabled = self.maybe_enable_rel_size_v2()?;
if v2_enabled {
let sparse_rel_dir_key =
rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum);
// check if the rel_dir_key exists in v2
@@ -2031,6 +2067,7 @@ impl DatadirModification<'_> {
drop_relations: HashMap<(u32, u32), Vec<RelTag>>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let v2_enabled = self.maybe_enable_rel_size_v2()?;
for ((spc_node, db_node), rel_tags) in drop_relations {
let dir_key = rel_dir_to_key(spc_node, db_node);
let buf = self.get(dir_key, ctx).await?;
@@ -2043,7 +2080,7 @@ impl DatadirModification<'_> {
.push((DirectoryKind::Rel, MetricsUpdate::Sub(1)));
dirty = true;
true
} else if self.tline.get_rel_size_v2_enabled() {
} else if v2_enabled {
// The rel is not found in the old reldir key, so we need to check the new sparse keyspace.
// Note that a relation can only exist in one of the two keyspaces (guaranteed by the ingestion
// logic).

View File

@@ -31,8 +31,8 @@ use futures::StreamExt;
use futures::stream::FuturesUnordered;
use itertools::Itertools as _;
use once_cell::sync::Lazy;
use pageserver_api::models;
pub use pageserver_api::models::TenantState;
use pageserver_api::models::{self, RelSizeMigration};
use pageserver_api::models::{
CompactInfoResponse, LsnLease, TimelineArchivalState, TimelineState, TopTenantShardItem,
WalRedoManagerStatus,
@@ -1123,6 +1123,7 @@ impl Tenant {
CreateTimelineCause::Load,
idempotency.clone(),
index_part.gc_compaction.clone(),
index_part.rel_size_migration.clone(),
)?;
let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
anyhow::ensure!(
@@ -4128,6 +4129,7 @@ impl Tenant {
cause: CreateTimelineCause,
create_idempotency: CreateTimelineIdempotency,
gc_compaction_state: Option<GcCompactionState>,
rel_size_v2_status: Option<RelSizeMigration>,
) -> anyhow::Result<Arc<Timeline>> {
let state = match cause {
CreateTimelineCause::Load => {
@@ -4160,6 +4162,7 @@ impl Tenant {
self.attach_wal_lag_cooldown.clone(),
create_idempotency,
gc_compaction_state,
rel_size_v2_status,
self.cancel.child_token(),
);
@@ -5231,6 +5234,7 @@ impl Tenant {
CreateTimelineCause::Load,
create_guard.idempotency.clone(),
None,
None,
)
.context("Failed to create timeline data structure")?;

View File

@@ -194,7 +194,7 @@ pub(crate) use download::{
};
use index::GcCompactionState;
pub(crate) use index::LayerFileMetadata;
use pageserver_api::models::TimelineArchivalState;
use pageserver_api::models::{RelSizeMigration, TimelineArchivalState};
use pageserver_api::shard::{ShardIndex, TenantShardId};
use regex::Regex;
use remote_storage::{
@@ -900,7 +900,7 @@ impl RemoteTimelineClient {
Ok(())
}
/// Launch an index-file upload operation in the background, setting `import_pgdata` field.
/// Launch an index-file upload operation in the background, setting `gc_compaction_state` field.
pub(crate) fn schedule_index_upload_for_gc_compaction_state_update(
self: &Arc<Self>,
gc_compaction_state: GcCompactionState,
@@ -912,6 +912,21 @@ impl RemoteTimelineClient {
Ok(())
}
/// Launch an index-file upload operation in the background, setting `rel_size_v2_status` field.
pub(crate) fn schedule_index_upload_for_rel_size_v2_status_update(
self: &Arc<Self>,
rel_size_v2_status: RelSizeMigration,
) -> anyhow::Result<()> {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
upload_queue.dirty.rel_size_migration = Some(rel_size_v2_status);
// TODO: allow this operation to bypass the validation check because we might upload the index part
// with no layers but the flag updated. For now, we just modify the index part in memory and the next
// upload will include the flag.
// self.schedule_index_upload(upload_queue);
Ok(())
}
///
/// Launch an index-file upload operation in the background, if necessary.
///

View File

@@ -7,6 +7,7 @@ use std::collections::HashMap;
use chrono::NaiveDateTime;
use pageserver_api::models::AuxFilePolicy;
use pageserver_api::models::RelSizeMigration;
use pageserver_api::shard::ShardIndex;
use serde::{Deserialize, Serialize};
use utils::id::TimelineId;
@@ -117,21 +118,6 @@ pub struct GcCompactionState {
pub(crate) last_completed_lsn: Lsn,
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub enum RelSizeMigration {
/// The tenant is using the old rel_size format.
/// Note that this enum is persisted as `Option<RelSizeMigration>` in the index part, so
/// `None` is the same as `Some(RelSizeMigration::Legacy)`.
Legacy,
/// The tenant is migrating to the new rel_size format. Both old and new rel_size format are
/// persisted in the index part. The read path will read both formats and merge them.
Migrating,
/// The tenant has migrated to the new rel_size format. Only the new rel_size format is persisted
/// in the index part, and the read path will not read the old format.
Migrated,
}
impl IndexPart {
/// When adding or modifying any parts of `IndexPart`, increment the version so that it can be
/// used to understand later versions.

View File

@@ -46,7 +46,7 @@ use pageserver_api::keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPart
use pageserver_api::models::{
CompactKeyRange, CompactLsnRange, CompactionAlgorithm, CompactionAlgorithmSettings,
DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy,
InMemoryLayerInfo, LayerMapInfo, LsnLease, PageTraceEvent, TimelineState,
InMemoryLayerInfo, LayerMapInfo, LsnLease, PageTraceEvent, RelSizeMigration, TimelineState,
};
use pageserver_api::reltag::{BlockNumber, RelTag};
use pageserver_api::shard::{ShardIdentity, ShardIndex, ShardNumber, TenantShardId};
@@ -436,6 +436,8 @@ pub struct Timeline {
/// May host a background Tokio task which downloads all the layers from the current
/// heatmap on demand.
heatmap_layers_downloader: Mutex<Option<heatmap_layers_downloader::HeatmapLayersDownloader>>,
pub(crate) rel_size_v2_status: ArcSwapOption<RelSizeMigration>,
}
pub(crate) enum PreviousHeatmap {
@@ -2368,6 +2370,9 @@ impl Timeline {
.unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
}
/// Returns `true` if the rel_size_v2 config is enabled. NOTE: the write path and read path
/// should look at `get_rel_size_v2_status()` to get the actual status of the timeline. It is
/// possible that the index part persists the state while the config doesn't get persisted.
pub(crate) fn get_rel_size_v2_enabled(&self) -> bool {
let tenant_conf = self.tenant_conf.load();
tenant_conf
@@ -2376,6 +2381,14 @@ impl Timeline {
.unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled)
}
pub(crate) fn get_rel_size_v2_status(&self) -> RelSizeMigration {
self.rel_size_v2_status
.load()
.as_ref()
.map(|s| s.as_ref().clone())
.unwrap_or(RelSizeMigration::Legacy)
}
fn get_compaction_upper_limit(&self) -> usize {
let tenant_conf = self.tenant_conf.load();
tenant_conf
@@ -2636,6 +2649,7 @@ impl Timeline {
attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
create_idempotency: crate::tenant::CreateTimelineIdempotency,
gc_compaction_state: Option<GcCompactionState>,
rel_size_v2_status: Option<RelSizeMigration>,
cancel: CancellationToken,
) -> Arc<Self> {
let disk_consistent_lsn = metadata.disk_consistent_lsn();
@@ -2794,6 +2808,8 @@ impl Timeline {
previous_heatmap: ArcSwapOption::from_pointee(previous_heatmap),
heatmap_layers_downloader: Mutex::new(None),
rel_size_v2_status: ArcSwapOption::from_pointee(rel_size_v2_status),
};
result.repartition_threshold =
@@ -2870,6 +2886,16 @@ impl Timeline {
.schedule_index_upload_for_gc_compaction_state_update(gc_compaction_state)
}
pub(crate) fn update_rel_size_v2_status(
&self,
rel_size_v2_status: RelSizeMigration,
) -> anyhow::Result<()> {
self.rel_size_v2_status
.store(Some(Arc::new(rel_size_v2_status.clone())));
self.remote_client
.schedule_index_upload_for_rel_size_v2_status_update(rel_size_v2_status)
}
pub(crate) fn get_gc_compaction_state(&self) -> Option<GcCompactionState> {
self.gc_compaction_state.load_full().as_ref().clone()
}

View File

@@ -306,6 +306,7 @@ impl DeleteTimelineFlow {
CreateTimelineCause::Delete,
crate::tenant::CreateTimelineIdempotency::FailWithConflict, // doesn't matter what we put here
None, // doesn't matter what we put here
None, // doesn't matter what we put here
)
.context("create_timeline_struct")?;

View File

@@ -83,6 +83,13 @@ def test_perf_simple_many_relations_reldir_v2(
],
)
assert (
env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[
"rel_size_migration"
]
!= "legacy"
)
n = 100000
step = 5000
# Create many relations

View File

@@ -358,6 +358,21 @@ def test_tx_abort_with_many_relations(
],
)
if reldir_type == "v1":
assert (
env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[
"rel_size_migration"
]
== "legacy"
)
else:
assert (
env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[
"rel_size_migration"
]
!= "legacy"
)
# How many relations: this number is tuned to be long enough to take tens of seconds
# if the rollback code path is buggy, tripping the test's timeout.
if reldir_type == "v1":

View File

@@ -19,6 +19,17 @@ def test_pageserver_reldir_v2(
endpoint.safe_psql("CREATE TABLE foo1 (id INTEGER PRIMARY KEY, val text)")
endpoint.safe_psql("CREATE TABLE foo2 (id INTEGER PRIMARY KEY, val text)")
assert (
env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[
"rel_size_migration"
]
== "legacy"
)
# Ensure the pageserver accepts the table creation SQLs before the migration. In theory, we can also do
# a "wait_flush_lsn" here, but it's easier to just do a restart.
env.pageserver.restart()
# Switch to v2
env.pageserver.http_client().update_tenant_config(
env.initial_tenant,
@@ -27,6 +38,13 @@ def test_pageserver_reldir_v2(
},
)
assert (
env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[
"rel_size_migration"
]
== "legacy"
)
# Check if both relations are still accessible
endpoint.safe_psql("SELECT * FROM foo1")
endpoint.safe_psql("SELECT * FROM foo2")
@@ -41,12 +59,14 @@ def test_pageserver_reldir_v2(
# Create a relation in v2
endpoint.safe_psql("CREATE TABLE foo3 (id INTEGER PRIMARY KEY, val text)")
endpoint.safe_psql("CREATE TABLE foo4 (id INTEGER PRIMARY KEY, val text)")
# Delete a relation in v1
endpoint.safe_psql("DROP TABLE foo1")
# Check if both relations are still accessible
endpoint.safe_psql("SELECT * FROM foo2")
endpoint.safe_psql("SELECT * FROM foo3")
endpoint.safe_psql("SELECT * FROM foo4")
# Restart the endpoint
endpoint.stop()
@@ -57,7 +77,7 @@ def test_pageserver_reldir_v2(
endpoint.safe_psql("DROP TABLE IF EXISTS foo1")
endpoint.safe_psql("SELECT * FROM foo2")
endpoint.safe_psql("SELECT * FROM foo3")
endpoint.safe_psql("SELECT * FROM foo4")
endpoint.safe_psql("DROP TABLE foo3")
endpoint.stop()
endpoint.start()
@@ -66,3 +86,25 @@ def test_pageserver_reldir_v2(
endpoint.safe_psql("DROP TABLE IF EXISTS foo1")
endpoint.safe_psql("SELECT * FROM foo2")
endpoint.safe_psql("DROP TABLE IF EXISTS foo3")
endpoint.safe_psql("SELECT * FROM foo4")
# Set the config to false to emulate the case where the config is not persisted when the tenant gets detached/attached.
env.pageserver.http_client().update_tenant_config(
env.initial_tenant,
{
"rel_size_v2_enabled": False,
},
)
# Check if the relation is still accessible
endpoint.safe_psql("SELECT * FROM foo2")
endpoint.safe_psql("SELECT * FROM foo4")
env.pageserver.restart()
assert (
env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[
"rel_size_migration"
]
== "migrating"
)