diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 2ffff67688..8186889e10 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1680,6 +1680,7 @@ pub struct SecondaryProgress { pub struct TenantScanRemoteStorageShard { pub tenant_shard_id: TenantShardId, pub generation: Option, + pub stripe_size: Option, } #[derive(Serialize, Deserialize, Debug, Default)] diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 8386d6e586..abbf4e6432 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -78,6 +78,12 @@ impl Default for ShardStripeSize { } } +impl std::fmt::Display for ShardStripeSize { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + /// Layout version: for future upgrades where we might change how the key->shard mapping works #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Hash, Debug)] pub struct ShardLayout(u8); diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index cf67dc596a..bce590016e 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -67,7 +67,7 @@ use crate::tenant::mgr::{ }; use crate::tenant::remote_timeline_client::index::GcCompactionState; use crate::tenant::remote_timeline_client::{ - download_index_part, list_remote_tenant_shards, list_remote_timelines, + download_index_part, download_tenant_manifest, list_remote_tenant_shards, list_remote_timelines, }; use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; @@ -2911,9 +2911,22 @@ async fn tenant_scan_remote_handler( }; } + let result = + download_tenant_manifest(&state.remote_storage, &tenant_shard_id, generation, &cancel) + .instrument(info_span!("download_tenant_manifest", + tenant_id=%tenant_shard_id.tenant_id, + shard_id=%tenant_shard_id.shard_slug())) + .await; + let stripe_size = match result { + Ok((manifest, 
_, _)) => manifest.stripe_size, + Err(DownloadError::NotFound) => None, + Err(err) => return Err(ApiError::InternalServerError(anyhow!(err))), + }; + response.shards.push(TenantScanRemoteStorageShard { tenant_shard_id, generation: generation.into(), + stripe_size, }); } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 1bfc51d5c8..900e98d7e9 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4079,6 +4079,7 @@ impl Tenant { TenantManifest { version: LATEST_TENANT_MANIFEST_VERSION, + stripe_size: Some(self.get_shard_stripe_size()), offloaded_timelines, } } diff --git a/pageserver/src/tenant/remote_timeline_client/manifest.rs b/pageserver/src/tenant/remote_timeline_client/manifest.rs index 0e07acfbc8..7dba4508e2 100644 --- a/pageserver/src/tenant/remote_timeline_client/manifest.rs +++ b/pageserver/src/tenant/remote_timeline_client/manifest.rs @@ -1,4 +1,5 @@ use chrono::NaiveDateTime; +use pageserver_api::shard::ShardStripeSize; use serde::{Deserialize, Serialize}; use utils::id::TimelineId; use utils::lsn::Lsn; @@ -14,6 +15,12 @@ pub struct TenantManifest { /// allow release rollbacks. pub version: usize, + /// This tenant's stripe size. This is only advisory, and used to recover tenant data from + /// remote storage. The authoritative source is the storage controller. If None, assume the + /// original default value of 32768 blocks (256 MB). + #[serde(skip_serializing_if = "Option::is_none")] + pub stripe_size: Option, + /// The list of offloaded timelines together with enough information /// to not have to actually load them. /// @@ -42,7 +49,12 @@ pub struct OffloadedTimelineManifest { /// The newest manifest version. This should be incremented on changes, even non-breaking ones. We /// do not use deny_unknown_fields, so new fields are not breaking. -pub const LATEST_TENANT_MANIFEST_VERSION: usize = 1; +/// +/// 1: initial version +/// 2: +stripe_size +/// +/// When adding new versions, also add a parse_vX test case below. 
+pub const LATEST_TENANT_MANIFEST_VERSION: usize = 2; impl TenantManifest { /// Returns true if the manifests are equal, ignoring the version number. This avoids @@ -56,10 +68,11 @@ impl TenantManifest { // We could alternatively just clone and modify the version here. let Self { version: _, // ignore version + stripe_size, offloaded_timelines, } = self; - offloaded_timelines == &other.offloaded_timelines + stripe_size == &other.stripe_size && offloaded_timelines == &other.offloaded_timelines } /// Decodes a manifest from JSON. @@ -89,6 +102,7 @@ mod tests { }"#; let expected = TenantManifest { version: 0, + stripe_size: None, offloaded_timelines: Vec::new(), }; assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?); @@ -104,6 +118,7 @@ mod tests { }"#; let expected = TenantManifest { version: 1, + stripe_size: None, offloaded_timelines: Vec::new(), }; assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?); @@ -130,6 +145,50 @@ mod tests { }"#; let expected = TenantManifest { version: 1, + stripe_size: None, + offloaded_timelines: vec![ + OffloadedTimelineManifest { + timeline_id: TimelineId::from_str("5c4df612fd159e63c1b7853fe94d97da")?, + ancestor_timeline_id: None, + ancestor_retain_lsn: None, + archived_at: NaiveDateTime::from_str("2025-03-07T11:07:11.373105434")?, + }, + OffloadedTimelineManifest { + timeline_id: TimelineId::from_str("f3def5823ad7080d2ea538d8e12163fa")?, + ancestor_timeline_id: Some(TimelineId::from_str( + "5c4df612fd159e63c1b7853fe94d97da", + )?), + ancestor_retain_lsn: Some(Lsn::from_str("0/1F79038")?), + archived_at: NaiveDateTime::from_str("2025-03-05T11:10:22.257901390")?, + }, + ], + }; + assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?); + Ok(()) + } + + /// v2 manifests should be parsed, for backwards compatibility. 
+ #[test] + fn parse_v2() -> anyhow::Result<()> { + let json = r#"{ + "version": 2, + "stripe_size": 32768, + "offloaded_timelines": [ + { + "timeline_id": "5c4df612fd159e63c1b7853fe94d97da", + "archived_at": "2025-03-07T11:07:11.373105434" + }, + { + "timeline_id": "f3def5823ad7080d2ea538d8e12163fa", + "ancestor_timeline_id": "5c4df612fd159e63c1b7853fe94d97da", + "ancestor_retain_lsn": "0/1F79038", + "archived_at": "2025-03-05T11:10:22.257901390" + } + ] + }"#; + let expected = TenantManifest { + version: 2, + stripe_size: Some(ShardStripeSize(32768)), offloaded_timelines: vec![ OffloadedTimelineManifest { timeline_id: TimelineId::from_str("5c4df612fd159e63c1b7853fe94d97da")?, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 5e53051727..e4db58cc84 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -6014,9 +6014,21 @@ impl Service { .max() .expect("We already validated >0 shards"); - // FIXME: we have no way to recover the shard stripe size from contents of remote storage: this will - // only work if they were using the default stripe size. - let stripe_size = ShardParameters::DEFAULT_STRIPE_SIZE; + // Find the tenant's stripe size. This wasn't always persisted in the tenant manifest, so + // fall back to the original default stripe size of 32768 (256 MB) if it's not specified. 
+ const ORIGINAL_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(32768); + let stripe_size = scan_result + .shards + .iter() + .find(|s| s.tenant_shard_id.shard_count == shard_count && s.generation == generation) + .expect("we validated >0 shards above") + .stripe_size + .unwrap_or_else(|| { + if shard_count.count() > 1 { + warn!("unknown stripe size, assuming {ORIGINAL_STRIPE_SIZE}"); + } + ORIGINAL_STRIPE_SIZE + }); let (response, waiters) = self .do_tenant_create(TenantCreateRequest { diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index ee96daca33..2230bdc666 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -492,6 +492,13 @@ HISTORIC_DATA_SETS = [ PgVersion.V17, "https://neon-github-public-dev.s3.eu-central-1.amazonaws.com/compatibility-data-snapshots/2025-02-07-pgv17-nogenerations.tar.zst", ), + # Tenant manifest v1. + HistoricDataSet( + "2025-04-08-tenant-manifest-v1", + TenantId("c547c28588abf1d7b7139ff1f1158345"), + PgVersion.V17, + "https://neon-github-public-dev.s3.eu-central-1.amazonaws.com/compatibility-data-snapshots/2025-04-08-pgv17-tenant-manifest-v1.tar.zst", + ), ]