pageserver: persist stripe size in tenant manifest for tenant_import (#11181)

## Problem

`tenant_import`, used to import an existing tenant from remote storage
into a storage controller for support and debugging, assumed
`DEFAULT_STRIPE_SIZE` since this can't be recovered from remote storage.
In #11168, we are changing the stripe size, which will break
`tenant_import`.

Resolves #11175.

## Summary of changes

* Add `stripe_size` to the tenant manifest.
* Add `TenantScanRemoteStorageShard::stripe_size` and return from
`tenant_scan_remote` if present.
* Recover the stripe size during`tenant_import`, or fall back to 32768
(the original default stripe size).
* Add tenant manifest compatibility snapshot:
`2025-04-08-pgv17-tenant-manifest-v1.tar.zst`

There are no cross-version concerns here, since unknown fields are
ignored during deserialization where relevant.
This commit is contained in:
Erik Grinaker
2025-04-08 22:43:27 +02:00
committed by GitHub
parent d177654e5f
commit 7679b63a2c
7 changed files with 105 additions and 6 deletions

View File

@@ -1680,6 +1680,7 @@ pub struct SecondaryProgress {
pub struct TenantScanRemoteStorageShard {
pub tenant_shard_id: TenantShardId,
pub generation: Option<u32>,
pub stripe_size: Option<ShardStripeSize>,
}
#[derive(Serialize, Deserialize, Debug, Default)]

View File

@@ -78,6 +78,12 @@ impl Default for ShardStripeSize {
}
}
impl std::fmt::Display for ShardStripeSize {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
self.0.fmt(f)
}
}
/// Layout version: for future upgrades where we might change how the key->shard mapping works
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Hash, Debug)]
pub struct ShardLayout(u8);

View File

@@ -67,7 +67,7 @@ use crate::tenant::mgr::{
};
use crate::tenant::remote_timeline_client::index::GcCompactionState;
use crate::tenant::remote_timeline_client::{
download_index_part, list_remote_tenant_shards, list_remote_timelines,
download_index_part, download_tenant_manifest, list_remote_tenant_shards, list_remote_timelines,
};
use crate::tenant::secondary::SecondaryController;
use crate::tenant::size::ModelInputs;
@@ -2911,9 +2911,22 @@ async fn tenant_scan_remote_handler(
};
}
let result =
download_tenant_manifest(&state.remote_storage, &tenant_shard_id, generation, &cancel)
.instrument(info_span!("download_tenant_manifest",
tenant_id=%tenant_shard_id.tenant_id,
shard_id=%tenant_shard_id.shard_slug()))
.await;
let stripe_size = match result {
Ok((manifest, _, _)) => manifest.stripe_size,
Err(DownloadError::NotFound) => None,
Err(err) => return Err(ApiError::InternalServerError(anyhow!(err))),
};
response.shards.push(TenantScanRemoteStorageShard {
tenant_shard_id,
generation: generation.into(),
stripe_size,
});
}

View File

@@ -4079,6 +4079,7 @@ impl Tenant {
TenantManifest {
version: LATEST_TENANT_MANIFEST_VERSION,
stripe_size: Some(self.get_shard_stripe_size()),
offloaded_timelines,
}
}

View File

@@ -1,4 +1,5 @@
use chrono::NaiveDateTime;
use pageserver_api::shard::ShardStripeSize;
use serde::{Deserialize, Serialize};
use utils::id::TimelineId;
use utils::lsn::Lsn;
@@ -14,6 +15,12 @@ pub struct TenantManifest {
/// allow release rollbacks.
pub version: usize,
/// This tenant's stripe size. This is only advisory, and used to recover tenant data from
/// remote storage. The autoritative source is the storage controller. If None, assume the
/// original default value of 32768 blocks (256 MB).
#[serde(skip_serializing_if = "Option::is_none")]
pub stripe_size: Option<ShardStripeSize>,
/// The list of offloaded timelines together with enough information
/// to not have to actually load them.
///
@@ -42,7 +49,12 @@ pub struct OffloadedTimelineManifest {
/// The newest manifest version. This should be incremented on changes, even non-breaking ones. We
/// do not use deny_unknown_fields, so new fields are not breaking.
pub const LATEST_TENANT_MANIFEST_VERSION: usize = 1;
///
/// 1: initial version
/// 2: +stripe_size
///
/// When adding new versions, also add a parse_vX test case below.
pub const LATEST_TENANT_MANIFEST_VERSION: usize = 2;
impl TenantManifest {
/// Returns true if the manifests are equal, ignoring the version number. This avoids
@@ -56,10 +68,11 @@ impl TenantManifest {
// We could alternatively just clone and modify the version here.
let Self {
version: _, // ignore version
stripe_size,
offloaded_timelines,
} = self;
offloaded_timelines == &other.offloaded_timelines
stripe_size == &other.stripe_size && offloaded_timelines == &other.offloaded_timelines
}
/// Decodes a manifest from JSON.
@@ -89,6 +102,7 @@ mod tests {
}"#;
let expected = TenantManifest {
version: 0,
stripe_size: None,
offloaded_timelines: Vec::new(),
};
assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?);
@@ -104,6 +118,7 @@ mod tests {
}"#;
let expected = TenantManifest {
version: 1,
stripe_size: None,
offloaded_timelines: Vec::new(),
};
assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?);
@@ -130,6 +145,50 @@ mod tests {
}"#;
let expected = TenantManifest {
version: 1,
stripe_size: None,
offloaded_timelines: vec![
OffloadedTimelineManifest {
timeline_id: TimelineId::from_str("5c4df612fd159e63c1b7853fe94d97da")?,
ancestor_timeline_id: None,
ancestor_retain_lsn: None,
archived_at: NaiveDateTime::from_str("2025-03-07T11:07:11.373105434")?,
},
OffloadedTimelineManifest {
timeline_id: TimelineId::from_str("f3def5823ad7080d2ea538d8e12163fa")?,
ancestor_timeline_id: Some(TimelineId::from_str(
"5c4df612fd159e63c1b7853fe94d97da",
)?),
ancestor_retain_lsn: Some(Lsn::from_str("0/1F79038")?),
archived_at: NaiveDateTime::from_str("2025-03-05T11:10:22.257901390")?,
},
],
};
assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?);
Ok(())
}
/// v2 manifests should be parsed, for backwards compatibility.
#[test]
fn parse_v2() -> anyhow::Result<()> {
let json = r#"{
"version": 2,
"stripe_size": 32768,
"offloaded_timelines": [
{
"timeline_id": "5c4df612fd159e63c1b7853fe94d97da",
"archived_at": "2025-03-07T11:07:11.373105434"
},
{
"timeline_id": "f3def5823ad7080d2ea538d8e12163fa",
"ancestor_timeline_id": "5c4df612fd159e63c1b7853fe94d97da",
"ancestor_retain_lsn": "0/1F79038",
"archived_at": "2025-03-05T11:10:22.257901390"
}
]
}"#;
let expected = TenantManifest {
version: 2,
stripe_size: Some(ShardStripeSize(32768)),
offloaded_timelines: vec![
OffloadedTimelineManifest {
timeline_id: TimelineId::from_str("5c4df612fd159e63c1b7853fe94d97da")?,

View File

@@ -6014,9 +6014,21 @@ impl Service {
.max()
.expect("We already validated >0 shards");
// FIXME: we have no way to recover the shard stripe size from contents of remote storage: this will
// only work if they were using the default stripe size.
let stripe_size = ShardParameters::DEFAULT_STRIPE_SIZE;
// Find the tenant's stripe size. This wasn't always persisted in the tenant manifest, so
// fall back to the original default stripe size of 32768 (256 MB) if it's not specified.
const ORIGINAL_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(32768);
let stripe_size = scan_result
.shards
.iter()
.find(|s| s.tenant_shard_id.shard_count == shard_count && s.generation == generation)
.expect("we validated >0 shards above")
.stripe_size
.unwrap_or_else(|| {
if shard_count.count() > 1 {
warn!("unknown stripe size, assuming {ORIGINAL_STRIPE_SIZE}");
}
ORIGINAL_STRIPE_SIZE
});
let (response, waiters) = self
.do_tenant_create(TenantCreateRequest {

View File

@@ -492,6 +492,13 @@ HISTORIC_DATA_SETS = [
PgVersion.V17,
"https://neon-github-public-dev.s3.eu-central-1.amazonaws.com/compatibility-data-snapshots/2025-02-07-pgv17-nogenerations.tar.zst",
),
# Tenant manifest v1.
HistoricDataSet(
"2025-04-08-tenant-manifest-v1",
TenantId("c547c28588abf1d7b7139ff1f1158345"),
PgVersion.V17,
"https://neon-github-public-dev.s3.eu-central-1.amazonaws.com/compatibility-data-snapshots/2025-04-08-pgv17-tenant-manifest-v1.tar.zst",
),
]