Mirror of https://github.com/neondatabase/neon.git
# pageserver: assert no changes to shard identity (#12379)
## Problem

Location config changes can currently result in changes to the shard identity. Such changes will cause data corruption, as seen with #12217.

Resolves #12227.
Requires #12377.

## Summary of changes

Assert that the shard identity does not change on location config updates and on (re)attach. This is currently enforced with `critical!` rather than a hard assertion, in case it misfires in production. Later, we should reject such requests with an error response and turn this into a proper assertion.
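As context for the diff below, here is a minimal, self-contained sketch of the soft-assertion pattern this PR introduces. The `ShardIdentity` here is a simplified stand-in for the real pageserver type (which also carries a layout field), and `eprintln!` plus `debug_assert_eq!` stand in for the `critical!` macro from the `utils` crate:

```rust
// Stand-in for pageserver_api's ShardIdentity (the real struct also carries a
// layout field; all fields participate in the derived PartialEq).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct ShardIdentity {
    number: u8,
    count: u8,
    stripe_size: u32,
}

impl ShardIdentity {
    /// Soft assertion: log in production instead of panicking, so a misfiring
    /// check cannot take down the server; panic in debug builds so tests
    /// still catch real violations.
    fn assert_equal(&self, other: ShardIdentity) {
        if *self != other {
            eprintln!("CRITICAL: shard identity mismatch: {self:?} != {other:?}");
            debug_assert_eq!(*self, other, "shard identity mismatch");
        }
    }
}

fn main() {
    let current = ShardIdentity { number: 0, count: 4, stripe_size: 32768 };
    // A config update arriving with the same identity passes silently...
    current.assert_equal(ShardIdentity { number: 0, count: 4, stripe_size: 32768 });
    // ...while a changed stripe size logs (or panics in a debug build).
    current.assert_equal(ShardIdentity { number: 0, count: 4, stripe_size: 65536 });
}
```

The design point of the soft assertion: a false positive in production costs a log line, not an outage, while debug and test builds still fail fast.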
```diff
@@ -4421,6 +4421,7 @@ dependencies = [
  "strum",
  "strum_macros",
  "thiserror 1.0.69",
  "tracing",
+ "tracing-utils",
  "utils",
 ]
```
```diff
@@ -30,12 +30,13 @@ humantime-serde.workspace = true
 chrono = { workspace = true, features = ["serde"] }
 itertools.workspace = true
 storage_broker.workspace = true
-camino = {workspace = true, features = ["serde1"]}
+camino = { workspace = true, features = ["serde1"] }
 remote_storage.workspace = true
 postgres_backend.workspace = true
-nix = {workspace = true, optional = true}
+nix = { workspace = true, optional = true }
 reqwest.workspace = true
 rand.workspace = true
 tracing.workspace = true
+tracing-utils.workspace = true
 once_cell.workspace = true
 
```
```diff
@@ -37,6 +37,7 @@ use std::hash::{Hash, Hasher};
 pub use ::utils::shard::*;
 use postgres_ffi_types::forknum::INIT_FORKNUM;
 use serde::{Deserialize, Serialize};
+use utils::critical;
 
 use crate::key::Key;
 use crate::models::ShardParameters;
```
```diff
@@ -188,6 +189,17 @@ impl ShardIdentity {
         }
     }
 
+    /// Asserts that the given shard identities are equal. Changes to shard parameters will likely
+    /// result in data corruption.
+    pub fn assert_equal(&self, other: ShardIdentity) {
+        if self != &other {
+            // TODO: for now, we're conservative and just log errors in production. Turn this into a
+            // real assertion when we're confident it doesn't misfire, and also reject requests that
+            // attempt to change it with an error response.
+            critical!("shard identity mismatch: {self:?} != {other:?}");
+        }
+    }
+
     fn is_broken(&self) -> bool {
         self.layout == LAYOUT_BROKEN
     }
```
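From a caller's perspective the new check is a single call on the current identity with the incoming one. A hedged usage sketch (the `check_incoming_identity` helper is hypothetical; the crate path matches the imports in this diff):

```rust
use pageserver_api::shard::ShardIdentity;

// Called wherever an incoming location config claims an identity for a tenant
// shard we already know: equal identities pass silently, any field mismatch
// (number, count, stripe size, layout) routes through critical!.
fn check_incoming_identity(current: &ShardIdentity, incoming: ShardIdentity) {
    current.assert_equal(incoming);
}
```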
```diff
@@ -1896,6 +1896,10 @@ async fn update_tenant_config_handler(
         ShardParameters::from(tenant.get_shard_identity()),
     );
 
+    tenant
+        .get_shard_identity()
+        .assert_equal(location_conf.shard); // not strictly necessary since we construct it above
+
     crate::tenant::TenantShard::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
         .await
         .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
```
```diff
@@ -1940,6 +1944,10 @@ async fn patch_tenant_config_handler(
         ShardParameters::from(tenant.get_shard_identity()),
     );
 
+    tenant
+        .get_shard_identity()
+        .assert_equal(location_conf.shard); // not strictly necessary since we construct it above
+
     crate::tenant::TenantShard::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
         .await
         .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
```
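Both handler hunks are intentionally redundant: `location_conf.shard` is built from `tenant.get_shard_identity()` a few lines above, which is why the inline comment calls the check not strictly necessary. It pins the invariant in place so it survives future changes to how `location_conf` is constructed.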
```diff
@@ -4529,6 +4529,10 @@ impl TenantShard {
         Ok(toml_edit::de::from_str::<LocationConf>(&config)?)
     }
 
+    /// Stores a tenant location config to disk.
+    ///
+    /// NB: make sure to call `ShardIdentity::assert_equal` before persisting a new config, to avoid
+    /// changes to shard parameters that may result in data corruption.
     #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
     pub(super) async fn persist_tenant_config(
         conf: &'static PageServerConf,
```
```diff
@@ -12,6 +12,7 @@
 use pageserver_api::models;
 use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
 use serde::{Deserialize, Serialize};
+use utils::critical;
 use utils::generation::Generation;
 
 #[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
```
```diff
@@ -171,6 +172,16 @@ impl LocationConf {
             }
         }
 
+        // This should never happen.
+        // TODO: turn this into a proper assertion.
+        if stripe_size != self.shard.stripe_size {
+            critical!(
+                "stripe size mismatch: {} != {}",
+                self.shard.stripe_size,
+                stripe_size,
+            );
+        }
+
         self.shard.stripe_size = stripe_size;
     }
 
```
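Unlike `assert_equal`, this site only receives a stripe size rather than a full `ShardIdentity`, so it compares the one field before overwriting it; per the TODO, `critical!` again stands in for a hard assertion until the check has proven it does not misfire.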
```diff
@@ -880,6 +880,9 @@ impl TenantManager {
             // phase of writing config and/or waiting for flush, before returning.
             match fast_path_taken {
                 Some(FastPathModified::Attached(tenant)) => {
+                    tenant
+                        .shard_identity
+                        .assert_equal(new_location_config.shard);
                     TenantShard::persist_tenant_config(
                         self.conf,
                         &tenant_shard_id,
```
```diff
@@ -914,7 +917,10 @@ impl TenantManager {
 
                     return Ok(Some(tenant));
                 }
-                Some(FastPathModified::Secondary(_secondary_tenant)) => {
+                Some(FastPathModified::Secondary(secondary_tenant)) => {
+                    secondary_tenant
+                        .shard_identity
+                        .assert_equal(new_location_config.shard);
                     TenantShard::persist_tenant_config(
                         self.conf,
                         &tenant_shard_id,
```
```diff
@@ -948,6 +954,10 @@ impl TenantManager {
 
         match slot_guard.get_old_value() {
             Some(TenantSlot::Attached(tenant)) => {
+                tenant
+                    .shard_identity
+                    .assert_equal(new_location_config.shard);
+
                 // The case where we keep a Tenant alive was covered above in the special case
                 // for Attached->Attached transitions in the same generation. By this point,
                 // if we see an attached tenant we know it will be discarded and should be
```
```diff
@@ -981,9 +991,13 @@ impl TenantManager {
                 // rather than assuming it to be empty.
                 spawn_mode = SpawnMode::Eager;
             }
-            Some(TenantSlot::Secondary(state)) => {
+            Some(TenantSlot::Secondary(secondary_tenant)) => {
+                secondary_tenant
+                    .shard_identity
+                    .assert_equal(new_location_config.shard);
+
                 info!("Shutting down secondary tenant");
-                state.shutdown().await;
+                secondary_tenant.shutdown().await;
             }
             Some(TenantSlot::InProgress(_)) => {
                 // This should never happen: acquire_slot should error out
```
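Taken together, the `TenantManager` hunks cover both the fast path (an attached or secondary location modified in place) and the general slot path, so every route through this function runs the identity check before `TenantShard::persist_tenant_config`, matching the NB added to that function's doc comment above.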
```diff
@@ -101,7 +101,7 @@ pub(crate) struct SecondaryTenant {
     // Secondary mode does not need the full shard identity or the pageserver_api::models::TenantConfig. However,
     // storing these enables us to report our full LocationConf, enabling convenient reconciliation
     // by the control plane (see [`Self::get_location_conf`])
-    shard_identity: ShardIdentity,
+    pub(crate) shard_identity: ShardIdentity,
     tenant_conf: std::sync::Mutex<pageserver_api::models::TenantConfig>,
 
     // Internal state used by the Downloader.
```
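The field's promotion to `pub(crate)` is what allows the `TenantManager` code above to read `shard_identity` from a `SecondaryTenant` and apply the same check to secondary locations.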