mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-10 06:52:55 +00:00
storcon: validate new_sk_set before starting safekeeper migration (#12546)
## Problem We don't validate the validity of the `new_sk_set` before starting the migration. It is validated later, so the migration to an invalid safekeeper set will fail anyway. But at this point we might already commited an invalid `new_sk_set` to the database and there is no `abort` command yet (I ran into this issue in neon_local and ruined the timeline :) - Part of https://github.com/neondatabase/neon/issues/11669 ## Summary of changes - Add safekeeper count and safekeeper duplication checks before starting the migration - Test that we validate the `new_sk_set` before starting the migration - Add `force` option to the `TimelineSafekeeperMigrateRequest` to disable not-mandatory checks
This commit is contained in:
@@ -39,13 +39,13 @@ use utils::lsn::Lsn;
|
||||
use super::Service;
|
||||
|
||||
impl Service {
|
||||
fn make_member_set(safekeepers: &[Safekeeper]) -> Result<MemberSet, ApiError> {
|
||||
fn make_member_set(safekeepers: &[Safekeeper]) -> Result<MemberSet, anyhow::Error> {
|
||||
let members = safekeepers
|
||||
.iter()
|
||||
.map(|sk| sk.get_safekeeper_id())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
MemberSet::new(members).map_err(ApiError::InternalServerError)
|
||||
MemberSet::new(members)
|
||||
}
|
||||
|
||||
fn get_safekeepers(&self, ids: &[i64]) -> Result<Vec<Safekeeper>, ApiError> {
|
||||
@@ -80,7 +80,7 @@ impl Service {
|
||||
) -> Result<Vec<NodeId>, ApiError> {
|
||||
let safekeepers = self.get_safekeepers(&timeline_persistence.sk_set)?;
|
||||
|
||||
let mset = Self::make_member_set(&safekeepers)?;
|
||||
let mset = Self::make_member_set(&safekeepers).map_err(ApiError::InternalServerError)?;
|
||||
let mconf = safekeeper_api::membership::Configuration::new(mset);
|
||||
|
||||
let req = safekeeper_api::models::TimelineCreateRequest {
|
||||
@@ -1105,6 +1105,26 @@ impl Service {
|
||||
}
|
||||
}
|
||||
|
||||
if new_sk_set.is_empty() {
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"new safekeeper set is empty"
|
||||
)));
|
||||
}
|
||||
|
||||
if new_sk_set.len() < self.config.timeline_safekeeper_count {
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"new safekeeper set must have at least {} safekeepers",
|
||||
self.config.timeline_safekeeper_count
|
||||
)));
|
||||
}
|
||||
|
||||
let new_sk_set_i64 = new_sk_set.iter().map(|id| id.0 as i64).collect::<Vec<_>>();
|
||||
let new_safekeepers = self.get_safekeepers(&new_sk_set_i64)?;
|
||||
// Construct new member set in advance to validate it.
|
||||
// E.g. validates that there is no duplicate safekeepers.
|
||||
let new_sk_member_set =
|
||||
Self::make_member_set(&new_safekeepers).map_err(ApiError::BadRequest)?;
|
||||
|
||||
// TODO(diko): per-tenant lock is too wide. Consider introducing per-timeline locks.
|
||||
let _tenant_lock = trace_shared_lock(
|
||||
&self.tenant_op_locks,
|
||||
@@ -1135,6 +1155,18 @@ impl Service {
|
||||
.map(|&id| NodeId(id as u64))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
// Validate that we are not migrating to a decomissioned safekeeper.
|
||||
for sk in new_safekeepers.iter() {
|
||||
if !cur_sk_set.contains(&sk.get_id())
|
||||
&& sk.scheduling_policy() == SkSchedulingPolicy::Decomissioned
|
||||
{
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"safekeeper {} is decomissioned",
|
||||
sk.get_id()
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
tracing::info!(
|
||||
?cur_sk_set,
|
||||
?new_sk_set,
|
||||
@@ -1177,11 +1209,8 @@ impl Service {
|
||||
}
|
||||
|
||||
let cur_safekeepers = self.get_safekeepers(&timeline.sk_set)?;
|
||||
let cur_sk_member_set = Self::make_member_set(&cur_safekeepers)?;
|
||||
|
||||
let new_sk_set_i64 = new_sk_set.iter().map(|id| id.0 as i64).collect::<Vec<_>>();
|
||||
let new_safekeepers = self.get_safekeepers(&new_sk_set_i64)?;
|
||||
let new_sk_member_set = Self::make_member_set(&new_safekeepers)?;
|
||||
let cur_sk_member_set =
|
||||
Self::make_member_set(&cur_safekeepers).map_err(ApiError::InternalServerError)?;
|
||||
|
||||
let joint_config = membership::Configuration {
|
||||
generation,
|
||||
|
||||
Reference in New Issue
Block a user