mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-19 14:10:37 +00:00
storcon: Fix migration for Attached(0) tenants (#12256)
## Problem

`Attached(0)` tenant migrations can get stuck if the heatmap file has not been uploaded.

## Summary of Changes

- Added a test to reproduce the issue.
- Introduced a `kick_secondary_downloads` config flag:
  - Enabled in testing environments.
  - Disabled in production (and in the new test).
- Updated `Attached(0)` locations to consider the number of secondaries in their intent when deciding whether to download the heatmap.
This commit is contained in:
committed by
GitHub
parent
85164422d0
commit
5eecde461d
@@ -5,6 +5,9 @@ use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, anyhow};
|
||||
use camino::Utf8PathBuf;
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
use clap::ArgAction;
|
||||
use clap::Parser;
|
||||
use futures::future::OptionFuture;
|
||||
use http_utils::tls_certs::ReloadingCertificateResolver;
|
||||
@@ -213,6 +216,13 @@ struct Cli {
|
||||
/// This option exists primarily for testing purposes.
|
||||
#[arg(long, default_value = "3", value_parser = clap::value_parser!(i64).range(1..))]
|
||||
timeline_safekeeper_count: i64,
|
||||
|
||||
/// When set, actively checks and initiates heatmap downloads/uploads during reconciliation.
|
||||
/// This speeds up migrations by avoiding the default wait for the heatmap download interval.
|
||||
/// Primarily useful for testing to reduce test execution time.
|
||||
#[cfg(feature = "testing")]
|
||||
#[arg(long, default_value = "true", action=ArgAction::Set)]
|
||||
kick_secondary_downloads: bool,
|
||||
}
|
||||
|
||||
enum StrictMode {
|
||||
@@ -445,6 +455,8 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
timelines_onto_safekeepers: args.timelines_onto_safekeepers,
|
||||
use_local_compute_notifications: args.use_local_compute_notifications,
|
||||
timeline_safekeeper_count: args.timeline_safekeeper_count,
|
||||
#[cfg(feature = "testing")]
|
||||
kick_secondary_downloads: args.kick_secondary_downloads,
|
||||
};
|
||||
|
||||
// Validate that we can connect to the database
|
||||
|
||||
@@ -856,6 +856,7 @@ impl Reconciler {
|
||||
&self.shard,
|
||||
&self.config,
|
||||
&self.placement_policy,
|
||||
self.intent.secondary.len(),
|
||||
);
|
||||
match self.observed.locations.get(&node.get_id()) {
|
||||
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
|
||||
@@ -1235,11 +1236,11 @@ pub(crate) fn attached_location_conf(
|
||||
shard: &ShardIdentity,
|
||||
config: &TenantConfig,
|
||||
policy: &PlacementPolicy,
|
||||
secondary_count: usize,
|
||||
) -> LocationConfig {
|
||||
let has_secondaries = match policy {
|
||||
PlacementPolicy::Attached(0) | PlacementPolicy::Detached | PlacementPolicy::Secondary => {
|
||||
false
|
||||
}
|
||||
PlacementPolicy::Detached | PlacementPolicy::Secondary => false,
|
||||
PlacementPolicy::Attached(0) => secondary_count > 0,
|
||||
PlacementPolicy::Attached(_) => true,
|
||||
};
|
||||
|
||||
|
||||
@@ -470,6 +470,9 @@ pub struct Config {
|
||||
/// Number of safekeepers to choose for a timeline when creating it.
|
||||
/// Safekeepers will be chosen from different availability zones.
|
||||
pub timeline_safekeeper_count: i64,
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
pub kick_secondary_downloads: bool,
|
||||
}
|
||||
|
||||
impl From<DatabaseError> for ApiError {
|
||||
@@ -2064,6 +2067,7 @@ impl Service {
|
||||
&tenant_shard.shard,
|
||||
&tenant_shard.config,
|
||||
&PlacementPolicy::Attached(0),
|
||||
tenant_shard.intent.get_secondary().len(),
|
||||
)),
|
||||
},
|
||||
)]);
|
||||
@@ -5605,7 +5609,15 @@ impl Service {
|
||||
for parent_id in parent_ids {
|
||||
let child_ids = parent_id.split(new_shard_count);
|
||||
|
||||
let (pageserver, generation, policy, parent_ident, config, preferred_az) = {
|
||||
let (
|
||||
pageserver,
|
||||
generation,
|
||||
policy,
|
||||
parent_ident,
|
||||
config,
|
||||
preferred_az,
|
||||
secondary_count,
|
||||
) = {
|
||||
let mut old_state = tenants
|
||||
.remove(&parent_id)
|
||||
.expect("It was present, we just split it");
|
||||
@@ -5625,6 +5637,7 @@ impl Service {
|
||||
old_state.shard,
|
||||
old_state.config.clone(),
|
||||
old_state.preferred_az().cloned(),
|
||||
old_state.intent.get_secondary().len(),
|
||||
)
|
||||
};
|
||||
|
||||
@@ -5646,6 +5659,7 @@ impl Service {
|
||||
&child_shard,
|
||||
&config,
|
||||
&policy,
|
||||
secondary_count,
|
||||
)),
|
||||
},
|
||||
);
|
||||
@@ -8373,6 +8387,11 @@ impl Service {
|
||||
/// we have this helper to move things along faster.
|
||||
#[cfg(feature = "testing")]
|
||||
async fn kick_secondary_download(&self, tenant_shard_id: TenantShardId) {
|
||||
if !self.config.kick_secondary_downloads {
|
||||
// No-op if kick_secondary_downloads functionality is not configured
|
||||
return;
|
||||
}
|
||||
|
||||
let (attached_node, secondaries) = {
|
||||
let locked = self.inner.read().unwrap();
|
||||
let Some(shard) = locked.tenants.get(&tenant_shard_id) else {
|
||||
|
||||
@@ -1381,8 +1381,13 @@ impl TenantShard {
|
||||
.generation
|
||||
.expect("Attempted to enter attached state without a generation");
|
||||
|
||||
let wanted_conf =
|
||||
attached_location_conf(generation, &self.shard, &self.config, &self.policy);
|
||||
let wanted_conf = attached_location_conf(
|
||||
generation,
|
||||
&self.shard,
|
||||
&self.config,
|
||||
&self.policy,
|
||||
self.intent.get_secondary().len(),
|
||||
);
|
||||
match self.observed.locations.get(&node_id) {
|
||||
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
|
||||
Some(_) | None => {
|
||||
|
||||
Reference in New Issue
Block a user