Mirror of https://github.com/neondatabase/neon.git, synced 2026-01-05 12:32:54 +00:00
storage controller: fix heatmaps getting disabled during shard split (#8197)
## Problem

At the start of `do_tenant_shard_split`, we drop any secondary location for the parent shards. The reconciler uses the presence of secondary locations as a condition for enabling heatmaps.

On the pageserver, child shards inherit their configuration from parents, but the storage controller assumes the child's ObservedState is the same as the parent's config from the prepare phase. The result is that some child shards end up with an inaccurate ObservedState, and until something next migrates or restarts, those tenant shards aren't uploading heatmaps, so their secondary locations are downloading everything that was resident at the moment of the split (including ancestor layers, which are often cleaned up shortly after the split).

Closes: https://github.com/neondatabase/neon/issues/8189

## Summary of changes

- Use `PlacementPolicy` to control enablement of heatmap upload, rather than the literal presence of secondaries in `IntentState`: this way we avoid switching heatmaps off during shard split. A standalone sketch of the before/after predicate follows below.
- Test: during the tenant split test, assert that the child shards have heatmap uploads enabled.
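To make the fix concrete, here is a minimal, self-contained sketch of the old and new predicates. The enum mirrors `pageserver_api::controller_api::PlacementPolicy`; the two helper functions and the `intent_secondaries` parameter are illustrative stand-ins, not the controller's actual API:

```rust
// Mirrors pageserver_api::controller_api::PlacementPolicy:
// Attached(n) means one attached location plus n secondaries.
#[allow(dead_code)]
#[derive(Clone, Debug)]
enum PlacementPolicy {
    Attached(usize),
    Secondary,
    Detached,
}

// Old predicate: heatmaps on iff the IntentState currently lists a secondary.
fn heatmaps_enabled_old(intent_secondaries: &[u64]) -> bool {
    !intent_secondaries.is_empty()
}

// New predicate: heatmaps on iff the policy *calls for* secondaries,
// regardless of whether any secondary location exists right now.
fn heatmaps_enabled_new(policy: &PlacementPolicy) -> bool {
    match policy {
        PlacementPolicy::Attached(0) | PlacementPolicy::Detached | PlacementPolicy::Secondary => {
            false
        }
        PlacementPolicy::Attached(_) => true,
    }
}

fn main() {
    let policy = PlacementPolicy::Attached(1);

    // Steady state: one secondary exists, both predicates agree.
    assert!(heatmaps_enabled_old(&[2]));
    assert!(heatmaps_enabled_new(&policy));

    // During a shard split the parent's secondaries are dropped: the old
    // predicate flips to false (and the child shards inherit that config),
    // while the new one stays true because the policy is unchanged.
    assert!(!heatmaps_enabled_old(&[]));
    assert!(heatmaps_enabled_new(&policy));
}
```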
```diff
@@ -1,6 +1,7 @@
 use crate::pageserver_client::PageserverClient;
 use crate::persistence::Persistence;
 use crate::service;
+use pageserver_api::controller_api::PlacementPolicy;
 use pageserver_api::models::{
     LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
 };
```
```diff
@@ -29,6 +30,7 @@ pub(super) struct Reconciler {
     /// of a tenant's state from when we spawned a reconcile task.
     pub(super) tenant_shard_id: TenantShardId,
     pub(crate) shard: ShardIdentity,
+    pub(crate) placement_policy: PlacementPolicy,
     pub(crate) generation: Option<Generation>,
     pub(crate) intent: TargetState,
 
```
```diff
@@ -641,7 +643,7 @@ impl Reconciler {
             generation,
             &self.shard,
             &self.config,
-            !self.intent.secondary.is_empty(),
+            &self.placement_policy,
         );
         match self.observed.locations.get(&node.get_id()) {
             Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
```
```diff
@@ -801,8 +803,15 @@ pub(crate) fn attached_location_conf(
     generation: Generation,
     shard: &ShardIdentity,
     config: &TenantConfig,
-    has_secondaries: bool,
+    policy: &PlacementPolicy,
 ) -> LocationConfig {
+    let has_secondaries = match policy {
+        PlacementPolicy::Attached(0) | PlacementPolicy::Detached | PlacementPolicy::Secondary => {
+            false
+        }
+        PlacementPolicy::Attached(_) => true,
+    };
+
     LocationConfig {
         mode: LocationConfigMode::AttachedSingle,
         generation: generation.into(),
```
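The hunk above computes `has_secondaries` inside the function, but its consumer sits below the visible context. Per the problem statement, the flag gates heatmap uploads: with no secondaries, nobody consumes the heatmap, so uploading one is wasted work. A hedged sketch of that kind of gating follows; the `TenantConfig` stand-in models only two fields (the real `pageserver_api::models::TenantConfig` has many more), and the exact wiring inside `attached_location_conf` is not shown in this diff:

```rust
use std::time::Duration;

// Simplified stand-in for pageserver_api::models::TenantConfig; only the
// fields needed for this sketch are modelled.
#[derive(Clone, Debug, PartialEq)]
struct TenantConfig {
    heatmap_period: Option<Duration>,
    compaction_threshold: usize,
}

// Illustrative gating: if the policy implies no secondaries, strip the
// heatmap period so the attached location stops uploading heatmaps.
fn effective_tenant_conf(config: &TenantConfig, has_secondaries: bool) -> TenantConfig {
    if has_secondaries {
        config.clone()
    } else {
        TenantConfig {
            heatmap_period: None,
            ..config.clone()
        }
    }
}

fn main() {
    let base = TenantConfig {
        heatmap_period: Some(Duration::from_secs(60)),
        compaction_threshold: 10,
    };
    // With secondaries the config passes through unchanged...
    assert_eq!(effective_tenant_conf(&base, true), base);
    // ...without them, heatmap uploads are switched off.
    assert_eq!(effective_tenant_conf(&base, false).heatmap_period, None);
}
```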
```diff
@@ -1390,7 +1390,7 @@ impl Service {
                     tenant_shard.generation.unwrap(),
                     &tenant_shard.shard,
                     &tenant_shard.config,
-                    false,
+                    &PlacementPolicy::Attached(0),
                 )),
             },
         )]);
```
```diff
@@ -3321,7 +3321,7 @@ impl Service {
                         generation,
                         &child_shard,
                         &config,
-                        matches!(policy, PlacementPolicy::Attached(n) if n > 0),
+                        &policy,
                     )),
                 },
             );
```
```diff
@@ -908,12 +908,8 @@ impl TenantShard {
                     .generation
                     .expect("Attempted to enter attached state without a generation");
 
-                let wanted_conf = attached_location_conf(
-                    generation,
-                    &self.shard,
-                    &self.config,
-                    !self.intent.secondary.is_empty(),
-                );
+                let wanted_conf =
+                    attached_location_conf(generation, &self.shard, &self.config, &self.policy);
                 match self.observed.locations.get(&node_id) {
                     Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
                     Some(_) | None => {
```
```diff
@@ -1099,6 +1095,7 @@ impl TenantShard {
         let mut reconciler = Reconciler {
             tenant_shard_id: self.tenant_shard_id,
             shard: self.shard,
+            placement_policy: self.policy.clone(),
             generation: self.generation,
             intent: reconciler_intent,
             detach,
```
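Note that the new field is filled from `self.policy.clone()` rather than a shared reference, in line with the struct's doc comment about holding a tenant's state "from when we spawned a reconcile task": the reconcile task works against a stable snapshot, so a concurrent policy change cannot flip heatmap behaviour mid-reconcile. A generic sketch of that snapshot pattern, with illustrative names rather than the controller's actual types:

```rust
use std::sync::{Arc, Mutex};
use std::thread;

#[derive(Clone, Debug, PartialEq)]
enum PlacementPolicy {
    Attached(usize),
    Detached,
}

fn main() {
    let shared = Arc::new(Mutex::new(PlacementPolicy::Attached(1)));

    // Snapshot at spawn time, as Reconciler does with `self.policy.clone()`.
    let snapshot = shared.lock().unwrap().clone();
    let worker = thread::spawn(move || {
        // The task sees the policy as of spawn time, even if it changes later.
        assert_eq!(snapshot, PlacementPolicy::Attached(1));
    });

    // A concurrent mutation does not affect the in-flight task's snapshot.
    *shared.lock().unwrap() = PlacementPolicy::Detached;
    worker.join().unwrap();
}
```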
```diff
@@ -542,6 +542,13 @@ def test_sharding_split_smoke(
         for k, v in non_default_tenant_config.items():
             assert config.effective_config[k] == v
 
+        # Check that heatmap uploads remain enabled after shard split
+        # (https://github.com/neondatabase/neon/issues/8189)
+        assert (
+            config.effective_config["heatmap_period"]
+            and config.effective_config["heatmap_period"] != "0s"
+        )
+
     # Validate pageserver state: expect every child shard to have an attached and secondary location
     (total, attached) = get_node_shard_counts(env, tenant_ids=[tenant_id])
     assert sum(attached.values()) == split_shard_count
```