Compare commits

...

3 Commits

Author      SHA1        Message                                                              Date
John Spray  62aa8f2ba2  hacky way to make tests pass                                         2024-10-04 09:56:56 +00:00
John Spray  efc3f1cfe5  DNM: test-log                                                        2024-10-03 11:29:06 +00:00
John Spray  b66bf890fa  storage controller: fix a disagreement between schedule + optimize  2024-10-03 11:14:34 +00:00
                        Closes: https://github.com/neondatabase/neon/issues/8969
5 changed files with 312 additions and 27 deletions

Cargo.lock (generated)

@@ -82,12 +82,27 @@ dependencies = [
"anstyle",
"anstyle-parse",
"anstyle-query",
"anstyle-wincon",
"anstyle-wincon 1.0.1",
"colorchoice",
"is-terminal",
"utf8parse",
]
[[package]]
name = "anstream"
version = "0.6.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "64e15c1ab1f89faffbf04a634d5e1962e9074f2741eef6d97f3c4e322426d526"
dependencies = [
"anstyle",
"anstyle-parse",
"anstyle-query",
"anstyle-wincon 3.0.4",
"colorchoice",
"is_terminal_polyfill",
"utf8parse",
]
[[package]]
name = "anstyle"
version = "1.0.8"
@@ -122,6 +137,16 @@ dependencies = [
"windows-sys 0.48.0",
]
[[package]]
name = "anstyle-wincon"
version = "3.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5bf74e1b6e971609db8ca7a9ce79fd5768ab6ae46441c572e46cf596f59e57f8"
dependencies = [
"anstyle",
"windows-sys 0.52.0",
]
[[package]]
name = "anyhow"
version = "1.0.71"
@@ -1185,7 +1210,7 @@ version = "4.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4f423e341edefb78c9caba2d9c7f7687d0e72e89df3ce3394554754393ac3990"
dependencies = [
"anstream",
"anstream 0.3.2",
"anstyle",
"bitflags 1.3.2",
"clap_lex",
@@ -1924,6 +1949,15 @@ dependencies = [
"syn 2.0.52",
]
[[package]]
name = "env_filter"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4f2c92ceda6ceec50f43169f9ee8424fe2db276791afde7b2cd8bc084cb376ab"
dependencies = [
"log",
]
[[package]]
name = "env_logger"
version = "0.10.2"
@@ -1937,6 +1971,18 @@ dependencies = [
"termcolor",
]
[[package]]
name = "env_logger"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c012a26a7f605efc424dd53697843a72be7dc86ad2d01f7814337794a12231d"
dependencies = [
"anstream 0.6.15",
"anstyle",
"env_filter",
"log",
]
[[package]]
name = "equivalent"
version = "1.0.1"
@@ -2811,6 +2857,12 @@ dependencies = [
"windows-sys 0.52.0",
]
[[package]]
name = "is_terminal_polyfill"
version = "1.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
[[package]]
name = "itertools"
version = "0.10.5"
@@ -3254,6 +3306,16 @@ dependencies = [
"winapi",
]
[[package]]
name = "nu-ansi-term"
version = "0.46.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
dependencies = [
"overload",
"winapi",
]
[[package]]
name = "num"
version = "0.4.1"
@@ -3537,6 +3599,12 @@ version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"
[[package]]
name = "overload"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
[[package]]
name = "p256"
version = "0.11.1"
@@ -4119,7 +4187,7 @@ dependencies = [
"bindgen",
"bytes",
"crc32c",
"env_logger",
"env_logger 0.10.2",
"log",
"memoffset 0.9.0",
"once_cell",
@@ -4312,7 +4380,7 @@ dependencies = [
"consumption_metrics",
"dashmap",
"ecdsa 0.16.9",
"env_logger",
"env_logger 0.10.2",
"fallible-iterator",
"framed-websockets",
"futures",
@@ -5787,6 +5855,7 @@ dependencies = [
"serde_json",
"strum",
"strum_macros",
"test-log",
"thiserror",
"tokio",
"tokio-util",
@@ -6039,6 +6108,28 @@ dependencies = [
"syn 2.0.52",
]
[[package]]
name = "test-log"
version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3dffced63c2b5c7be278154d76b479f9f9920ed34e7574201407f0b14e2bbb93"
dependencies = [
"env_logger 0.11.2",
"test-log-macros",
"tracing-subscriber",
]
[[package]]
name = "test-log-macros"
version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5999e24eaa32083191ba4e425deb75cdf25efefabe5aaccb7446dd0d4122a3f5"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.52",
]
[[package]]
name = "thiserror"
version = "1.0.57"
@@ -6570,6 +6661,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b"
dependencies = [
"matchers",
"nu-ansi-term",
"once_cell",
"regex",
"serde",
@@ -6874,7 +6966,7 @@ dependencies = [
"anyhow",
"camino-tempfile",
"clap",
"env_logger",
"env_logger 0.10.2",
"log",
"postgres",
"postgres_ffi",


@@ -56,3 +56,6 @@ utils = { path = "../libs/utils/" }
metrics = { path = "../libs/metrics/" }
control_plane = { path = "../control_plane" }
workspace_hack = { version = "0.1", path = "../workspace_hack" }
[dev-dependencies]
test-log = "*"
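The wildcard dev-dependency above pulls in the test-log crate used by the DNM commit. As a rough illustration only (the test body below is not part of this change), annotating a test with #[test_log::test] initialises the logging backend before the test body runs, so log/tracing output from the code under test is visible when a test fails:

// Illustrative sketch, not code from this branch: test-log wraps the standard
// #[test] attribute and sets up env_logger / tracing-subscriber first.
#[test_log::test]
fn logging_is_captured() {
    log::info!("emitted through the initialised logger");
    assert_eq!(2 + 2, 4);
}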


@@ -206,6 +206,10 @@ pub(crate) struct NodeSecondarySchedulingScore {
/// The number of shards belonging to the tenant currently being
/// scheduled that are attached to this node.
affinity_score: AffinityScore,
/// Size of [`ScheduleContext::secondary_nodes`] for the current node: the
/// number of secondary locations belonging to the tenant currently being
/// scheduled that are already on this node.
secondary_shards_in_context: usize,
/// Utilisation score that combines shard count and disk utilisation
utilization_score: u64,
/// Total number of shards attached to this node. When nodes have identical utilisation, this
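The placement of the new secondary_shards_in_context field matters, assuming (as the surrounding score types suggest) that scores are compared with a derived, field-by-field Ord where lower is better. A self-contained illustration with a stand-in type, not the controller's own:

// Stand-in type for illustration: derived Ord compares fields in declaration
// order, so an earlier field dominates everything declared after it.
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug)]
struct Score {
    affinity: usize,               // compared first
    secondaries_in_context: usize, // breaks ties between equal affinity
    utilization: u64,              // only consulted when everything above ties
}

fn main() {
    let a = Score { affinity: 1, secondaries_in_context: 0, utilization: 90 };
    let b = Score { affinity: 1, secondaries_in_context: 2, utilization: 10 };
    // `a` is preferred despite its higher utilization, because the tie-breaking
    // field declared earlier in the struct is smaller.
    assert!(a < b);
}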
@@ -231,6 +235,7 @@ impl NodeSchedulingScore for NodeSecondarySchedulingScore {
Some(Self {
az_match: SecondaryAzMatch(AzMatch::new(&node.az, preferred_az.as_ref())),
secondary_shards_in_context: context.secondary_nodes.get(node_id).copied().unwrap_or(0),
affinity_score: context
.nodes
.get(node_id)
@@ -327,6 +332,9 @@ pub(crate) struct ScheduleContext {
/// Specifically how many _attached_ locations are on each node
pub(crate) attached_nodes: HashMap<NodeId, usize>,
/// Specifically how many _secondary_ locations are on each node
pub(crate) secondary_nodes: HashMap<NodeId, usize>,
pub(crate) mode: ScheduleMode,
}
@@ -345,6 +353,11 @@ impl ScheduleContext {
*entry += 1;
}
pub(crate) fn push_secondary(&mut self, node_id: NodeId) {
let entry = self.secondary_nodes.entry(node_id).or_default();
*entry += 1;
}
pub(crate) fn get_node_affinity(&self, node_id: NodeId) -> AffinityScore {
self.nodes
.get(&node_id)
@@ -786,7 +799,14 @@ pub(crate) mod test_utils {
#[cfg(test)]
mod tests {
use pageserver_api::{controller_api::NodeAvailability, models::utilization::test_utilization};
use pageserver_api::{
controller_api::NodeAvailability, models::utilization::test_utilization,
shard::ShardIdentity, shard::TenantShardId,
};
use utils::{
id::TenantId,
shard::{ShardCount, ShardNumber},
};
use super::*;
@@ -1074,4 +1094,171 @@ mod tests {
intent.clear(&mut scheduler);
}
}
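/// Exercises the schedule/optimize disagreement this branch addresses (see
/// https://github.com/neondatabase/neon/issues/8969): after all eight shards of
/// the tenant are scheduled against a shared context, optimize_attachment is
/// expected to propose no changes.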
#[test]
fn repro_foo() {
let az_tag = AvailabilityZone("az-a".to_string());
let nodes = test_utils::make_test_nodes(
5,
&[
az_tag.clone(),
az_tag.clone(),
az_tag.clone(),
az_tag.clone(),
az_tag.clone(),
],
);
let mut scheduler = Scheduler::new(nodes.values());
// Need to keep these alive because they contribute to shard counts via RAII
let mut scheduled_shards = Vec::new();
let mut context = ScheduleContext::default();
fn schedule_shard(
tenant_shard_id: TenantShardId,
expect_attached: NodeId,
expect_secondary: NodeId,
scheduled_shards: &mut Vec<TenantShard>,
scheduler: &mut Scheduler,
context: &mut ScheduleContext,
) {
let shard_identity = ShardIdentity::new(
tenant_shard_id.shard_number,
tenant_shard_id.shard_count,
pageserver_api::shard::ShardStripeSize(1),
)
.unwrap();
let mut shard = TenantShard::new(
tenant_shard_id,
shard_identity,
pageserver_api::controller_api::PlacementPolicy::Attached(1),
);
shard.schedule(scheduler, context).unwrap();
assert_eq!(shard.intent.get_attached().unwrap(), expect_attached);
assert_eq!(
shard.intent.get_secondary().first().unwrap(),
&expect_secondary
);
scheduled_shards.push(shard);
}
let tenant_id = TenantId::generate();
schedule_shard(
TenantShardId {
tenant_id,
shard_number: ShardNumber(0),
shard_count: ShardCount(8),
},
NodeId(1),
NodeId(2),
&mut scheduled_shards,
&mut scheduler,
&mut context,
);
schedule_shard(
TenantShardId {
tenant_id,
shard_number: ShardNumber(1),
shard_count: ShardCount(8),
},
NodeId(3),
NodeId(4),
&mut scheduled_shards,
&mut scheduler,
&mut context,
);
schedule_shard(
TenantShardId {
tenant_id,
shard_number: ShardNumber(2),
shard_count: ShardCount(8),
},
NodeId(5),
NodeId(1),
&mut scheduled_shards,
&mut scheduler,
&mut context,
);
schedule_shard(
TenantShardId {
tenant_id,
shard_number: ShardNumber(3),
shard_count: ShardCount(8),
},
NodeId(2),
NodeId(3),
&mut scheduled_shards,
&mut scheduler,
&mut context,
);
schedule_shard(
TenantShardId {
tenant_id,
shard_number: ShardNumber(4),
shard_count: ShardCount(8),
},
NodeId(4),
NodeId(5),
&mut scheduled_shards,
&mut scheduler,
&mut context,
);
schedule_shard(
TenantShardId {
tenant_id,
shard_number: ShardNumber(5),
shard_count: ShardCount(8),
},
NodeId(1),
NodeId(2),
&mut scheduled_shards,
&mut scheduler,
&mut context,
);
schedule_shard(
TenantShardId {
tenant_id,
shard_number: ShardNumber(6),
shard_count: ShardCount(8),
},
NodeId(3),
NodeId(4),
&mut scheduled_shards,
&mut scheduler,
&mut context,
);
schedule_shard(
TenantShardId {
tenant_id,
shard_number: ShardNumber(7),
shard_count: ShardCount(8),
},
NodeId(5),
NodeId(1),
&mut scheduled_shards,
&mut scheduler,
&mut context,
);
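// All shards are placed; with the same fully populated context, the optimiser
// should agree with the scheduler and leave every attachment where it is.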
for shard in &scheduled_shards {
assert_eq!(shard.optimize_attachment(&nodes, &context), None);
}
for mut shard in scheduled_shards {
shard.intent.clear(&mut scheduler);
}
}
}


@@ -5858,10 +5858,7 @@ impl Service {
// Accumulate the schedule context for all the shards in a tenant: we must have
// the total view of all shards before we can try to optimize any of them.
schedule_context.avoid(&shard.intent.all_pageservers());
if let Some(attached) = shard.intent.get_attached() {
schedule_context.push_attached(*attached);
}
shard.populate_context(&mut schedule_context);
tenant_shards.push(shard);
// Once we have seen the last shard in the tenant, proceed to search across all shards
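
The pattern here is to let every shard of a tenant contribute to the schedule context first, and only then ask any shard whether a better placement exists. A simplified, self-contained analogue of that two-pass flow (Context and Shard below are stand-ins, not the controller's ScheduleContext and TenantShard):

use std::collections::HashMap;

// Stand-in types, shaped after the counters the real ScheduleContext keeps.
#[derive(Default)]
struct Context {
    attached: HashMap<u64, usize>,
    secondary: HashMap<u64, usize>,
}

struct Shard {
    attached_node: u64,
    secondary_nodes: Vec<u64>,
}

impl Shard {
    // Analogue of populate_context: contribute attached and secondary locations.
    fn populate_context(&self, ctx: &mut Context) {
        *ctx.attached.entry(self.attached_node).or_default() += 1;
        for n in &self.secondary_nodes {
            *ctx.secondary.entry(*n).or_default() += 1;
        }
    }
}

fn main() {
    let shards = vec![
        Shard { attached_node: 1, secondary_nodes: vec![2] },
        Shard { attached_node: 3, secondary_nodes: vec![4] },
    ];
    // Pass 1: accumulate the whole tenant's view before deciding anything.
    let mut ctx = Context::default();
    for s in &shards {
        s.populate_context(&mut ctx);
    }
    // Pass 2 (elided): only now would each shard be asked whether a better
    // placement exists, so all shards see identical, complete counts.
    assert_eq!(ctx.attached[&1], 1);
    assert_eq!(ctx.secondary[&4], 1);
}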


@@ -566,10 +566,7 @@ impl TenantShard {
) -> Result<(), ScheduleError> {
let r = self.do_schedule(scheduler, context);
context.avoid(&self.intent.all_pageservers());
if let Some(attached) = self.intent.get_attached() {
context.push_attached(*attached);
}
self.populate_context(context);
r
}
@@ -676,6 +673,19 @@ impl TenantShard {
Ok(())
}
/// When building the ScheduleContext of a tenant, call this on each shard to
/// add its contribution to the context.
pub(crate) fn populate_context(&self, context: &mut ScheduleContext) {
context.avoid(&self.intent.all_pageservers());
if let Some(attached) = self.intent.get_attached() {
context.push_attached(*attached);
}
for secondary in self.intent.get_secondary() {
context.push_secondary(*secondary);
}
}
/// Reschedule this tenant shard to one of its secondary locations. Returns a scheduling error
/// if the swap is not possible and leaves the intent state in its original state.
///
@@ -823,10 +833,13 @@ impl TenantShard {
continue;
};
// TODO: make this AZ aware: secondary should be chosen "As if I am an attachment, but
// in a different AZ to my actual preferred AZ"
// Let the scheduler suggest a node, where it would put us if we were scheduling afresh
// This implicitly limits the choice to nodes that are available, and prefers nodes
// with lower utilization.
let Ok(candidate_node) = scheduler.schedule_shard::<SecondaryShardTag>(
let Ok(candidate_node) = scheduler.schedule_shard::<AttachedShardTag>(
&self.intent.all_pageservers(),
&self.preferred_az_id,
schedule_context,
@@ -1632,10 +1645,8 @@ pub(crate) mod tests {
shard_b.intent.push_secondary(&mut scheduler, NodeId(3));
let mut schedule_context = ScheduleContext::default();
schedule_context.avoid(&shard_a.intent.all_pageservers());
schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
schedule_context.avoid(&shard_b.intent.all_pageservers());
schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
shard_a.populate_context(&mut schedule_context);
shard_b.populate_context(&mut schedule_context);
let optimization_a = shard_a.optimize_attachment(&nodes, &schedule_context);
@@ -1699,10 +1710,8 @@ pub(crate) mod tests {
shard_b.intent.push_secondary(&mut scheduler, NodeId(3));
let mut schedule_context = ScheduleContext::default();
schedule_context.avoid(&shard_a.intent.all_pageservers());
schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
schedule_context.avoid(&shard_b.intent.all_pageservers());
schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
shard_a.populate_context(&mut schedule_context);
shard_b.populate_context(&mut schedule_context);
let optimization_a = shard_a.optimize_secondary(&mut scheduler, &schedule_context);
@@ -1744,10 +1753,7 @@ pub(crate) mod tests {
let mut any_changed = false;
for shard in shards.iter() {
schedule_context.avoid(&shard.intent.all_pageservers());
if let Some(attached) = shard.intent.get_attached() {
schedule_context.push_attached(*attached);
}
shard.populate_context(&mut schedule_context);
}
for shard in shards.iter_mut() {