Mirror of https://github.com/neondatabase/neon.git, synced 2026-01-05 20:42:54 +00:00
storage controller: improve consistency_check_api (#10363)
## Problem

Limitations found while using this API to investigate https://github.com/neondatabase/neon/issues/10234:
- If we hit a node consistency issue, we drop out and don't check shards for consistency.
- The messages printed after a shard consistency issue are huge, and Grafana appears to drop them.

## Summary of changes

- Defer node consistency errors until the end of the function, so that we always proceed to check shards for consistency.
- Print out smaller log lines that just point out the diffs between expected and persistent state.
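The deferral uses a common Rust idiom: bind the early check's `Result` to a variable instead of `return`ing it, run the remaining checks, and return the bound value at the end. A minimal standalone sketch of that pattern follows; `check_all`, its arguments, and the `String` error type are illustrative stand-ins, not the storage controller's actual `Service` method (which returns `Result<(), ApiError>`):

```rust
// Sketch of the deferred-error pattern. All names here are made up for
// illustration; only the control-flow shape mirrors the patch below.
fn check_all(nodes: &[u64], expect_nodes: &[u64], shards_consistent: bool) -> Result<(), String> {
    // Bind the node-check result instead of returning early, so the
    // shard check below always runs (and logs) even on a node error.
    let node_result = if nodes != expect_nodes {
        Err("Node consistency failure".to_string())
    } else {
        Ok(())
    };

    if !shards_consistent {
        // Shard errors still return immediately, as in the patch.
        return Err("Shard consistency failure".to_string());
    }

    // Surface any deferred node error only after the shard checks ran.
    node_result
}

fn main() {
    // Both checks fail: the shard error is returned, not the node error.
    assert_eq!(
        check_all(&[1], &[1, 2], false),
        Err("Shard consistency failure".to_string())
    );
}
```

With this shape, a shard failure takes precedence when both checks fail: the shard branch returns before `node_result` is reached, which matches the behavior of the diff below.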
```diff
@@ -5256,7 +5256,8 @@ impl Service {
         expect_nodes.sort_by_key(|n| n.node_id);
         nodes.sort_by_key(|n| n.node_id);
 
-        if nodes != expect_nodes {
+        // Errors relating to nodes are deferred so that we don't skip the shard checks below if we have a node error
+        let node_result = if nodes != expect_nodes {
             tracing::error!("Consistency check failed on nodes.");
             tracing::error!(
                 "Nodes in memory: {}",
@@ -5268,10 +5269,12 @@ impl Service {
                 serde_json::to_string(&nodes)
                     .map_err(|e| ApiError::InternalServerError(e.into()))?
             );
-            return Err(ApiError::InternalServerError(anyhow::anyhow!(
+            Err(ApiError::InternalServerError(anyhow::anyhow!(
                 "Node consistency failure"
-            )));
-        }
+            )))
+        } else {
+            Ok(())
+        };
 
         let mut persistent_shards = self.persistence.load_active_tenant_shards().await?;
         persistent_shards
@@ -5281,6 +5284,7 @@ impl Service {
 
         if persistent_shards != expect_shards {
             tracing::error!("Consistency check failed on shards.");
+
             tracing::error!(
                 "Shards in memory: {}",
                 serde_json::to_string(&expect_shards)
@@ -5291,12 +5295,57 @@ impl Service {
                 serde_json::to_string(&persistent_shards)
                     .map_err(|e| ApiError::InternalServerError(e.into()))?
             );
+
+            // The total dump log lines above are useful in testing but in the field grafana will
+            // usually just drop them because they're so large. So we also do some explicit logging
+            // of just the diffs.
+            let persistent_shards = persistent_shards
+                .into_iter()
+                .map(|tsp| (tsp.get_tenant_shard_id().unwrap(), tsp))
+                .collect::<HashMap<_, _>>();
+            let expect_shards = expect_shards
+                .into_iter()
+                .map(|tsp| (tsp.get_tenant_shard_id().unwrap(), tsp))
+                .collect::<HashMap<_, _>>();
+            for (tenant_shard_id, persistent_tsp) in &persistent_shards {
+                match expect_shards.get(tenant_shard_id) {
+                    None => {
+                        tracing::error!(
+                            "Shard {} found in database but not in memory",
+                            tenant_shard_id
+                        );
+                    }
+                    Some(expect_tsp) => {
+                        if expect_tsp != persistent_tsp {
+                            tracing::error!(
+                                "Shard {} is inconsistent. In memory: {}, database has: {}",
+                                tenant_shard_id,
+                                serde_json::to_string(expect_tsp).unwrap(),
+                                serde_json::to_string(&persistent_tsp).unwrap()
+                            );
+                        }
+                    }
+                }
+            }
+
+            // Having already logged any differences, log any shards that simply aren't present in the database
+            for (tenant_shard_id, memory_tsp) in &expect_shards {
+                if !persistent_shards.contains_key(tenant_shard_id) {
+                    tracing::error!(
+                        "Shard {} found in memory but not in database: {}",
+                        tenant_shard_id,
+                        serde_json::to_string(memory_tsp)
+                            .map_err(|e| ApiError::InternalServerError(e.into()))?
+                    );
+                }
+            }
+
             return Err(ApiError::InternalServerError(anyhow::anyhow!(
                 "Shard consistency failure"
             )));
         }
 
-        Ok(())
+        node_result
     }
 
     /// For debug/support: a JSON dump of the [`Scheduler`]. Returns a response so that
```
||||
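For reference, the diff-logging half of the change reduces to a symmetric comparison of two collections keyed by tenant shard ID: report keys present on only one side, and keys whose values differ. A sketch of that shape under simplified assumptions, with plain `String` keys and values standing in for the real shard ID and persistence types, and `eprintln!` standing in for `tracing::error!`:

```rust
use std::collections::HashMap;

// Sketch of the per-shard diff logging: emit one line per discrepancy
// instead of dumping both full collections.
fn log_diffs(expected: &HashMap<String, String>, persistent: &HashMap<String, String>) {
    // Entries in the database: either missing from memory or differing.
    for (id, persistent_value) in persistent {
        match expected.get(id) {
            None => eprintln!("Shard {id} found in database but not in memory"),
            Some(expected_value) if expected_value != persistent_value => eprintln!(
                "Shard {id} is inconsistent. In memory: {expected_value}, database has: {persistent_value}"
            ),
            Some(_) => {} // consistent: nothing to log
        }
    }
    // Entries in memory but missing from the database.
    for (id, value) in expected {
        if !persistent.contains_key(id) {
            eprintln!("Shard {id} found in memory but not in database: {value}");
        }
    }
}

fn main() {
    let expected = HashMap::from([("shard-a".to_string(), "v1".to_string())]);
    let persistent = HashMap::from([("shard-b".to_string(), "v1".to_string())]);
    // Logs one line for shard-b (database only) and one for shard-a (memory only).
    log_diffs(&expected, &persistent);
}
```

Keying both sides by ID first makes each lookup cheap and keeps every log line scoped to a single shard, which is what lets Grafana ingest the output where the full JSON dumps were being dropped.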