mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-27 10:00:38 +00:00
## Problem Shard splits worked, but weren't safe against failures (e.g. node crash during split) yet. Related: #6676 ## Summary of changes - Introduce async rwlocks at the scope of Tenant and Node: - exclusive tenant lock is used to protect splits - exclusive node lock is used to protect new reconciliation process that happens when setting node active - exclusive locks used in both cases when doing persistent updates (e.g. node scheduling conf) where the update to DB & in-memory state needs to be atomic. - Add failpoints to shard splitting in control plane and pageserver code. - Implement error handling in control plane for shard splits: this detaches child shards and ensures parent shards are re-attached. - Crash-safety for storage controller restarts requires little effort: we already reconcile with nodes over a storage controller restart, so as long as we reset any incomplete splits in the DB on restart (added in this PR), things are implicitly cleaned up. - Implement reconciliation with offline nodes before they transition to active: - (in this context reconciliation means something like startup_reconcile, not literally the Reconciler) - This covers cases where split abort cannot reach a node to clean it up: the cleanup will eventually happen when the node is marked active, as part of reconciliation. - This also covers the case where a node was unavailable when the storage controller started, but becomes available later: previously this allowed it to skip the startup reconcile. - Storage controller now terminates on panics. We only use panics for true "should never happen" assertions, and these cases can leave us in an unusable state if we keep running (e.g. panicking in a shard split). In the unlikely event that we get into a crashloop as a result, we'll rely on kubernetes to back us off. - Add `test_sharding_split_failures` which exercises a variety of failure cases during shard split.
49 lines
1.2 KiB
TOML
49 lines
1.2 KiB
TOML
# Package metadata. `edition` and `license` are inherited from the
# workspace manifest rather than being set here.
[package]
name = "attachment_service"
version = "0.1.0"
edition.workspace = true
license.workspace = true

# The executable is named `storage_controller`, which differs from the
# package name (`attachment_service`); it is built from src/main.rs.
[[bin]]
name = "storage_controller"
path = "src/main.rs"

[features]
# No features are enabled unless requested explicitly.
default = []
# Enables test-only APIs and behaviors
testing = []

[dependencies]
# Dependencies whose version and features are inherited from the
# workspace manifest.
anyhow.workspace = true
aws-config.workspace = true
aws-sdk-secretsmanager.workspace = true
camino.workspace = true
clap.workspace = true
fail.workspace = true
futures.workspace = true
git-version.workspace = true
hex.workspace = true
humantime.workspace = true
hyper.workspace = true
once_cell.workspace = true
pageserver_api.workspace = true
pageserver_client.workspace = true
postgres_connection.workspace = true
reqwest.workspace = true
serde.workspace = true
serde_json.workspace = true
thiserror.workspace = true
tokio.workspace = true
tokio-util.workspace = true
tracing.workspace = true

# Dependencies pinned in this manifest instead of being inherited from
# the workspace.
diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
diesel_migrations = { version = "2.1.0" }
r2d2 = { version = "0.8.10" }

# Crates referenced by relative path from this package's directory.
control_plane = { path = ".." }
metrics = { path = "../../libs/metrics/" }
utils = { path = "../../libs/utils/" }
workspace_hack = { version = "0.1", path = "../../workspace_hack" }