storcon: skip draining shard if it's secondary is lagging too much (#8644)

## Problem Migrations of tenant shards with cold secondaries are holding up drains in during production deployments. ## Summary of changes If a secondary locations is lagging by more than 256MiB (configurable, but that's the default), then skip cutting it over to the secondary as part of the node drain.
2026-01-08 05:52:55 +00:00 · 2024-08-09 15:45:07 +01:00
parent e6770d79fd
commit f5cef7bf7f
13 changed files with 666 additions and 110 deletions
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -92,6 +92,11 @@ struct Cli {
    /// Chaos testing
    #[arg(long)]
    chaos_interval: Option<humantime::Duration>,
+
+    // Maximum acceptable lag for the secondary location while draining
+    // a pageserver
+    #[arg(long)]
+    max_secondary_lag_bytes: Option<u64>,
 }

 enum StrictMode {
@@ -279,6 +284,7 @@ async fn async_main() -> anyhow::Result<()> {
            .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT),
        split_threshold: args.split_threshold,
        neon_local_repo_dir: args.neon_local_repo_dir,
+        max_secondary_lag_bytes: args.max_secondary_lag_bytes,
    };

    // After loading secrets & config, but before starting anything else, apply database migrations