From e8395807a59478d007180a0e3943b1db937c4899 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 11 Dec 2024 19:43:40 +0000 Subject: [PATCH] storcon: allow for more concurrency in drain/fill operations (#10093) ## Problem We saw the drain/fill operations not drain fast enough in ap-southeast. ## Summary of changes These are some quick changes to speed it up: * double reconcile concurrency - this is now half of the available reconcile bandwidth * reduce the waiter polling timeout - this way we can spawn new reconciliations faster --- storage_controller/src/background_node_operations.rs | 2 +- storage_controller/src/service.rs | 6 ++++-- test_runner/regress/test_storage_controller.py | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/storage_controller/src/background_node_operations.rs b/storage_controller/src/background_node_operations.rs index 6f1355eb68..226d4942e7 100644 --- a/storage_controller/src/background_node_operations.rs +++ b/storage_controller/src/background_node_operations.rs @@ -3,7 +3,7 @@ use std::{borrow::Cow, fmt::Debug, fmt::Display}; use tokio_util::sync::CancellationToken; use utils::id::NodeId; -pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 32; +pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 64; #[derive(Copy, Clone)] pub(crate) struct Drain { diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index e82e84fe89..2600500a53 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -100,6 +100,8 @@ use crate::{ use context_iterator::TenantShardContextIterator; +const WAITER_FILL_DRAIN_POLL_TIMEOUT: Duration = Duration::from_millis(500); + // For operations that should be quick, like attaching a new tenant const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5); @@ -6798,7 +6800,7 @@ impl Service { } waiters = self - .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT) + .await_waiters_remainder(waiters, WAITER_FILL_DRAIN_POLL_TIMEOUT) .await; failpoint_support::sleep_millis_async!("sleepy-drain-loop", &cancel); @@ -7051,7 +7053,7 @@ impl Service { } waiters = self - .await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT) + .await_waiters_remainder(waiters, WAITER_FILL_DRAIN_POLL_TIMEOUT) .await; } diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 4d1784d45a..02da389809 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -2136,7 +2136,7 @@ def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder): env.start() tenant_count = 10 - shard_count_per_tenant = 8 + shard_count_per_tenant = 16 tenant_ids = [] for _ in range(0, tenant_count):