From f4b03ddd7b4cb858430b16d642f5f80f11e8b5b1 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 10 Jul 2025 12:18:37 +0200 Subject: [PATCH] pageserver/client_grpc: reap idle pool resources (#12476) ## Problem The gRPC client pools don't reap idle resources. Touches #11735. Requires #12475. ## Summary of changes Reap idle pool resources (channels/clients/streams) after 3 minutes of inactivity. Also restructure the `StreamPool` to use a mutex rather than atomics for synchronization, for simplicity. This will be optimized later. --- Cargo.lock | 1 + pageserver/client_grpc/Cargo.toml | 4 + pageserver/client_grpc/src/pool.rs | 241 ++++++++++++++++++++++++----- 3 files changed, 207 insertions(+), 39 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index caed814d5f..4150944ad0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4506,6 +4506,7 @@ dependencies = [ "pageserver_page_api", "tokio", "tokio-stream", + "tokio-util", "tonic 0.13.1", "tracing", "utils", diff --git a/pageserver/client_grpc/Cargo.toml b/pageserver/client_grpc/Cargo.toml index 84e27abb84..ca224900ac 100644 --- a/pageserver/client_grpc/Cargo.toml +++ b/pageserver/client_grpc/Cargo.toml @@ -4,6 +4,9 @@ version = "0.1.0" edition.workspace = true license.workspace = true +[features] +testing = ["pageserver_api/testing"] + [dependencies] anyhow.workspace = true bytes.workspace = true @@ -13,6 +16,7 @@ pageserver_api.workspace = true pageserver_page_api.workspace = true tokio.workspace = true tokio-stream.workspace = true +tokio-util.workspace = true tonic.workspace = true tracing.workspace = true utils.workspace = true diff --git a/pageserver/client_grpc/src/pool.rs b/pageserver/client_grpc/src/pool.rs index 5a50004fd1..89b3bd646f 100644 --- a/pageserver/client_grpc/src/pool.rs +++ b/pageserver/client_grpc/src/pool.rs @@ -34,10 +34,12 @@ use std::num::NonZero; use std::ops::{Deref, DerefMut}; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, Mutex, Weak}; +use std::time::{Duration, Instant}; use futures::StreamExt as _; use tokio::sync::mpsc::{Receiver, Sender}; use tokio::sync::{OwnedSemaphorePermit, Semaphore, mpsc, oneshot}; +use tokio_util::sync::CancellationToken; use tonic::transport::{Channel, Endpoint}; use tracing::{error, warn}; @@ -45,6 +47,25 @@ use pageserver_page_api as page_api; use utils::id::{TenantId, TimelineId}; use utils::shard::ShardIndex; +/// Reap channels/clients/streams that have been idle for this long. +/// +/// TODO: this is per-pool. For nested pools, it can take up to 3x as long for a TCP connection to +/// be reaped. First, we must wait for an idle stream to be reaped, which marks its client as idle. +/// Then, we must wait for the idle client to be reaped, which marks its channel as idle. Then, we +/// must wait for the idle channel to be reaped. Is that a problem? Maybe not, we just have to +/// account for it when setting the reap threshold. Alternatively, we can immediately reap empty +/// channels, and/or stream pool clients. +const REAP_IDLE_THRESHOLD: Duration = match cfg!(any(test, feature = "testing")) { + false => Duration::from_secs(180), + true => Duration::from_secs(1), // exercise reaping in tests +}; + +/// Reap idle resources with this interval. +const REAP_IDLE_INTERVAL: Duration = match cfg!(any(test, feature = "testing")) { + false => Duration::from_secs(10), + true => Duration::from_secs(1), // exercise reaping in tests +}; + /// A gRPC channel pool, for a single Pageserver. A channel is shared by many clients (via HTTP/2 /// stream multiplexing), up to `clients_per_channel` -- a new channel will be spun up beyond this. /// The pool does not limit the number of channels, and instead relies on `ClientPool` or @@ -52,7 +73,6 @@ use utils::shard::ShardIndex; /// /// The pool is always wrapped in an outer `Arc`, to allow long-lived guards across tasks/threads. /// -/// TODO: reap idle channels. /// TODO: consider prewarming a set of channels, to avoid initial connection latency. /// TODO: consider adding a circuit breaker for errors and fail fast. pub struct ChannelPool { @@ -62,6 +82,8 @@ pub struct ChannelPool { max_clients_per_channel: NonZero, /// Open channels. channels: Mutex>, + /// Reaps idle channels. + idle_reaper: Reaper, /// Channel ID generator. next_channel_id: AtomicUsize, } @@ -73,6 +95,9 @@ struct ChannelEntry { channel: Channel, /// Number of clients using this channel. clients: usize, + /// The channel has been idle (no clients) since this time. None if channel is in use. + /// INVARIANT: Some if clients == 0, otherwise None. + idle_since: Option, } impl ChannelPool { @@ -82,12 +107,15 @@ impl ChannelPool { E: TryInto + Send + Sync + 'static, >::Error: std::error::Error + Send + Sync, { - Ok(Arc::new(Self { + let pool = Arc::new(Self { endpoint: endpoint.try_into()?, max_clients_per_channel, channels: Mutex::default(), + idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL), next_channel_id: AtomicUsize::default(), - })) + }); + pool.idle_reaper.spawn(&pool); + Ok(pool) } /// Acquires a gRPC channel for a client. Multiple clients may acquire the same channel. @@ -116,8 +144,14 @@ impl ChannelPool { entry.clients <= self.max_clients_per_channel.get(), "channel overflow" ); + assert_eq!( + entry.idle_since.is_some(), + entry.clients == 0, + "incorrect channel idle state" + ); if entry.clients < self.max_clients_per_channel.get() { entry.clients += 1; + entry.idle_since = None; return ChannelGuard { pool: Arc::downgrade(self), id, @@ -134,6 +168,7 @@ impl ChannelPool { let entry = ChannelEntry { channel: channel.clone(), clients: 1, // account for the guard below + idle_since: None, }; channels.insert(id, entry); @@ -145,6 +180,20 @@ impl ChannelPool { } } +impl Reapable for ChannelPool { + /// Reaps channels that have been idle since before the cutoff. + fn reap_idle(&self, cutoff: Instant) { + self.channels.lock().unwrap().retain(|_, entry| { + let Some(idle_since) = entry.idle_since else { + assert_ne!(entry.clients, 0, "empty channel not marked idle"); + return true; + }; + assert_eq!(entry.clients, 0, "idle channel has clients"); + idle_since >= cutoff + }) + } +} + /// Tracks a channel acquired from the pool. The owned inner channel can be obtained with `take()`, /// since the gRPC client requires an owned `Channel`. pub struct ChannelGuard { @@ -167,10 +216,15 @@ impl Drop for ChannelGuard { let Some(pool) = self.pool.upgrade() else { return; // pool was dropped }; + let mut channels = pool.channels.lock().unwrap(); let entry = channels.get_mut(&self.id).expect("unknown channel"); + assert!(entry.idle_since.is_none(), "active channel marked idle"); assert!(entry.clients > 0, "channel underflow"); entry.clients -= 1; + if entry.clients == 0 { + entry.idle_since = Some(Instant::now()); // mark channel as idle + } } } @@ -179,8 +233,6 @@ impl Drop for ChannelGuard { /// number of concurrent clients to `max_clients` via semaphore. /// /// The pool is always wrapped in an outer `Arc`, to allow long-lived guards across tasks/threads. -/// -/// TODO: reap idle clients. pub struct ClientPool { /// Tenant ID. tenant_id: TenantId, @@ -201,6 +253,8 @@ pub struct ClientPool { /// lower-ordered channels. This allows us to free up and reap higher-numbered channels as idle /// clients are reaped. idle: Mutex>, + /// Reaps idle clients. + idle_reaper: Reaper, /// Unique client ID generator. next_client_id: AtomicUsize, } @@ -212,6 +266,9 @@ struct ClientEntry { client: page_api::Client, /// The channel guard for the channel used by the client. channel_guard: ChannelGuard, + /// The client has been idle since this time. All clients in `ClientPool::idle` are idle by + /// definition, so this is the time when it was added back to the pool. + idle_since: Instant, } impl ClientPool { @@ -226,16 +283,19 @@ impl ClientPool { auth_token: Option, max_clients: Option>, ) -> Arc { - Arc::new(Self { + let pool = Arc::new(Self { tenant_id, timeline_id, shard_id, auth_token, channel_pool, idle: Mutex::default(), + idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL), limiter: max_clients.map(|max| Arc::new(Semaphore::new(max.get()))), next_client_id: AtomicUsize::default(), - }) + }); + pool.idle_reaper.spawn(&pool); + pool } /// Gets a client from the pool, or creates a new one if necessary. Connections are established @@ -287,6 +347,16 @@ impl ClientPool { } } +impl Reapable for ClientPool { + /// Reaps clients that have been idle since before the cutoff. + fn reap_idle(&self, cutoff: Instant) { + self.idle + .lock() + .unwrap() + .retain(|_, entry| entry.idle_since >= cutoff) + } +} + /// A client acquired from the pool. The inner client can be accessed via Deref. The client is /// returned to the pool when dropped. pub struct ClientGuard { @@ -317,9 +387,11 @@ impl Drop for ClientGuard { let Some(pool) = self.pool.upgrade() else { return; // pool was dropped }; + let entry = ClientEntry { client: self.client.take().expect("dropped once"), channel_guard: self.channel_guard.take().expect("dropped once"), + idle_since: Instant::now(), }; pool.idle.lock().unwrap().insert(self.id, entry); @@ -334,7 +406,6 @@ impl Drop for ClientGuard { /// a single request and await the response. Internally, requests are multiplexed across streams and /// channels. This allows proper queue depth enforcement and response routing. /// -/// TODO: reap idle streams. /// TODO: consider making this generic over request and response types; not currently needed. pub struct StreamPool { /// The client pool to acquire clients from. Must be unbounded. @@ -344,7 +415,7 @@ pub struct StreamPool { /// Incoming requests will be sent over an existing stream with available capacity. If all /// streams are full, a new one is spun up and added to the pool (up to `max_streams`). Each /// stream has an associated Tokio task that processes requests and responses. - streams: Arc>>, + streams: Mutex>, /// The max number of concurrent streams, or None if unbounded. max_streams: Option>, /// The max number of concurrent requests per stream. @@ -352,6 +423,8 @@ pub struct StreamPool { /// Limits the max number of concurrent requests, given by `max_streams * max_queue_depth`. /// None if the pool is unbounded. limiter: Option>, + /// Reaps idle streams. + idle_reaper: Reaper, /// Stream ID generator. next_stream_id: AtomicUsize, } @@ -364,9 +437,11 @@ type ResponseSender = oneshot::Sender>; struct StreamEntry { /// Sends caller requests to the stream task. The stream task exits when this is dropped. sender: RequestSender, - /// Number of in-flight requests on this stream. This is an atomic to allow decrementing it on - /// completion without acquiring the `StreamPool::streams` lock. - queue_depth: Arc, + /// Number of in-flight requests on this stream. + queue_depth: usize, + /// The time when this stream went idle (queue_depth == 0). + /// INVARIANT: Some if queue_depth == 0, otherwise None. + idle_since: Option, } impl StreamPool { @@ -383,16 +458,19 @@ impl StreamPool { max_queue_depth: NonZero, ) -> Arc { assert!(client_pool.limiter.is_none(), "bounded client pool"); - Arc::new(Self { + let pool = Arc::new(Self { client_pool, - streams: Arc::default(), + streams: Mutex::default(), limiter: max_streams.map(|max_streams| { Arc::new(Semaphore::new(max_streams.get() * max_queue_depth.get())) }), max_streams, max_queue_depth, + idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL), next_stream_id: AtomicUsize::default(), - }) + }); + pool.idle_reaper.spawn(&pool); + pool } /// Acquires an available stream from the pool, or spins up a new stream async if all streams @@ -412,8 +490,8 @@ impl StreamPool { /// * Allow concurrent clients to join onto streams while they're spun up. /// * Allow spinning up multiple streams concurrently, but don't overshoot limits. /// - /// For now, we just do something simple and functional, but very inefficient (linear scan). - pub async fn get(&self) -> StreamGuard { + /// For now, we just do something simple but inefficient (linear scan under mutex). + pub async fn get(self: &Arc) -> StreamGuard { // Acquire a permit if the pool is bounded. let mut permit = None; if let Some(limiter) = self.limiter.clone() { @@ -422,23 +500,23 @@ impl StreamPool { let mut streams = self.streams.lock().unwrap(); // Look for a pooled stream with available capacity. - for entry in streams.values() { + for (&id, entry) in streams.iter_mut() { assert!( - entry.queue_depth.load(Ordering::Relaxed) <= self.max_queue_depth.get(), + entry.queue_depth <= self.max_queue_depth.get(), "stream queue overflow" ); - if entry - .queue_depth - .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |queue_depth| { - // Increment the queue depth via compare-and-swap. - // TODO: review ordering. - (queue_depth < self.max_queue_depth.get()).then_some(queue_depth + 1) - }) - .is_ok() - { + assert_eq!( + entry.idle_since.is_some(), + entry.queue_depth == 0, + "incorrect stream idle state" + ); + if entry.queue_depth < self.max_queue_depth.get() { + entry.queue_depth += 1; + entry.idle_since = None; return StreamGuard { + pool: Arc::downgrade(self), + id, sender: entry.sender.clone(), - queue_depth: entry.queue_depth.clone(), permit, }; } @@ -448,11 +526,11 @@ impl StreamPool { // return the guard, while spinning up the stream task async. This allows other callers to // join onto this stream and also create additional streams concurrently if this fills up. let id = self.next_stream_id.fetch_add(1, Ordering::Relaxed); - let queue_depth = Arc::new(AtomicUsize::new(1)); // reserve quota for this caller let (req_tx, req_rx) = mpsc::channel(self.max_queue_depth.get()); let entry = StreamEntry { sender: req_tx.clone(), - queue_depth: queue_depth.clone(), + queue_depth: 1, // reserve quota for this caller + idle_since: None, }; streams.insert(id, entry); @@ -461,20 +539,23 @@ impl StreamPool { }; let client_pool = self.client_pool.clone(); - let streams = self.streams.clone(); + let pool = Arc::downgrade(self); tokio::spawn(async move { if let Err(err) = Self::run_stream(client_pool, req_rx).await { error!("stream failed: {err}"); } - // Remove stream from pool on exit. - let entry = streams.lock().unwrap().remove(&id); - assert!(entry.is_some(), "unknown stream ID: {id}"); + // Remove stream from pool on exit. Weak reference to avoid holding the pool alive. + if let Some(pool) = pool.upgrade() { + let entry = pool.streams.lock().unwrap().remove(&id); + assert!(entry.is_some(), "unknown stream ID: {id}"); + } }); StreamGuard { + pool: Arc::downgrade(self), + id, sender: req_tx, - queue_depth, permit, } } @@ -552,11 +633,26 @@ impl StreamPool { } } +impl Reapable for StreamPool { + /// Reaps streams that have been idle since before the cutoff. + fn reap_idle(&self, cutoff: Instant) { + self.streams.lock().unwrap().retain(|_, entry| { + let Some(idle_since) = entry.idle_since else { + assert_ne!(entry.queue_depth, 0, "empty stream not marked idle"); + return true; + }; + assert_eq!(entry.queue_depth, 0, "idle stream has requests"); + idle_since >= cutoff + }); + } +} + /// A pooled stream reference. Can be used to send a single request, to properly enforce queue /// depth. Queue depth is already reserved and will be returned on drop. pub struct StreamGuard { + pool: Weak, + id: StreamID, sender: RequestSender, - queue_depth: Arc, permit: Option, // None if pool is unbounded } @@ -588,11 +684,78 @@ impl StreamGuard { impl Drop for StreamGuard { fn drop(&mut self) { + let Some(pool) = self.pool.upgrade() else { + return; // pool was dropped + }; + // Release the queue depth reservation on drop. This can prematurely decrement it if dropped // before the response is received, but that's okay. - let prev_queue_depth = self.queue_depth.fetch_sub(1, Ordering::SeqCst); - assert!(prev_queue_depth > 0, "stream queue underflow"); + let mut streams = pool.streams.lock().unwrap(); + let entry = streams.get_mut(&self.id).expect("unknown stream"); + assert!(entry.idle_since.is_none(), "active stream marked idle"); + assert!(entry.queue_depth > 0, "stream queue underflow"); + entry.queue_depth -= 1; + if entry.queue_depth == 0 { + entry.idle_since = Some(Instant::now()); // mark stream as idle + } _ = self.permit; // returned on drop, referenced for visibility } } + +/// Periodically reaps idle resources from a pool. +struct Reaper { + /// The task check interval. + interval: Duration, + /// The threshold for reaping idle resources. + threshold: Duration, + /// Cancels the reaper task. Cancelled when the reaper is dropped. + cancel: CancellationToken, +} + +impl Reaper { + /// Creates a new reaper. + pub fn new(threshold: Duration, interval: Duration) -> Self { + Self { + cancel: CancellationToken::new(), + threshold, + interval, + } + } + + /// Spawns a task to periodically reap idle resources from the given task pool. The task is + /// cancelled when the reaper is dropped. + pub fn spawn(&self, pool: &Arc) { + // NB: hold a weak pool reference, otherwise the task will prevent dropping the pool. + let pool = Arc::downgrade(pool); + let cancel = self.cancel.clone(); + let (interval, threshold) = (self.interval, self.threshold); + + tokio::spawn(async move { + loop { + tokio::select! { + _ = tokio::time::sleep(interval) => { + let Some(pool) = pool.upgrade() else { + return; // pool was dropped + }; + pool.reap_idle(Instant::now() - threshold); + } + + _ = cancel.cancelled() => return, + } + } + }); + } +} + +impl Drop for Reaper { + fn drop(&mut self) { + self.cancel.cancel(); // cancel reaper task + } +} + +/// A reapable resource pool. +trait Reapable: Send + Sync + 'static { + /// Reaps resources that have been idle since before the given cutoff. + fn reap_idle(&self, cutoff: Instant); +}