[proxy] replace the batch cancellation queue, shorten the TTL for cancel keys (#11943)

See #11942 

Idea: 
* if connections are short lived, they can get enqueued and then also
remove themselves later if they never made it to redis. This reduces the
load on the queue.
* short lived connections (<10m, most?) will only issue 1 command, we
remove the delete command and rely on ttl.
* we can enqueue as many commands as we want, as we can always cancel
the enqueue, thanks to the ~~intrusive linked lists~~ `BTreeMap`.
This commit is contained in:
Conrad Ludgate
2025-06-20 12:48:01 +01:00
committed by GitHub
parent 8b197de7ff
commit a298d2c29b
16 changed files with 410 additions and 427 deletions

View File

@@ -201,7 +201,7 @@ pub async fn run() -> anyhow::Result<()> {
auth_backend,
http_listener,
shutdown.clone(),
Arc::new(CancellationHandler::new(&config.connect_to_compute, None)),
Arc::new(CancellationHandler::new(&config.connect_to_compute)),
endpoint_rate_limiter,
);

View File

@@ -23,7 +23,8 @@ use utils::{project_build_tag, project_git_version};
use crate::auth::backend::jwt::JwkCache;
use crate::auth::backend::{ConsoleRedirectBackend, MaybeOwned};
use crate::cancellation::{CancellationHandler, handle_cancel_messages};
use crate::batch::BatchQueue;
use crate::cancellation::{CancellationHandler, CancellationProcessor};
use crate::config::{
self, AuthenticationConfig, CacheOptions, ComputeConfig, HttpConfig, ProjectInfoCacheOptions,
ProxyConfig, ProxyProtocolV2, remote_storage_from_toml,
@@ -392,13 +393,7 @@ pub async fn run() -> anyhow::Result<()> {
.as_ref()
.map(|redis_publisher| RedisKVClient::new(redis_publisher.clone(), redis_rps_limit));
// channel size should be higher than redis client limit to avoid blocking
let cancel_ch_size = args.cancellation_ch_size;
let (tx_cancel, rx_cancel) = tokio::sync::mpsc::channel(cancel_ch_size);
let cancellation_handler = Arc::new(CancellationHandler::new(
&config.connect_to_compute,
Some(tx_cancel),
));
let cancellation_handler = Arc::new(CancellationHandler::new(&config.connect_to_compute));
let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards(
RateBucketInfo::to_leaky_bucket(&args.endpoint_rps_limit)
@@ -530,21 +525,11 @@ pub async fn run() -> anyhow::Result<()> {
match redis_kv_client.try_connect().await {
Ok(()) => {
info!("Connected to Redis KV client");
maintenance_tasks.spawn(async move {
handle_cancel_messages(
&mut redis_kv_client,
rx_cancel,
args.cancellation_batch_size,
)
.await?;
cancellation_handler.init_tx(BatchQueue::new(CancellationProcessor {
client: redis_kv_client,
batch_size: args.cancellation_batch_size,
}));
drop(redis_kv_client);
// `handle_cancel_messages` was terminated due to the tx_cancel
// being dropped. this is not worthy of an error, and this task can only return `Err`,
// so let's wait forever instead.
std::future::pending().await
});
break;
}
Err(e) => {