From 9c30883c4baf629e9adead0457322189e6833001 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 9 Nov 2023 13:50:13 +0000 Subject: [PATCH] remote_storage: use S3 SDK's adaptive retry policy (#5813) ## Problem Currently, we aren't doing any explicit slowdown in response to 429 responses. Recently, as we hit remote storage a bit harder (pageserver does more ListObjectsv2 requests than it used to since #5580 ), we're seeing storms of 429 responses that may be the result of not just doing too may requests, but continuing to do those extra requests without backing off any more than our usual backoff::exponential. ## Summary of changes Switch from AWS's "Standard" retry policy to "Adaptive" -- docs describe this as experimental but it has been around for a long time. The main difference between Standard and Adaptive is that Adaptive rate-limits the client in response to feedback from the server, which is meant to avoid scenarios where the client would otherwise repeatedly hit throttling responses. --- Cargo.lock | 1 + Cargo.toml | 1 + libs/remote_storage/Cargo.toml | 1 + libs/remote_storage/src/s3_bucket.rs | 27 ++++++++++++++++++++++----- 4 files changed, 25 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5e7fce3e8d..0a434c6ee6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4064,6 +4064,7 @@ dependencies = [ "aws-config", "aws-credential-types", "aws-sdk-s3", + "aws-smithy-async", "aws-smithy-http", "aws-types", "azure_core", diff --git a/Cargo.toml b/Cargo.toml index 6be831a2b3..5b719d776b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,6 +48,7 @@ async-trait = "0.1" aws-config = { version = "0.56", default-features = false, features=["rustls"] } aws-sdk-s3 = "0.29" aws-smithy-http = "0.56" +aws-smithy-async = { version = "0.56", default-features = false, features=["rt-tokio"] } aws-credential-types = "0.56" aws-types = "0.56" axum = { version = "0.6.20", features = ["ws"] } diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 65b5392389..d7bcce28cb 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -8,6 +8,7 @@ license.workspace = true anyhow.workspace = true async-trait.workspace = true once_cell.workspace = true +aws-smithy-async.workspace = true aws-smithy-http.workspace = true aws-types.workspace = true aws-config.workspace = true diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 560a2c14e9..ab3fd3fe62 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -4,23 +4,27 @@ //! allowing multiple api users to independently work with the same S3 bucket, if //! their bucket prefixes are both specified and different. -use std::borrow::Cow; +use std::{borrow::Cow, sync::Arc}; use anyhow::Context; use aws_config::{ environment::credentials::EnvironmentVariableCredentialsProvider, - imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain, - provider_config::ProviderConfig, web_identity_token::WebIdentityTokenCredentialsProvider, + imds::credentials::ImdsCredentialsProvider, + meta::credentials::CredentialsProviderChain, + provider_config::ProviderConfig, + retry::{RetryConfigBuilder, RetryMode}, + web_identity_token::WebIdentityTokenCredentialsProvider, }; use aws_credential_types::cache::CredentialsCache; use aws_sdk_s3::{ - config::{Config, Region}, + config::{AsyncSleep, Config, Region, SharedAsyncSleep}, error::SdkError, operation::get_object::GetObjectError, primitives::ByteStream, types::{Delete, ObjectIdentifier}, Client, }; +use aws_smithy_async::rt::sleep::TokioSleep; use aws_smithy_http::body::SdkBody; use hyper::Body; use scopeguard::ScopeGuard; @@ -83,10 +87,23 @@ impl S3Bucket { .or_else("imds", ImdsCredentialsProvider::builder().build()) }; + // AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off + let sleep_impl: Arc = Arc::new(TokioSleep::new()); + + // We do our own retries (see [`backoff::retry`]). However, for the AWS SDK to enable rate limiting in response to throttling + // responses (e.g. 429 on too many ListObjectsv2 requests), we must provide a retry config. We set it to use at most one + // attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled. + let mut retry_config = RetryConfigBuilder::new(); + retry_config + .set_max_attempts(Some(1)) + .set_mode(Some(RetryMode::Adaptive)); + let mut config_builder = Config::builder() .region(region) .credentials_cache(CredentialsCache::lazy()) - .credentials_provider(credentials_provider); + .credentials_provider(credentials_provider) + .sleep_impl(SharedAsyncSleep::from(sleep_impl)) + .retry_config(retry_config.build()); if let Some(custom_endpoint) = aws_config.endpoint.clone() { config_builder = config_builder