From c18d3340b5e3c978a81c3db8b6f1e83cd9087e8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 24 Apr 2024 18:48:25 +0200 Subject: [PATCH] Ability to specify the upload_storage_class in S3 bucket configuration (#7461) Currently we move data to the intended storage class via lifecycle rules, but those are a daily batch job so data first spends up to a day in standard storage. Therefore, make it possible to specify the storage class used for uploads to S3 so that the data doesn't have to be migrated automatically. The advantage of this is that it gives cleaner billing reports. Part of https://github.com/neondatabase/cloud/issues/11348 --- libs/remote_storage/src/lib.rs | 15 +++++++++++++++ libs/remote_storage/src/s3_bucket.rs | 8 +++++++- libs/remote_storage/tests/test_real_s3.rs | 1 + pageserver/src/config.rs | 1 + proxy/src/context/parquet.rs | 1 + 5 files changed, 25 insertions(+), 1 deletion(-) diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 14c391ca53..32bc71c513 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -21,11 +21,13 @@ use std::{ fmt::Debug, num::{NonZeroU32, NonZeroUsize}, pin::Pin, + str::FromStr, sync::Arc, time::{Duration, SystemTime}, }; use anyhow::{bail, Context}; +use aws_sdk_s3::types::StorageClass; use camino::{Utf8Path, Utf8PathBuf}; use bytes::Bytes; @@ -563,6 +565,7 @@ pub struct S3Config { /// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details. pub concurrency_limit: NonZeroUsize, pub max_keys_per_list_response: Option, + pub upload_storage_class: Option, } impl Debug for S3Config { @@ -691,6 +694,18 @@ impl RemoteStorageConfig { endpoint, concurrency_limit, max_keys_per_list_response, + upload_storage_class: toml + .get("upload_storage_class") + .map(|prefix_in_bucket| -> anyhow::Result<_> { + let s = parse_toml_string("upload_storage_class", prefix_in_bucket)?; + let storage_class = StorageClass::from_str(&s).expect("infallible"); + #[allow(deprecated)] + if matches!(storage_class, StorageClass::Unknown(_)) { + bail!("Specified storage class unknown to SDK: '{s}'. Allowed values: {:?}", StorageClass::values()); + } + Ok(storage_class) + }) + .transpose()?, }) } (_, _, _, Some(_), None) => { diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 8091681221..c0b89cee2a 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -30,7 +30,7 @@ use aws_sdk_s3::{ config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep}, error::SdkError, operation::get_object::GetObjectError, - types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion}, + types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass}, Client, }; use aws_smithy_async::rt::sleep::TokioSleep; @@ -62,6 +62,7 @@ pub struct S3Bucket { bucket_name: String, prefix_in_bucket: Option, max_keys_per_list_response: Option, + upload_storage_class: Option, concurrency_limiter: ConcurrencyLimiter, // Per-request timeout. Accessible for tests. pub timeout: Duration, @@ -154,6 +155,7 @@ impl S3Bucket { max_keys_per_list_response: aws_config.max_keys_per_list_response, prefix_in_bucket, concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()), + upload_storage_class: aws_config.upload_storage_class.clone(), timeout, }) } @@ -582,6 +584,7 @@ impl RemoteStorage for S3Bucket { .bucket(self.bucket_name.clone()) .key(self.relative_path_to_s3_object(to)) .set_metadata(metadata.map(|m| m.0)) + .set_storage_class(self.upload_storage_class.clone()) .content_length(from_size_bytes.try_into()?) .body(bytes_stream) .send(); @@ -633,6 +636,7 @@ impl RemoteStorage for S3Bucket { .copy_object() .bucket(self.bucket_name.clone()) .key(self.relative_path_to_s3_object(to)) + .set_storage_class(self.upload_storage_class.clone()) .copy_source(copy_source) .send(); @@ -890,6 +894,7 @@ impl RemoteStorage for S3Bucket { .copy_object() .bucket(self.bucket_name.clone()) .key(key) + .set_storage_class(self.upload_storage_class.clone()) .copy_source(&source_id) .send(); @@ -1073,6 +1078,7 @@ mod tests { endpoint: None, concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response: Some(5), + upload_storage_class: None, }; let storage = S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init"); diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index 01f6a532d6..a273abe867 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -380,6 +380,7 @@ fn create_s3_client( endpoint: None, concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response, + upload_storage_class: None, }), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }; diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index e10db2b853..10d5a22797 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -1557,6 +1557,7 @@ broker_endpoint = '{broker_endpoint}' endpoint: Some(endpoint.clone()), concurrency_limit: s3_concurrency_limit, max_keys_per_list_response: None, + upload_storage_class: None, }), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, }, diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index e061216d15..9600321937 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -413,6 +413,7 @@ mod tests { ) .unwrap(), max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, + upload_storage_class: None, }), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, })