From 80dcdfa8bfece71789f8523a4ed97bcf94d8306b Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Wed, 11 Oct 2023 11:47:19 +0300 Subject: [PATCH 01/18] Update pgvector to 0.5.1 (#5525) --- Dockerfile.compute-node | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 120520208c..36c3f874d4 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -224,8 +224,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz - FROM build-deps AS vector-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.0.tar.gz -O pgvector.tar.gz && \ - echo "d8aa3504b215467ca528525a6de12c3f85f9891b091ce0e5864dd8a9b757f77b pgvector.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \ + echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ From 653044f7547bb61f98b2af53b551a3ed678c45ed Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 11 Oct 2023 10:49:15 +0100 Subject: [PATCH 02/18] test_runners: increase some timeouts to make tests less flaky (#5521) ## Problem - `test_heavy_write_workload` is flaky, and fails because of to statement timeout - `test_wal_lagging` is flaky and fails because of the default pytest timeout (see https://github.com/neondatabase/neon/issues/5305) ## Summary of changes - `test_heavy_write_workload`: increase statement timeout to 5 minutes (from default 2 minutes) - `test_wal_lagging`: increase pytest timeout to 600s (from default 300s) --- test_runner/performance/test_wal_backpressure.py | 2 +- test_runner/regress/test_wal_acceptor_async.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py index 3939ca30b6..3cb4b667ff 100644 --- a/test_runner/performance/test_wal_backpressure.py +++ b/test_runner/performance/test_wal_backpressure.py @@ -65,7 +65,7 @@ def start_heavy_write_workload(env: PgCompare, n_tables: int, scale: int, num_it def start_single_table_workload(table_id: int): for _ in range(num_iters): - with env.pg.connect().cursor() as cur: + with env.pg.connect(options="-cstatement_timeout=300s").cursor() as cur: cur.execute( f"INSERT INTO t{table_id} SELECT FROM generate_series(1,{new_rows_each_update})" ) diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index cfc131a3aa..81a2411271 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -6,6 +6,7 @@ from pathlib import Path from typing import List, Optional import asyncpg +import pytest import toml from fixtures.log_helper import getLogger from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, Safekeeper @@ -597,7 +598,10 @@ async def run_wal_lagging(env: NeonEnv, endpoint: Endpoint, test_output_dir: Pat assert res == expected_sum -# do inserts while restarting postgres and messing with safekeeper addresses +# Do inserts while restarting postgres and messing with safekeeper addresses. +# The test takes more than default 5 minutes on Postgres 16, +# see https://github.com/neondatabase/neon/issues/5305 +@pytest.mark.timeout(600) def test_wal_lagging(neon_env_builder: NeonEnvBuilder, test_output_dir: Path): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() From 39e144696f40ac237e393e5e3978c5206d9d8571 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 11 Oct 2023 12:50:16 +0100 Subject: [PATCH 03/18] pageserver: clean up `mgr.rs` types that needn't be public (#5529) ## Problem These types/functions are public and it prevents clippy from catching unused things. ## Summary of changes Move to `pub(crate)` and remove the error enum that becomes clearly unused as a result. --- pageserver/src/http/routes.rs | 3 --- pageserver/src/tenant/delete.rs | 4 ++-- pageserver/src/tenant/mgr.rs | 40 ++++++++++++++++----------------- 3 files changed, 21 insertions(+), 26 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index c42bd956f3..9c07340a6f 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -164,9 +164,6 @@ impl From for ApiError { fn from(tse: TenantStateError) -> ApiError { match tse { TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()), - TenantStateError::NotActive(_) => { - ApiError::ResourceUnavailable("Tenant not yet active".into()) - } TenantStateError::IsStopping(_) => { ApiError::ResourceUnavailable("Tenant is stopping".into()) } diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index d8763b0a64..3cb58aec6b 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -31,7 +31,7 @@ use super::{ const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u32 = 3; #[derive(Debug, thiserror::Error)] -pub enum DeleteTenantError { +pub(crate) enum DeleteTenantError { #[error("GetTenant {0}")] Get(#[from] GetTenantError), @@ -376,7 +376,7 @@ impl DeleteTenantFlow { Ok(()) } - pub async fn should_resume_deletion( + pub(crate) async fn should_resume_deletion( conf: &'static PageServerConf, remote_storage: Option<&GenericRemoteStorage>, tenant: &Tenant, diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 35b3be6d61..83ba49e1f5 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -50,7 +50,7 @@ use super::TenantSharedResources; /// its lifetime, and we can preserve some important safety invariants like `Tenant` always /// having a properly acquired generation (Secondary doesn't need a generation) #[derive(Clone)] -pub enum TenantSlot { +pub(crate) enum TenantSlot { Attached(Arc), Secondary, } @@ -481,7 +481,7 @@ pub(crate) fn schedule_local_tenant_processing( /// management API. For example, it could attach the tenant on a different pageserver. /// We would then be in split-brain once this pageserver restarts. #[instrument(skip_all)] -pub async fn shutdown_all_tenants() { +pub(crate) async fn shutdown_all_tenants() { shutdown_all_tenants0(&TENANTS).await } @@ -593,7 +593,7 @@ async fn shutdown_all_tenants0(tenants: &tokio::sync::RwLock) { // caller will log how long we took } -pub async fn create_tenant( +pub(crate) async fn create_tenant( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, tenant_id: TenantId, @@ -628,14 +628,14 @@ pub async fn create_tenant( } #[derive(Debug, thiserror::Error)] -pub enum SetNewTenantConfigError { +pub(crate) enum SetNewTenantConfigError { #[error(transparent)] GetTenant(#[from] GetTenantError), #[error(transparent)] Persist(anyhow::Error), } -pub async fn set_new_tenant_config( +pub(crate) async fn set_new_tenant_config( conf: &'static PageServerConf, new_tenant_conf: TenantConfOpt, tenant_id: TenantId, @@ -776,7 +776,7 @@ pub(crate) async fn upsert_location( } #[derive(Debug, thiserror::Error)] -pub enum GetTenantError { +pub(crate) enum GetTenantError { #[error("Tenant {0} not found")] NotFound(TenantId), #[error("Tenant {0} is not active")] @@ -792,7 +792,7 @@ pub enum GetTenantError { /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants. /// /// This method is cancel-safe. -pub async fn get_tenant( +pub(crate) async fn get_tenant( tenant_id: TenantId, active_only: bool, ) -> Result, GetTenantError> { @@ -817,7 +817,7 @@ pub async fn get_tenant( } } -pub async fn delete_tenant( +pub(crate) async fn delete_tenant( conf: &'static PageServerConf, remote_storage: Option, tenant_id: TenantId, @@ -826,7 +826,7 @@ pub async fn delete_tenant( } #[derive(Debug, thiserror::Error)] -pub enum DeleteTimelineError { +pub(crate) enum DeleteTimelineError { #[error("Tenant {0}")] Tenant(#[from] GetTenantError), @@ -834,7 +834,7 @@ pub enum DeleteTimelineError { Timeline(#[from] crate::tenant::DeleteTimelineError), } -pub async fn delete_timeline( +pub(crate) async fn delete_timeline( tenant_id: TenantId, timeline_id: TimelineId, _ctx: &RequestContext, @@ -845,18 +845,16 @@ pub async fn delete_timeline( } #[derive(Debug, thiserror::Error)] -pub enum TenantStateError { +pub(crate) enum TenantStateError { #[error("Tenant {0} not found")] NotFound(TenantId), #[error("Tenant {0} is stopping")] IsStopping(TenantId), - #[error("Tenant {0} is not active")] - NotActive(TenantId), #[error(transparent)] Other(#[from] anyhow::Error), } -pub async fn detach_tenant( +pub(crate) async fn detach_tenant( conf: &'static PageServerConf, tenant_id: TenantId, detach_ignored: bool, @@ -926,7 +924,7 @@ async fn detach_tenant0( removal_result } -pub async fn load_tenant( +pub(crate) async fn load_tenant( conf: &'static PageServerConf, tenant_id: TenantId, generation: Generation, @@ -963,7 +961,7 @@ pub async fn load_tenant( Ok(()) } -pub async fn ignore_tenant( +pub(crate) async fn ignore_tenant( conf: &'static PageServerConf, tenant_id: TenantId, ) -> Result<(), TenantStateError> { @@ -991,7 +989,7 @@ async fn ignore_tenant0( } #[derive(Debug, thiserror::Error)] -pub enum TenantMapListError { +pub(crate) enum TenantMapListError { #[error("tenant map is still initiailizing")] Initializing, } @@ -999,7 +997,7 @@ pub enum TenantMapListError { /// /// Get list of tenants, for the mgmt API /// -pub async fn list_tenants() -> Result, TenantMapListError> { +pub(crate) async fn list_tenants() -> Result, TenantMapListError> { let tenants = TENANTS.read().await; let m = match &*tenants { TenantsMap::Initializing => return Err(TenantMapListError::Initializing), @@ -1017,7 +1015,7 @@ pub async fn list_tenants() -> Result, TenantMapLis /// /// Downloading all the tenant data is performed in the background, this merely /// spawns the background task and returns quickly. -pub async fn attach_tenant( +pub(crate) async fn attach_tenant( conf: &'static PageServerConf, tenant_id: TenantId, generation: Generation, @@ -1054,7 +1052,7 @@ pub async fn attach_tenant( } #[derive(Debug, thiserror::Error)] -pub enum TenantMapInsertError { +pub(crate) enum TenantMapInsertError { #[error("tenant map is still initializing")] StillInitializing, #[error("tenant map is shutting down")] @@ -1217,7 +1215,7 @@ use { utils::http::error::ApiError, }; -pub async fn immediate_gc( +pub(crate) async fn immediate_gc( tenant_id: TenantId, timeline_id: TimelineId, gc_req: TimelineGcRequest, From e0c8ad48d4e7aee5a6dc71a12975ad8aacbfa7d7 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 11 Oct 2023 13:22:00 +0100 Subject: [PATCH 04/18] remote_storage: log detail errors in delete_objects (#5530) ## Problem When we got an error in the payload of a DeleteObjects response, we only logged how many errors, not what they were. ## Summary of changes Log up to 10 specific errors. We do not log all of them because that would be up to 1000 log lines per request. --- libs/remote_storage/src/s3_bucket.rs | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index db7eef91e2..c63f24cf85 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -4,7 +4,7 @@ //! allowing multiple api users to independently work with the same S3 bucket, if //! their bucket prefixes are both specified and different. -use std::sync::Arc; +use std::{borrow::Cow, sync::Arc}; use anyhow::Context; use aws_config::{ @@ -556,6 +556,20 @@ impl RemoteStorage for S3Bucket { .deleted_objects_total .inc_by(chunk.len() as u64); if let Some(errors) = resp.errors { + // Log a bounded number of the errors within the response: + // these requests can carry 1000 keys so logging each one + // would be too verbose, especially as errors may lead us + // to retry repeatedly. + const LOG_UP_TO_N_ERRORS: usize = 10; + for e in errors.iter().take(LOG_UP_TO_N_ERRORS) { + tracing::warn!( + "DeleteObjects key {} failed: {}: {}", + e.key.as_ref().map(Cow::from).unwrap_or("".into()), + e.code.as_ref().map(Cow::from).unwrap_or("".into()), + e.message.as_ref().map(Cow::from).unwrap_or("".into()) + ); + } + return Err(anyhow::format_err!( "Failed to delete {} objects", errors.len() From 0fc3708de248afd3a1e43798df7c4c551a6a0b45 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 11 Oct 2023 15:25:08 +0100 Subject: [PATCH 05/18] pageserver: use a backoff::retry in Deleter (#5534) ## Problem The `Deleter` currently doesn't use a backoff::retry because it doesn't need to: it is already inside a loop when doing the deletion, so can just let the loop go around. However, this is a problem for logging, because we log on errors, which includes things like 503/429 cases that would usually be swallowed by a backoff::retry in most places we use the RemoteStorage interface. The underlying problem is that RemoteStorage doesn't have a proper error type, and an anyhow::Error can't easily be interrogated for its original S3 SdkError because downcast_ref requires a concrete type, but SdkError is parametrized on response type. ## Summary of changes Wrap remote deletions in Deleter in a backoff::retry to avoid logging warnings on transient 429/503 conditions, and for symmetry with how RemoteStorage is used in other places. --- pageserver/src/deletion_queue/deleter.rs | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/pageserver/src/deletion_queue/deleter.rs b/pageserver/src/deletion_queue/deleter.rs index 5c6e7dc9d7..81cb016d49 100644 --- a/pageserver/src/deletion_queue/deleter.rs +++ b/pageserver/src/deletion_queue/deleter.rs @@ -13,6 +13,7 @@ use std::time::Duration; use tokio_util::sync::CancellationToken; use tracing::info; use tracing::warn; +use utils::backoff; use crate::metrics; @@ -63,7 +64,19 @@ impl Deleter { Err(anyhow::anyhow!("failpoint hit")) }); - self.remote_storage.delete_objects(&self.accumulator).await + // A backoff::retry is used here for two reasons: + // - To provide a backoff rather than busy-polling the API on errors + // - To absorb transient 429/503 conditions without hitting our error + // logging path for issues deleting objects. + backoff::retry( + || async { self.remote_storage.delete_objects(&self.accumulator).await }, + |_| false, + 3, + 10, + "executing deletion batch", + backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Shutting down")), + ) + .await } /// Block until everything in accumulator has been executed @@ -88,7 +101,10 @@ impl Deleter { self.accumulator.clear(); } Err(e) => { - warn!("DeleteObjects request failed: {e:#}, will retry"); + if self.cancel.is_cancelled() { + return Err(DeletionQueueError::ShuttingDown); + } + warn!("DeleteObjects request failed: {e:#}, will continue trying"); metrics::DELETION_QUEUE .remote_errors .with_label_values(&["execute"]) From ddceb9e6cd01574b4bea539ecd891d808e8650fa Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 11 Oct 2023 18:24:36 +0300 Subject: [PATCH 06/18] fix(branching): read last record lsn only after Tenant::gc_cs (#5535) Fixes #5531, at least the latest error of not being able to create a branch from the head under write and gc pressure. --- pageserver/src/tenant.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 264f8a1ee0..b79169fd8e 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -209,7 +209,7 @@ pub struct Tenant { /// The remote storage generation, used to protect S3 objects from split-brain. /// Does not change over the lifetime of the [`Tenant`] object. - /// + /// /// This duplicates the generation stored in LocationConf, but that structure is mutable: /// this copy enforces the invariant that generatio doesn't change during a Tenant's lifetime. generation: Generation, @@ -2755,6 +2755,11 @@ impl Tenant { ) -> Result, CreateTimelineError> { let src_id = src_timeline.timeline_id; + // First acquire the GC lock so that another task cannot advance the GC + // cutoff in 'gc_info', and make 'start_lsn' invalid, while we are + // creating the branch. + let _gc_cs = self.gc_cs.lock().await; + // If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN let start_lsn = start_lsn.unwrap_or_else(|| { let lsn = src_timeline.get_last_record_lsn(); @@ -2762,11 +2767,6 @@ impl Tenant { lsn }); - // First acquire the GC lock so that another task cannot advance the GC - // cutoff in 'gc_info', and make 'start_lsn' invalid, while we are - // creating the branch. - let _gc_cs = self.gc_cs.lock().await; - // Create a placeholder for the new branch. This will error // out if the new timeline ID is already in use. let timeline_uninit_mark = { From dbb21d6592882f5cc1616f566ea7dc8cba35c238 Mon Sep 17 00:00:00 2001 From: khanova <32508607+khanova@users.noreply.github.com> Date: Thu, 12 Oct 2023 11:41:07 +0200 Subject: [PATCH 07/18] Make http timeout configurable (#5532) ## Problem Currently http timeout is hardcoded to 15 seconds. ## Summary of changes Added an option to configure it via cli args. Context: https://neondb.slack.com/archives/C04DGM6SMTM/p1696941726151899 --- proxy/src/bin/proxy.rs | 9 ++++++++- proxy/src/config.rs | 5 +++++ proxy/src/http/sql_over_http.rs | 7 ++++--- proxy/src/http/websocket.rs | 9 ++++++++- 4 files changed, 25 insertions(+), 5 deletions(-) diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 6b46eaddfa..6bccdf6e5a 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -1,5 +1,6 @@ use futures::future::Either; use proxy::auth; +use proxy::config::HttpConfig; use proxy::console; use proxy::http; use proxy::metrics; @@ -79,6 +80,9 @@ struct ProxyCliArgs { /// Allow self-signed certificates for compute nodes (for testing) #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] allow_self_signed_compute: bool, + /// timeout for http connections + #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] + sql_over_http_timeout: tokio::time::Duration, } #[tokio::main] @@ -220,12 +224,15 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { auth::BackendType::Link(Cow::Owned(url)) } }; - + let http_config = HttpConfig { + sql_over_http_timeout: args.sql_over_http_timeout, + }; let config = Box::leak(Box::new(ProxyConfig { tls_config, auth_backend, metric_collection, allow_self_signed_compute: args.allow_self_signed_compute, + http_config, })); Ok(config) diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 989027f03f..b37c1736bd 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -13,6 +13,7 @@ pub struct ProxyConfig { pub auth_backend: auth::BackendType<'static, ()>, pub metric_collection: Option, pub allow_self_signed_compute: bool, + pub http_config: HttpConfig, } #[derive(Debug)] @@ -26,6 +27,10 @@ pub struct TlsConfig { pub common_names: Option>, } +pub struct HttpConfig { + pub sql_over_http_timeout: tokio::time::Duration, +} + impl TlsConfig { pub fn to_server_config(&self) -> Arc { self.config.clone() diff --git a/proxy/src/http/sql_over_http.rs b/proxy/src/http/sql_over_http.rs index fbf19bcb50..63d7d807e9 100644 --- a/proxy/src/http/sql_over_http.rs +++ b/proxy/src/http/sql_over_http.rs @@ -24,6 +24,7 @@ use url::Url; use utils::http::error::ApiError; use utils::http::json::json_response; +use crate::config::HttpConfig; use crate::proxy::{NUM_CONNECTIONS_ACCEPTED_COUNTER, NUM_CONNECTIONS_CLOSED_COUNTER}; use super::conn_pool::ConnInfo; @@ -49,7 +50,6 @@ enum Payload { const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MiB const MAX_REQUEST_SIZE: u64 = 10 * 1024 * 1024; // 10 MiB -const HTTP_CONNECTION_TIMEOUT: tokio::time::Duration = tokio::time::Duration::from_secs(15); static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output"); static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode"); @@ -191,9 +191,10 @@ pub async fn handle( sni_hostname: Option, conn_pool: Arc, session_id: uuid::Uuid, + config: &'static HttpConfig, ) -> Result, ApiError> { let result = tokio::time::timeout( - HTTP_CONNECTION_TIMEOUT, + config.sql_over_http_timeout, handle_inner(request, sni_hostname, conn_pool, session_id), ) .await; @@ -224,7 +225,7 @@ pub async fn handle( Err(_) => { let message = format!( "HTTP-Connection timed out, execution time exeeded {} seconds", - HTTP_CONNECTION_TIMEOUT.as_secs() + config.sql_over_http_timeout.as_secs() ); error!(message); json_response( diff --git a/proxy/src/http/websocket.rs b/proxy/src/http/websocket.rs index ddebbccabc..656635b299 100644 --- a/proxy/src/http/websocket.rs +++ b/proxy/src/http/websocket.rs @@ -205,7 +205,14 @@ async fn ws_handler( // TODO: that deserves a refactor as now this function also handles http json client besides websockets. // Right now I don't want to blow up sql-over-http patch with file renames and do that as a follow up instead. } else if request.uri().path() == "/sql" && request.method() == Method::POST { - sql_over_http::handle(request, sni_hostname, conn_pool, session_id).await + sql_over_http::handle( + request, + sni_hostname, + conn_pool, + session_id, + &config.http_config, + ) + .await } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS { Response::builder() .header("Allow", "OPTIONS, POST") From 21deb81acbb0516e2f8d3934f40f951b8c7d3345 Mon Sep 17 00:00:00 2001 From: khanova <32508607+khanova@users.noreply.github.com> Date: Thu, 12 Oct 2023 14:32:49 +0200 Subject: [PATCH 08/18] Fix case for array of jsons (#5523) ## Problem Currently proxy doesn't handle array of json parameters correctly. ## Summary of changes Added one more level of quotes escaping for the array of jsons case. Resolves: https://github.com/neondatabase/neon/issues/5515 --- proxy/src/http/sql_over_http.rs | 31 +++++++++++++++++++++++++++++-- test_runner/regress/test_proxy.py | 8 +++++++- 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/proxy/src/http/sql_over_http.rs b/proxy/src/http/sql_over_http.rs index 63d7d807e9..02ce3aa9dc 100644 --- a/proxy/src/http/sql_over_http.rs +++ b/proxy/src/http/sql_over_http.rs @@ -102,9 +102,9 @@ fn json_array_to_pg_array(value: &Value) -> Result, serde_json::E // convert to text with escaping Value::Bool(_) => serde_json::to_string(value).map(Some), Value::Number(_) => serde_json::to_string(value).map(Some), - Value::Object(_) => serde_json::to_string(value).map(Some), // here string needs to be escaped, as it is part of the array + Value::Object(_) => json_array_to_pg_array(&Value::String(serde_json::to_string(value)?)), Value::String(_) => serde_json::to_string(value).map(Some), // recurse into array @@ -613,7 +613,7 @@ fn _pg_array_parse( } } } - '}' => { + '}' if !quote => { level -= 1; if level == 0 { push_checked(&mut entry, &mut entries, elem_type)?; @@ -697,6 +697,14 @@ mod tests { "{{true,false},{NULL,42},{\"foo\",\"bar\\\"-\\\\\"}}".to_owned() )] ); + // array of objects + let json = r#"[{"foo": 1},{"bar": 2}]"#; + let json: Value = serde_json::from_str(json).unwrap(); + let pg_params = json_to_pg_text(vec![json]).unwrap(); + assert_eq!( + pg_params, + vec![Some(r#"{"{\"foo\":1}","{\"bar\":2}"}"#.to_owned())] + ); } #[test] @@ -824,4 +832,23 @@ mod tests { json!([[[1, 2, 3], [4, 5, 6]]]) ); } + #[test] + fn test_pg_array_parse_json() { + fn pt(pg_arr: &str) -> Value { + pg_array_parse(pg_arr, &Type::JSONB).unwrap() + } + assert_eq!(pt(r#"{"{}"}"#), json!([{}])); + assert_eq!( + pt(r#"{"{\"foo\": 1, \"bar\": 2}"}"#), + json!([{"foo": 1, "bar": 2}]) + ); + assert_eq!( + pt(r#"{"{\"foo\": 1}", "{\"bar\": 2}"}"#), + json!([{"foo": 1}, {"bar": 2}]) + ); + assert_eq!( + pt(r#"{{"{\"foo\": 1}", "{\"bar\": 2}"}}"#), + json!([[{"foo": 1}, {"bar": 2}]]) + ); + } } diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index c542ab05ae..f57b15c9b9 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -188,7 +188,7 @@ def test_sql_over_http(static_proxy: NeonProxy): headers={"Content-Type": "application/sql", "Neon-Connection-String": connstr}, verify=str(static_proxy.test_output_dir / "proxy.crt"), ) - assert response.status_code == 200 + assert response.status_code == 200, response.text return response.json() rows = q("select 42 as answer")["rows"] @@ -206,6 +206,12 @@ def test_sql_over_http(static_proxy: NeonProxy): rows = q("select $1::json->'a' as answer", [{"a": {"b": 42}}])["rows"] assert rows == [{"answer": {"b": 42}}] + rows = q("select $1::jsonb[] as answer", [[{}]])["rows"] + assert rows == [{"answer": [{}]}] + + rows = q("select $1::jsonb[] as answer", [[{"foo": 1}, {"bar": 2}]])["rows"] + assert rows == [{"answer": [{"foo": 1}, {"bar": 2}]}] + rows = q("select * from pg_class limit 1")["rows"] assert len(rows) == 1 From dd6990567fadc34f12afab8d617fda9429736a3f Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 13 Oct 2023 16:08:23 +0200 Subject: [PATCH 09/18] walredo: apply_batch_postgres: get a backtrace whenever it encounters an error (#5541) For 2 weeks we've seen rare, spurious, not-reproducible page reconstruction failures with PG16 in prod. One of the commits we deployed this week was Commit commit fc467941f965f584450eb50ca410c8e944651f2f Author: Joonas Koivunen Date: Wed Oct 4 16:19:19 2023 +0300 walredo: log retryed error (#546) With the logs from that commit, we learned that some read() or write() system call that walredo does fails with `EAGAIN`, aka `Resource temporarily unavailable (os error 11)`. But we have no idea where exactly in the code we get back that error. So, use anyhow instead of fake std::io::Error's as an easy way to get a backtrace when the error happens, and change the logging to print that backtrace (i.e., use `{:?}` instead of `utils::error::report_compact_sources(e)`). The `WalRedoError` type had to go because we add additional `.context()` further up the call chain before we `{:?}`-print it. That additional `.context()` further up doesn't see that there's already an anyhow::Error inside the `WalRedoError::ApplyWalRecords` variant, and hence captures another backtrace and prints that one on `{:?}`-print instead of the original one inside `WalRedoError::ApplyWalRecords`. If we ever switch back to `report_compact_sources`, we should make sure we have some other way to uniquely identify the places where we return an error in the error message. --- pageserver/benches/bench_walredo.rs | 11 ++-- pageserver/src/http/routes.rs | 4 +- pageserver/src/tenant.rs | 9 +-- pageserver/src/tenant/timeline.rs | 2 +- pageserver/src/walredo.rs | 95 ++++++++++------------------- 5 files changed, 40 insertions(+), 81 deletions(-) diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index 6cda86fafa..9bcd3fa708 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -11,10 +11,7 @@ use std::sync::{Arc, Barrier}; use bytes::{Buf, Bytes}; use pageserver::{ - config::PageServerConf, - repository::Key, - walrecord::NeonWalRecord, - walredo::{PostgresRedoManager, WalRedoError}, + config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager, }; use utils::{id::TenantId, lsn::Lsn}; @@ -152,7 +149,7 @@ impl Drop for JoinOnDrop { } } -fn execute_all(input: I, manager: &PostgresRedoManager) -> Result<(), WalRedoError> +fn execute_all(input: I, manager: &PostgresRedoManager) -> anyhow::Result<()> where I: IntoIterator, { @@ -160,7 +157,7 @@ where input.into_iter().try_for_each(|req| { let page = req.execute(manager)?; assert_eq!(page.remaining(), 8192); - Ok::<_, WalRedoError>(()) + anyhow::Ok(()) }) } @@ -473,7 +470,7 @@ struct Request { } impl Request { - fn execute(self, manager: &PostgresRedoManager) -> Result { + fn execute(self, manager: &PostgresRedoManager) -> anyhow::Result { use pageserver::walredo::WalRedoManager; let Request { diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 9c07340a6f..38304eb16b 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -136,9 +136,7 @@ impl From for ApiError { PageReconstructError::AncestorStopping(_) => { ApiError::ResourceUnavailable(format!("{pre}").into()) } - PageReconstructError::WalRedo(pre) => { - ApiError::InternalServerError(anyhow::Error::new(pre)) - } + PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre), } } } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index b79169fd8e..8f7710ffcf 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3448,11 +3448,8 @@ pub mod harness { use crate::deletion_queue::mock::MockDeletionQueue; use crate::{ - config::PageServerConf, - repository::Key, - tenant::Tenant, - walrecord::NeonWalRecord, - walredo::{WalRedoError, WalRedoManager}, + config::PageServerConf, repository::Key, tenant::Tenant, walrecord::NeonWalRecord, + walredo::WalRedoManager, }; use super::*; @@ -3625,7 +3622,7 @@ pub mod harness { base_img: Option<(Lsn, Bytes)>, records: Vec<(Lsn, NeonWalRecord)>, _pg_version: u32, - ) -> Result { + ) -> anyhow::Result { let s = format!( "redo for {} to get to {}, with {} and {} records", key, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 4dddb7b2fd..85b33b98b9 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -370,7 +370,7 @@ pub enum PageReconstructError { /// An error happened replaying WAL records #[error(transparent)] - WalRedo(#[from] crate::walredo::WalRedoError), + WalRedo(anyhow::Error), } impl std::fmt::Debug for PageReconstructError { diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 23433fa694..fc569c474a 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -18,6 +18,7 @@ //! any WAL records, so that even if an attacker hijacks the Postgres //! process, he cannot escape out of it. //! +use anyhow::Context; use byteorder::{ByteOrder, LittleEndian}; use bytes::{BufMut, Bytes, BytesMut}; use nix::poll::*; @@ -89,7 +90,7 @@ pub trait WalRedoManager: Send + Sync { base_img: Option<(Lsn, Bytes)>, records: Vec<(Lsn, NeonWalRecord)>, pg_version: u32, - ) -> Result; + ) -> anyhow::Result; } struct ProcessInput { @@ -140,20 +141,6 @@ fn can_apply_in_neon(rec: &NeonWalRecord) -> bool { } } -/// An error happened in WAL redo -#[derive(Debug, thiserror::Error)] -pub enum WalRedoError { - #[error(transparent)] - IoError(#[from] std::io::Error), - - #[error("cannot perform WAL redo now")] - InvalidState, - #[error("cannot perform WAL redo for this request")] - InvalidRequest, - #[error("cannot perform WAL redo for this record")] - InvalidRecord, -} - /// /// Public interface of WAL redo manager /// @@ -171,10 +158,9 @@ impl WalRedoManager for PostgresRedoManager { base_img: Option<(Lsn, Bytes)>, records: Vec<(Lsn, NeonWalRecord)>, pg_version: u32, - ) -> Result { + ) -> anyhow::Result { if records.is_empty() { - error!("invalid WAL redo request with no records"); - return Err(WalRedoError::InvalidRequest); + anyhow::bail!("invalid WAL redo request with no records"); } let base_img_lsn = base_img.as_ref().map(|p| p.0).unwrap_or(Lsn::INVALID); @@ -260,8 +246,8 @@ impl PostgresRedoManager { records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, pg_version: u32, - ) -> Result { - let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?; + ) -> anyhow::Result { + let (rel, blknum) = key_to_rel_block(key).context("invalid record")?; const MAX_RETRY_ATTEMPTS: u32 = 1; let start_time = Instant::now(); let mut n_attempts = 0u32; @@ -271,7 +257,8 @@ impl PostgresRedoManager { // launch the WAL redo process on first use if proc.is_none() { - self.launch(&mut proc, pg_version)?; + self.launch(&mut proc, pg_version) + .context("launch process")?; } WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64()); @@ -279,7 +266,7 @@ impl PostgresRedoManager { let buf_tag = BufferTag { rel, blknum }; let result = self .apply_wal_records(proc, buf_tag, &base_img, records, wal_redo_timeout) - .map_err(WalRedoError::IoError); + .context("apply_wal_records"); let end_time = Instant::now(); let duration = end_time.duration_since(lock_time); @@ -309,15 +296,15 @@ impl PostgresRedoManager { // next request will launch a new one. if let Err(e) = result.as_ref() { error!( - n_attempts, - "error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {}: {}", + "error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}", records.len(), records.first().map(|p| p.0).unwrap_or(Lsn(0)), records.last().map(|p| p.0).unwrap_or(Lsn(0)), nbytes, base_img_lsn, lsn, - utils::error::report_compact_sources(e), + n_attempts, + e, ); // self.stdin only holds stdin & stderr as_raw_fd(). // Dropping it as part of take() doesn't close them. @@ -354,7 +341,7 @@ impl PostgresRedoManager { lsn: Lsn, base_img: Option, records: &[(Lsn, NeonWalRecord)], - ) -> Result { + ) -> anyhow::Result { let start_time = Instant::now(); let mut page = BytesMut::new(); @@ -363,8 +350,7 @@ impl PostgresRedoManager { page.extend_from_slice(&fpi[..]); } else { // All the current WAL record types that we can handle require a base image. - error!("invalid neon WAL redo request with no base image"); - return Err(WalRedoError::InvalidRequest); + anyhow::bail!("invalid neon WAL redo request with no base image"); } // Apply all the WAL records in the batch @@ -392,14 +378,13 @@ impl PostgresRedoManager { page: &mut BytesMut, _record_lsn: Lsn, record: &NeonWalRecord, - ) -> Result<(), WalRedoError> { + ) -> anyhow::Result<()> { match record { NeonWalRecord::Postgres { will_init: _, rec: _, } => { - error!("tried to pass postgres wal record to neon WAL redo"); - return Err(WalRedoError::InvalidRequest); + anyhow::bail!("tried to pass postgres wal record to neon WAL redo"); } NeonWalRecord::ClearVisibilityMapFlags { new_heap_blkno, @@ -407,7 +392,7 @@ impl PostgresRedoManager { flags, } => { // sanity check that this is modifying the correct relation - let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?; + let (rel, blknum) = key_to_rel_block(key).context("invalid record")?; assert!( rel.forknum == VISIBILITYMAP_FORKNUM, "ClearVisibilityMapFlags record on unexpected rel {}", @@ -445,7 +430,7 @@ impl PostgresRedoManager { // same effects as the corresponding Postgres WAL redo function. NeonWalRecord::ClogSetCommitted { xids, timestamp } => { let (slru_kind, segno, blknum) = - key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?; + key_to_slru_block(key).context("invalid record")?; assert_eq!( slru_kind, SlruKind::Clog, @@ -495,7 +480,7 @@ impl PostgresRedoManager { } NeonWalRecord::ClogSetAborted { xids } => { let (slru_kind, segno, blknum) = - key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?; + key_to_slru_block(key).context("invalid record")?; assert_eq!( slru_kind, SlruKind::Clog, @@ -526,7 +511,7 @@ impl PostgresRedoManager { } NeonWalRecord::MultixactOffsetCreate { mid, moff } => { let (slru_kind, segno, blknum) = - key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?; + key_to_slru_block(key).context("invalid record")?; assert_eq!( slru_kind, SlruKind::MultiXactOffsets, @@ -559,7 +544,7 @@ impl PostgresRedoManager { } NeonWalRecord::MultixactMembersCreate { moff, members } => { let (slru_kind, segno, blknum) = - key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?; + key_to_slru_block(key).context("invalid record")?; assert_eq!( slru_kind, SlruKind::MultiXactMembers, @@ -759,7 +744,7 @@ impl PostgresRedoManager { base_img: &Option, records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, - ) -> Result { + ) -> anyhow::Result { // Serialize all the messages to send the WAL redo process first. // // This could be problematic if there are millions of records to replay, @@ -782,10 +767,7 @@ impl PostgresRedoManager { { build_apply_record_msg(*lsn, postgres_rec, &mut writebuf); } else { - return Err(Error::new( - ErrorKind::Other, - "tried to pass neon wal record to postgres WAL redo", - )); + anyhow::bail!("tried to pass neon wal record to postgres WAL redo"); } } build_get_page_msg(tag, &mut writebuf); @@ -807,7 +789,7 @@ impl PostgresRedoManager { writebuf: &[u8], mut input: MutexGuard>, wal_redo_timeout: Duration, - ) -> Result { + ) -> anyhow::Result { let proc = input.as_mut().unwrap(); let mut nwrite = 0usize; let stdout_fd = proc.stdout_fd; @@ -831,7 +813,7 @@ impl PostgresRedoManager { }?; if n == 0 { - return Err(Error::new(ErrorKind::Other, "WAL redo timed out")); + anyhow::bail!("WAL redo timed out"); } // If we have some messages in stderr, forward them to the log. @@ -855,10 +837,7 @@ impl PostgresRedoManager { continue; } } else if err_revents.contains(PollFlags::POLLHUP) { - return Err(Error::new( - ErrorKind::BrokenPipe, - "WAL redo process closed its stderr unexpectedly", - )); + anyhow::bail!("WAL redo process closed its stderr unexpectedly"); } // If 'stdin' is writeable, do write. @@ -867,10 +846,7 @@ impl PostgresRedoManager { nwrite += proc.stdin.write(&writebuf[nwrite..])?; } else if in_revents.contains(PollFlags::POLLHUP) { // We still have more data to write, but the process closed the pipe. - return Err(Error::new( - ErrorKind::BrokenPipe, - "WAL redo process closed its stdin unexpectedly", - )); + anyhow::bail!("WAL redo process closed its stdin unexpectedly"); } } let request_no = proc.n_requests; @@ -901,10 +877,7 @@ impl PostgresRedoManager { // // Cross-read this with the comment in apply_batch_postgres if result.is_err(). // That's where we kill the child process. - return Err(Error::new( - ErrorKind::BrokenPipe, - "WAL redo process closed its stdout unexpectedly", - )); + anyhow::bail!("WAL redo process closed its stdout unexpectedly"); } let n_processed_responses = output.n_processed_responses; while n_processed_responses + output.pending_responses.len() <= request_no { @@ -923,7 +896,7 @@ impl PostgresRedoManager { }?; if n == 0 { - return Err(Error::new(ErrorKind::Other, "WAL redo timed out")); + anyhow::bail!("WAL redo timed out"); } // If we have some messages in stderr, forward them to the log. @@ -947,10 +920,7 @@ impl PostgresRedoManager { continue; } } else if err_revents.contains(PollFlags::POLLHUP) { - return Err(Error::new( - ErrorKind::BrokenPipe, - "WAL redo process closed its stderr unexpectedly", - )); + anyhow::bail!("WAL redo process closed its stderr unexpectedly"); } // If we have some data in stdout, read it to the result buffer. @@ -958,10 +928,7 @@ impl PostgresRedoManager { if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { nresult += output.stdout.read(&mut resultbuf[nresult..])?; } else if out_revents.contains(PollFlags::POLLHUP) { - return Err(Error::new( - ErrorKind::BrokenPipe, - "WAL redo process closed its stdout unexpectedly", - )); + anyhow::bail!("WAL redo process closed its stdout unexpectedly"); } } output From c3626e34325661a6a5a2780f63910c6075f6dea4 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 13 Oct 2023 20:16:15 +0200 Subject: [PATCH 10/18] walredo: remove legacy wal-redo-datadir cleanup code (#5554) It says it in the comment. --- pageserver/src/walredo.rs | 25 ++----------------------- 1 file changed, 2 insertions(+), 23 deletions(-) diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index fc569c474a..b8f2180c97 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -24,6 +24,7 @@ use bytes::{BufMut, Bytes, BytesMut}; use nix::poll::*; use serde::Serialize; use std::collections::VecDeque; +use std::io; use std::io::prelude::*; use std::io::{Error, ErrorKind}; use std::ops::{Deref, DerefMut}; @@ -34,14 +35,13 @@ use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command}; use std::sync::{Mutex, MutexGuard}; use std::time::Duration; use std::time::Instant; -use std::{fs, io}; use tracing::*; -use utils::crashsafe::path_with_suffix_extension; use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock}; #[cfg(feature = "testing")] use std::sync::atomic::{AtomicUsize, Ordering}; +use crate::config::PageServerConf; use crate::metrics::{ WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME, WAL_REDO_WAIT_TIME, @@ -50,7 +50,6 @@ use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block}; use crate::repository::Key; use crate::task_mgr::BACKGROUND_RUNTIME; use crate::walrecord::NeonWalRecord; -use crate::{config::PageServerConf, TEMP_FILE_SUFFIX}; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; @@ -634,26 +633,6 @@ impl PostgresRedoManager { input: &mut MutexGuard>, pg_version: u32, ) -> Result<(), Error> { - // Previous versions of wal-redo required data directory and that directories - // occupied some space on disk. Remove it if we face it. - // - // This code could be dropped after one release cycle. - let legacy_datadir = path_with_suffix_extension( - self.conf - .tenant_path(&self.tenant_id) - .join("wal-redo-datadir"), - TEMP_FILE_SUFFIX, - ); - if legacy_datadir.exists() { - info!("legacy wal-redo datadir {legacy_datadir:?} exists, removing"); - fs::remove_dir_all(&legacy_datadir).map_err(|e| { - Error::new( - e.kind(), - format!("legacy wal-redo datadir {legacy_datadir:?} removal failure: {e}"), - ) - })?; - } - let pg_bin_dir_path = self .conf .pg_bin_dir(pg_version) From 99c15907c1f5856f7ee49f55d9363e9b1ad0179d Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 13 Oct 2023 20:35:53 +0200 Subject: [PATCH 11/18] walredo: trim public interfaces (#5556) Stacked atop https://github.com/neondatabase/neon/pull/5554. --- pageserver/src/walredo.rs | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index b8f2180c97..28e5f997bb 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -66,7 +66,7 @@ use postgres_ffi::BLCKSZ; /// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/buf_internals.h#L91). /// #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize)] -pub struct BufferTag { +pub(crate) struct BufferTag { pub rel: RelTag, pub blknum: u32, } @@ -223,15 +223,6 @@ impl PostgresRedoManager { } } - /// Launch process pre-emptively. Should not be needed except for benchmarking. - pub fn launch_process(&self, pg_version: u32) -> anyhow::Result<()> { - let mut proc = self.stdin.lock().unwrap(); - if proc.is_none() { - self.launch(&mut proc, pg_version)?; - } - Ok(()) - } - /// /// Process one request for WAL redo using wal-redo postgres /// From 44b1c4c456b47b7fbfed0008ca9be6e6bdf13f04 Mon Sep 17 00:00:00 2001 From: John Spray Date: Sun, 15 Oct 2023 20:23:18 +0100 Subject: [PATCH 12/18] pageserver: fix eviction across generations (#5538) ## Problem Bug was introduced by me in 83ae2bd82c When eviction constructs a RemoteLayer to replace the layer it just evicted, it is building a LayerFileMetadata using its _current_ generation, rather than the generation of the layer. ## Summary of changes - Retrieve Generation from RemoteTimelineClient when evicting. This will no longer be necessary when #4938 lands. - Add a test for the scenario in question (this fails without the fix). --- pageserver/src/disk_usage_eviction_task.rs | 5 ++ .../src/tenant/remote_timeline_client.rs | 7 +++ pageserver/src/tenant/timeline.rs | 22 ++++++- .../src/tenant/timeline/eviction_task.rs | 4 ++ pageserver/src/tenant/upload_queue.rs | 12 ++++ .../regress/test_pageserver_generations.py | 61 +++++++++++++++++++ 6 files changed, 110 insertions(+), 1 deletion(-) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index b8af4c3d11..b4d73baa88 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -411,6 +411,11 @@ pub async fn disk_usage_eviction_task_iteration_impl( evictions_failed.file_sizes += file_size; evictions_failed.count += 1; } + Some(Err(EvictionError::MetadataInconsistency(detail))) => { + warn!(%layer, "failed to evict layer: {detail}"); + evictions_failed.file_sizes += file_size; + evictions_failed.count += 1; + } None => { assert!(cancel.is_cancelled()); return; diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 622f9b1b9e..06e2292156 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1419,6 +1419,13 @@ impl RemoteTimelineClient { } } } + + pub(crate) fn get_layer_metadata( + &self, + name: &LayerFileName, + ) -> anyhow::Result> { + self.upload_queue.lock().unwrap().get_layer_metadata(name) + } } pub fn remote_timelines_path(tenant_id: &TenantId) -> RemotePath { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 85b33b98b9..537f776176 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1294,7 +1294,23 @@ impl Timeline { Ok(delta) => Some(delta), }; - let layer_metadata = LayerFileMetadata::new(layer_file_size, self.generation); + // RemoteTimelineClient holds the metadata on layers' remote generations, so + // query it to construct a RemoteLayer. + let layer_metadata = self + .remote_client + .as_ref() + .expect("Eviction is not called without remote storage") + .get_layer_metadata(&local_layer.filename()) + .map_err(EvictionError::LayerNotFound)? + .ok_or_else(|| { + EvictionError::LayerNotFound(anyhow::anyhow!("Layer not in remote metadata")) + })?; + if layer_metadata.file_size() != layer_file_size { + return Err(EvictionError::MetadataInconsistency(format!( + "Layer size {layer_file_size} doesn't match remote metadata file size {}", + layer_metadata.file_size() + ))); + } let new_remote_layer = Arc::new(match local_layer.filename() { LayerFileName::Image(image_name) => RemoteLayer::new_img( @@ -1373,6 +1389,10 @@ pub(crate) enum EvictionError { /// different objects in memory. #[error("layer was no longer part of LayerMap")] LayerNotFound(#[source] anyhow::Error), + + /// This should never happen + #[error("Metadata inconsistency")] + MetadataInconsistency(String), } /// Number of times we will compute partition within a checkpoint distance. diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 39f0d03a01..9bf31d85d4 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -285,6 +285,10 @@ impl Timeline { warn!(layer = %l, "failed to evict layer: {e}"); stats.not_evictable += 1; } + Some(Err(EvictionError::MetadataInconsistency(detail))) => { + warn!(layer = %l, "failed to evict layer: {detail}"); + stats.not_evictable += 1; + } } } if stats.candidates == stats.not_evictable { diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index 8150e71c95..2db67d071a 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -203,6 +203,18 @@ impl UploadQueue { UploadQueue::Stopped(stopped) => Ok(stopped), } } + + pub(crate) fn get_layer_metadata( + &self, + name: &LayerFileName, + ) -> anyhow::Result> { + match self { + UploadQueue::Stopped(_) | UploadQueue::Uninitialized => { + anyhow::bail!("queue is in state {}", self.as_str()) + } + UploadQueue::Initialized(inner) => Ok(inner.latest_files.get(name).cloned()), + } + } } /// An in-progress upload or delete task. diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 1b9f48706f..6dc8582755 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -104,6 +104,22 @@ def generate_uploads_and_deletions( assert gc_result["layers_removed"] > 0 +def read_all( + env: NeonEnv, tenant_id: Optional[TenantId] = None, timeline_id: Optional[TimelineId] = None +): + if tenant_id is None: + tenant_id = env.initial_tenant + assert tenant_id is not None + + if timeline_id is None: + timeline_id = env.initial_timeline + assert timeline_id is not None + + env.pageserver.http_client() + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: + endpoint.safe_psql("SELECT SUM(LENGTH(val)) FROM foo;") + + def get_metric_or_0(ps_http, metric: str) -> int: v = ps_http.get_metric_value(metric) return 0 if v is None else int(v) @@ -467,3 +483,48 @@ def test_emergency_mode(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): assert get_deletion_queue_depth(ps_http) == 0 assert get_deletion_queue_validated(ps_http) > 0 assert get_deletion_queue_executed(ps_http) > 0 + + +def evict_all_layers(env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId): + timeline_path = env.pageserver.timeline_dir(tenant_id, timeline_id) + initial_local_layers = sorted( + list(filter(lambda path: path.name != "metadata", timeline_path.glob("*"))) + ) + client = env.pageserver.http_client() + for layer in initial_local_layers: + if "ephemeral" in layer.name: + continue + log.info(f"Evicting layer {tenant_id}/{timeline_id} {layer.name}") + client.evict_layer(tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer.name) + + +def test_eviction_across_generations(neon_env_builder: NeonEnvBuilder): + """ + Eviction and on-demand downloads exercise a particular code path where RemoteLayer is constructed + and must be constructed using the proper generation for the layer, which may not be the same generation + that the tenant is running in. + """ + neon_env_builder.enable_generations = True + neon_env_builder.enable_pageserver_remote_storage( + RemoteStorageKind.MOCK_S3, + ) + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + env.pageserver.http_client() + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + generate_uploads_and_deletions(env) + + read_all(env, tenant_id, timeline_id) + evict_all_layers(env, tenant_id, timeline_id) + read_all(env, tenant_id, timeline_id) + + # This will cause the generation to increment + env.pageserver.stop() + env.pageserver.start() + + # Now we are running as generation 2, but must still correctly remember that the layers + # we are evicting and downloading are from generation 1. + read_all(env, tenant_id, timeline_id) + evict_all_layers(env, tenant_id, timeline_id) + read_all(env, tenant_id, timeline_id) From 8c522ea034b9ae5533c4ded654f7d50b5b05a514 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 16 Oct 2023 16:31:04 +0100 Subject: [PATCH 13/18] proxy: count cache-miss for compute latency (#5539) ## Problem Would be good to view latency for hot-path vs cold-path ## Summary of changes add some labels to latency metrics --- proxy/src/http/conn_pool.rs | 10 ++++-- proxy/src/proxy.rs | 71 +++++++++++++++++++++++++++++++------ proxy/src/proxy/tests.rs | 14 ++++---- 3 files changed, 74 insertions(+), 21 deletions(-) diff --git a/proxy/src/http/conn_pool.rs b/proxy/src/http/conn_pool.rs index 7258f1a2cf..b268c4073e 100644 --- a/proxy/src/http/conn_pool.rs +++ b/proxy/src/http/conn_pool.rs @@ -20,7 +20,7 @@ use tokio_postgres::AsyncMessage; use crate::{ auth, console, metrics::{Ids, MetricCounter, USAGE_METRICS}, - proxy::{NUM_DB_CONNECTIONS_CLOSED_COUNTER, NUM_DB_CONNECTIONS_OPENED_COUNTER}, + proxy::{LatencyTimer, NUM_DB_CONNECTIONS_CLOSED_COUNTER, NUM_DB_CONNECTIONS_OPENED_COUNTER}, }; use crate::{compute, config}; @@ -139,6 +139,7 @@ impl GlobalConnPool { session_id: uuid::Uuid, ) -> anyhow::Result { let mut client: Option = None; + let mut latency_timer = LatencyTimer::new("http"); let mut hash_valid = false; if !force_new { @@ -182,15 +183,16 @@ impl GlobalConnPool { let new_client = if let Some(client) = client { if client.inner.is_closed() { info!("pool: cached connection '{conn_info}' is closed, opening a new one"); - connect_to_compute(self.proxy_config, conn_info, session_id).await + connect_to_compute(self.proxy_config, conn_info, session_id, latency_timer).await } else { + latency_timer.pool_hit(); info!("pool: reusing connection '{conn_info}'"); client.session.send(session_id)?; return Ok(client); } } else { info!("pool: opening a new connection '{conn_info}'"); - connect_to_compute(self.proxy_config, conn_info, session_id).await + connect_to_compute(self.proxy_config, conn_info, session_id, latency_timer).await }; match &new_client { @@ -347,6 +349,7 @@ async fn connect_to_compute( config: &config::ProxyConfig, conn_info: &ConnInfo, session_id: uuid::Uuid, + latency_timer: LatencyTimer, ) -> anyhow::Result { let tls = config.tls_config.as_ref(); let common_names = tls.and_then(|tls| tls.common_names.clone()); @@ -386,6 +389,7 @@ async fn connect_to_compute( node_info, &extra, &creds, + latency_timer, ) .await } diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 9cfe98347b..6a928547a9 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -15,12 +15,11 @@ use crate::{ use anyhow::{bail, Context}; use async_trait::async_trait; use futures::TryFutureExt; -use metrics::{ - exponential_buckets, register_histogram, register_int_counter_vec, Histogram, IntCounterVec, -}; +use metrics::{exponential_buckets, register_int_counter_vec, IntCounterVec}; use once_cell::sync::Lazy; use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; -use std::{error::Error, io, ops::ControlFlow, sync::Arc}; +use prometheus::{register_histogram_vec, HistogramVec}; +use std::{error::Error, io, ops::ControlFlow, sync::Arc, time::Instant}; use tokio::{ io::{AsyncRead, AsyncWrite, AsyncWriteExt}, time, @@ -93,16 +92,57 @@ pub static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy = Lazy::new(|| { .unwrap() }); -static COMPUTE_CONNECTION_LATENCY: Lazy = Lazy::new(|| { - register_histogram!( +static COMPUTE_CONNECTION_LATENCY: Lazy = Lazy::new(|| { + register_histogram_vec!( "proxy_compute_connection_latency_seconds", "Time it took for proxy to establish a connection to the compute endpoint", + &["protocol", "cache_miss", "pool_miss"], // largest bucket = 2^16 * 0.5ms = 32s exponential_buckets(0.0005, 2.0, 16).unwrap(), ) .unwrap() }); +pub struct LatencyTimer { + start: Instant, + pool_miss: bool, + cache_miss: bool, + protocol: &'static str, +} + +impl LatencyTimer { + pub fn new(protocol: &'static str) -> Self { + Self { + start: Instant::now(), + cache_miss: false, + // by default we don't do pooling + pool_miss: true, + protocol, + } + } + + pub fn cache_miss(&mut self) { + self.cache_miss = true; + } + + pub fn pool_hit(&mut self) { + self.pool_miss = false; + } +} + +impl Drop for LatencyTimer { + fn drop(&mut self) { + let duration = self.start.elapsed().as_secs_f64(); + COMPUTE_CONNECTION_LATENCY + .with_label_values(&[ + self.protocol, + bool_to_str(self.cache_miss), + bool_to_str(self.pool_miss), + ]) + .observe(duration) + } +} + static NUM_CONNECTION_FAILURES: Lazy = Lazy::new(|| { register_int_counter_vec!( "proxy_connection_failures_total", @@ -495,13 +535,12 @@ pub async fn connect_to_compute( mut node_info: console::CachedNodeInfo, extra: &console::ConsoleReqExtra<'_>, creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>, + mut latency_timer: LatencyTimer, ) -> Result where M::ConnectError: ShouldRetry + std::fmt::Debug, M::Error: From, { - let _timer = COMPUTE_CONNECTION_LATENCY.start_timer(); - mechanism.update_connect_config(&mut node_info.config); // try once @@ -513,6 +552,8 @@ where } }; + latency_timer.cache_miss(); + let mut num_retries = 1; // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node @@ -789,6 +830,8 @@ impl Client<'_, S> { application_name: params.get("application_name"), }; + let latency_timer = LatencyTimer::new(mode.protocol_label()); + let auth_result = match creds .authenticate(&extra, &mut stream, mode.allow_cleartext()) .await @@ -812,9 +855,15 @@ impl Client<'_, S> { node_info.allow_self_signed_compute = allow_self_signed_compute; let aux = node_info.aux.clone(); - let mut node = connect_to_compute(&TcpMechanism { params }, node_info, &extra, &creds) - .or_else(|e| stream.throw_error(e)) - .await?; + let mut node = connect_to_compute( + &TcpMechanism { params }, + node_info, + &extra, + &creds, + latency_timer, + ) + .or_else(|e| stream.throw_error(e)) + .await?; let proto = mode.protocol_label(); NUM_DB_CONNECTIONS_OPENED_COUNTER diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 9615017883..8f7f461bef 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -450,7 +450,7 @@ async fn connect_to_compute_success() { use ConnectAction::*; let mechanism = TestConnectMechanism::new(vec![Connect]); let (cache, extra, creds) = helper_create_connect_info(&mechanism); - connect_to_compute(&mechanism, cache, &extra, &creds) + connect_to_compute(&mechanism, cache, &extra, &creds, LatencyTimer::new("test")) .await .unwrap(); mechanism.verify(); @@ -461,7 +461,7 @@ async fn connect_to_compute_retry() { use ConnectAction::*; let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Connect]); let (cache, extra, creds) = helper_create_connect_info(&mechanism); - connect_to_compute(&mechanism, cache, &extra, &creds) + connect_to_compute(&mechanism, cache, &extra, &creds, LatencyTimer::new("test")) .await .unwrap(); mechanism.verify(); @@ -473,7 +473,7 @@ async fn connect_to_compute_non_retry_1() { use ConnectAction::*; let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Fail]); let (cache, extra, creds) = helper_create_connect_info(&mechanism); - connect_to_compute(&mechanism, cache, &extra, &creds) + connect_to_compute(&mechanism, cache, &extra, &creds, LatencyTimer::new("test")) .await .unwrap_err(); mechanism.verify(); @@ -485,7 +485,7 @@ async fn connect_to_compute_non_retry_2() { use ConnectAction::*; let mechanism = TestConnectMechanism::new(vec![Fail, Wake, Retry, Connect]); let (cache, extra, creds) = helper_create_connect_info(&mechanism); - connect_to_compute(&mechanism, cache, &extra, &creds) + connect_to_compute(&mechanism, cache, &extra, &creds, LatencyTimer::new("test")) .await .unwrap(); mechanism.verify(); @@ -501,7 +501,7 @@ async fn connect_to_compute_non_retry_3() { Retry, Retry, Retry, Retry, /* the 17th time */ Retry, ]); let (cache, extra, creds) = helper_create_connect_info(&mechanism); - connect_to_compute(&mechanism, cache, &extra, &creds) + connect_to_compute(&mechanism, cache, &extra, &creds, LatencyTimer::new("test")) .await .unwrap_err(); mechanism.verify(); @@ -513,7 +513,7 @@ async fn wake_retry() { use ConnectAction::*; let mechanism = TestConnectMechanism::new(vec![Retry, WakeRetry, Wake, Connect]); let (cache, extra, creds) = helper_create_connect_info(&mechanism); - connect_to_compute(&mechanism, cache, &extra, &creds) + connect_to_compute(&mechanism, cache, &extra, &creds, LatencyTimer::new("test")) .await .unwrap(); mechanism.verify(); @@ -525,7 +525,7 @@ async fn wake_non_retry() { use ConnectAction::*; let mechanism = TestConnectMechanism::new(vec![Retry, WakeFail]); let (cache, extra, creds) = helper_create_connect_info(&mechanism); - connect_to_compute(&mechanism, cache, &extra, &creds) + connect_to_compute(&mechanism, cache, &extra, &creds, LatencyTimer::new("test")) .await .unwrap_err(); mechanism.verify(); From e09d5ada6a7437595cff5b460386ac32bdb03780 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 16 Oct 2023 17:37:09 +0200 Subject: [PATCH 14/18] Azure blob storage support (#5546) Adds prototype-level support for [Azure blob storage](https://azure.microsoft.com/en-us/products/storage/blobs). Some corners were cut, see the TODOs and the followup issue #5567 for details. Steps to try it out: * Create a storage account with block blobs (this is a per-storage account setting). * Create a container inside that storage account. * Set the appropriate env vars: `AZURE_STORAGE_ACCOUNT, AZURE_STORAGE_ACCESS_KEY, REMOTE_STORAGE_AZURE_CONTAINER, REMOTE_STORAGE_AZURE_REGION` * Set the env var `ENABLE_REAL_AZURE_REMOTE_STORAGE=y` and run `cargo test -p remote_storage azure` Fixes #5562 --- Cargo.lock | 946 +++++++++++++++++-- Cargo.toml | 4 + control_plane/src/background_process.rs | 6 +- docs/pageserver-services.md | 10 + libs/remote_storage/Cargo.toml | 7 + libs/remote_storage/src/azure_blob.rs | 381 ++++++++ libs/remote_storage/src/lib.rs | 198 +++- libs/remote_storage/src/s3_bucket.rs | 54 +- libs/remote_storage/src/s3_bucket/metrics.rs | 2 +- libs/remote_storage/tests/test_real_azure.rs | 619 ++++++++++++ workspace_hack/Cargo.toml | 12 +- 11 files changed, 2093 insertions(+), 146 deletions(-) create mode 100644 libs/remote_storage/src/azure_blob.rs create mode 100644 libs/remote_storage/tests/test_real_azure.rs diff --git a/Cargo.lock b/Cargo.lock index be3f179d5f..aacf4e53d7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,12 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "RustyXML" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b5ace29ee3216de37c0546865ad08edef58b0f9e76838ed8959a84a990e58c5" + [[package]] name = "addr2line" version = "0.19.0" @@ -17,6 +23,60 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "aead" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fc95d1bdb8e6666b2b217308eeeb09f2d6728d104be3e31916cc74d15420331" +dependencies = [ + "generic-array", +] + +[[package]] +name = "aes" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "884391ef1066acaa41e766ba8f596341b96e93ce34f9a43e7d24bf0a0eaf0561" +dependencies = [ + "aes-soft", + "aesni", + "cipher", +] + +[[package]] +name = "aes-gcm" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5278b5fabbb9bd46e24aa69b2fdea62c99088e0a950a9be40e3e0101298f88da" +dependencies = [ + "aead", + "aes", + "cipher", + "ctr", + "ghash", + "subtle", +] + +[[package]] +name = "aes-soft" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be14c7498ea50828a38d0e24a765ed2effe92a705885b57d029cd67d45744072" +dependencies = [ + "cipher", + "opaque-debug", +] + +[[package]] +name = "aesni" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea2e11f5e94c2f7d386164cc2aa1f97823fed6f259e486940a71c174dd01b0ce" +dependencies = [ + "cipher", + "opaque-debug", +] + [[package]] name = "ahash" version = "0.8.3" @@ -132,7 +192,7 @@ dependencies = [ "num-traits", "rusticata-macros", "thiserror", - "time", + "time 0.3.21", ] [[package]] @@ -158,6 +218,17 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "async-channel" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35" +dependencies = [ + "concurrent-queue", + "event-listener", + "futures-core", +] + [[package]] name = "async-compression" version = "0.4.0" @@ -171,6 +242,90 @@ dependencies = [ "tokio", ] +[[package]] +name = "async-executor" +version = "1.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c1da3ae8dabd9c00f453a329dfe1fb28da3c0a72e2478cdcd93171740c20499" +dependencies = [ + "async-lock", + "async-task", + "concurrent-queue", + "fastrand 2.0.0", + "futures-lite", + "slab", +] + +[[package]] +name = "async-global-executor" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1b6f5d7df27bd294849f8eec66ecfc63d11814df7a4f5d74168a2394467b776" +dependencies = [ + "async-channel", + "async-executor", + "async-io", + "async-lock", + "blocking", + "futures-lite", + "once_cell", +] + +[[package]] +name = "async-io" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fc5b45d93ef0529756f812ca52e44c221b35341892d3dcc34132ac02f3dd2af" +dependencies = [ + "async-lock", + "autocfg", + "cfg-if", + "concurrent-queue", + "futures-lite", + "log", + "parking", + "polling", + "rustix 0.37.19", + "slab", + "socket2 0.4.9", + "waker-fn", +] + +[[package]] +name = "async-lock" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "287272293e9d8c41773cec55e365490fe034813a2f172f502d6ddcf75b2f582b" +dependencies = [ + "event-listener", +] + +[[package]] +name = "async-std" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62565bb4402e926b29953c785397c6dc0391b7b446e45008b0049eb43cec6f5d" +dependencies = [ + "async-channel", + "async-global-executor", + "async-io", + "async-lock", + "crossbeam-utils", + "futures-channel", + "futures-core", + "futures-io", + "futures-lite", + "gloo-timers", + "kv-log-macro", + "log", + "memchr", + "once_cell", + "pin-project-lite", + "pin-utils", + "slab", + "wasm-bindgen-futures", +] + [[package]] name = "async-stream" version = "0.3.5" @@ -193,6 +348,12 @@ dependencies = [ "syn 2.0.28", ] +[[package]] +name = "async-task" +version = "4.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9441c6b2fe128a7c2bf680a44c34d0df31ce09e5b7e401fcca3faa483dbc921" + [[package]] name = "async-trait" version = "0.1.68" @@ -213,6 +374,12 @@ dependencies = [ "critical-section", ] +[[package]] +name = "atomic-waker" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" + [[package]] name = "autocfg" version = "1.1.0" @@ -242,7 +409,7 @@ dependencies = [ "http", "hyper", "ring", - "time", + "time 0.3.21", "tokio", "tower", "tracing", @@ -395,13 +562,13 @@ dependencies = [ "bytes", "form_urlencoded", "hex", - "hmac", + "hmac 0.12.1", "http", "once_cell", "percent-encoding", "regex", - "sha2", - "time", + "sha2 0.10.6", + "time 0.3.21", "tracing", ] @@ -433,8 +600,8 @@ dependencies = [ "http-body", "md-5", "pin-project-lite", - "sha1", - "sha2", + "sha1 0.10.5", + "sha2 0.10.6", "tracing", ] @@ -579,7 +746,7 @@ dependencies = [ "num-integer", "ryu", "serde", - "time", + "time 0.3.21", ] [[package]] @@ -603,7 +770,7 @@ dependencies = [ "aws-smithy-http", "aws-smithy-types", "http", - "rustc_version", + "rustc_version 0.4.0", "tracing", ] @@ -633,7 +800,7 @@ dependencies = [ "serde_json", "serde_path_to_error", "serde_urlencoded", - "sha1", + "sha1 0.10.5", "sync_wrapper", "tokio", "tokio-tungstenite", @@ -659,6 +826,75 @@ dependencies = [ "tower-service", ] +[[package]] +name = "azure_core" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e29286b9edfdd6f2c7e9d970bb5b015df8621258acab9ecfcea09b2d7692467" +dependencies = [ + "async-trait", + "base64 0.21.1", + "bytes", + "dyn-clone", + "futures", + "getrandom 0.2.9", + "http-types", + "log", + "paste", + "pin-project", + "quick-xml", + "rand 0.8.5", + "reqwest", + "rustc_version 0.4.0", + "serde", + "serde_json", + "time 0.3.21", + "url", + "uuid", +] + +[[package]] +name = "azure_storage" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bed0ccefde57930b2886fd4aed1f70ac469c197b8c2e94828290d71bcbdb5d97" +dependencies = [ + "RustyXML", + "async-trait", + "azure_core", + "bytes", + "futures", + "hmac 0.12.1", + "log", + "serde", + "serde_derive", + "serde_json", + "sha2 0.10.6", + "time 0.3.21", + "url", + "uuid", +] + +[[package]] +name = "azure_storage_blobs" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f91a52da2d192cfe43759f61e8bb31a5969f1722d5b85ac89627f356ad674ab4" +dependencies = [ + "RustyXML", + "azure_core", + "azure_storage", + "bytes", + "futures", + "log", + "serde", + "serde_derive", + "serde_json", + "time 0.3.21", + "url", + "uuid", +] + [[package]] name = "backtrace" version = "0.3.67" @@ -674,6 +910,12 @@ dependencies = [ "rustc-demangle", ] +[[package]] +name = "base-x" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cbbc9d0964165b47557570cce6c952866c2678457aca742aafc9fb771d30270" + [[package]] name = "base64" version = "0.13.1" @@ -746,6 +988,15 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "block-buffer" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4" +dependencies = [ + "generic-array", +] + [[package]] name = "block-buffer" version = "0.10.4" @@ -755,6 +1006,22 @@ dependencies = [ "generic-array", ] +[[package]] +name = "blocking" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c36a4d0d48574b3dd360b4b7d95cc651d2b6557b6402848a27d4b228a473e2a" +dependencies = [ + "async-channel", + "async-lock", + "async-task", + "fastrand 2.0.0", + "futures-io", + "futures-lite", + "piper", + "tracing", +] + [[package]] name = "bstr" version = "1.5.0" @@ -897,6 +1164,15 @@ dependencies = [ "half", ] +[[package]] +name = "cipher" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12f8e7987cbd042a63249497f41aed09f8e65add917ea6566effbc56578d6801" +dependencies = [ + "generic-array", +] + [[package]] name = "clang-sys" version = "1.6.1" @@ -1031,6 +1307,21 @@ dependencies = [ "zstd", ] +[[package]] +name = "concurrent-queue" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f057a694a54f12365049b0958a1685bb52d567f5593b355fbf685838e873d400" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "const_fn" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbdcdcb6d86f71c5e97409ad45898af11cbc995b4ee8112d59095a28d376c935" + [[package]] name = "const_format" version = "0.2.30" @@ -1057,7 +1348,7 @@ version = "0.1.0" dependencies = [ "anyhow", "chrono", - "rand", + "rand 0.8.5", "serde", "serde_with", "utils", @@ -1099,6 +1390,23 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "cookie" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03a5d7b21829bc7b4bf4754a978a241ae54ea55a40f92bb20216e54096f4b951" +dependencies = [ + "aes-gcm", + "base64 0.13.1", + "hkdf", + "hmac 0.10.1", + "percent-encoding", + "rand 0.8.5", + "sha2 0.9.9", + "time 0.2.27", + "version_check", +] + [[package]] name = "core-foundation" version = "0.9.3" @@ -1124,13 +1432,19 @@ dependencies = [ "libc", ] +[[package]] +name = "cpuid-bool" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcb25d077389e53838a8158c8e99174c5a9d902dee4904320db714f3c653ffba" + [[package]] name = "crc32c" version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3dfea2db42e9927a3845fb268a10a72faed6d416065f77873f05e411457c363e" dependencies = [ - "rustc_version", + "rustc_version 0.4.0", ] [[package]] @@ -1262,6 +1576,35 @@ dependencies = [ "typenum", ] +[[package]] +name = "crypto-mac" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4857fd85a0c34b3c3297875b747c1e02e06b6a0ea32dd892d8192b9ce0813ea6" +dependencies = [ + "generic-array", + "subtle", +] + +[[package]] +name = "ctor" +version = "0.1.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d2301688392eb071b0bf1a37be05c469d3cc4dbbd95df672fe28ab021e6a096" +dependencies = [ + "quote", + "syn 1.0.109", +] + +[[package]] +name = "ctr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb4a30d54f7443bf3d6191dcd486aca19e67cb3c49fa7a06a319966346707e7f" +dependencies = [ + "cipher", +] + [[package]] name = "darling" version = "0.20.1" @@ -1340,17 +1683,32 @@ dependencies = [ "rusticata-macros", ] +[[package]] +name = "digest" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066" +dependencies = [ + "generic-array", +] + [[package]] name = "digest" version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ - "block-buffer", + "block-buffer 0.10.4", "crypto-common", "subtle", ] +[[package]] +name = "discard" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "212d0f5754cb6769937f4501cc0e67f4f4483c8d2c3e1e922ee9edbe4ab4c7c0" + [[package]] name = "displaydoc" version = "0.2.4" @@ -1362,6 +1720,12 @@ dependencies = [ "syn 2.0.28", ] +[[package]] +name = "dyn-clone" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23d2f3407d9a573d666de4b5bdf10569d73ca9478087346697dcbae6244bfbcd" + [[package]] name = "either" version = "1.8.1" @@ -1452,6 +1816,12 @@ dependencies = [ "libc", ] +[[package]] +name = "event-listener" +version = "2.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" + [[package]] name = "fail" version = "0.5.1" @@ -1460,7 +1830,7 @@ checksum = "fe5e43d0f78a42ad591453aedb1d7ae631ce7ee445c7643691055a9ed8d3b01c" dependencies = [ "log", "once_cell", - "rand", + "rand 0.8.5", ] [[package]] @@ -1609,6 +1979,21 @@ version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964" +[[package]] +name = "futures-lite" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49a9d51ce47660b1e808d3c990b4709f2f415d928835a17dfd16991515c46bce" +dependencies = [ + "fastrand 1.9.0", + "futures-core", + "futures-io", + "memchr", + "parking", + "pin-project-lite", + "waker-fn", +] + [[package]] name = "futures-macro" version = "0.3.28" @@ -1666,6 +2051,17 @@ dependencies = [ "version_check", ] +[[package]] +name = "getrandom" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.9.0+wasi-snapshot-preview1", +] + [[package]] name = "getrandom" version = "0.2.9" @@ -1675,10 +2071,20 @@ dependencies = [ "cfg-if", "js-sys", "libc", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", "wasm-bindgen", ] +[[package]] +name = "ghash" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97304e4cd182c3846f7575ced3890c53012ce534ad9114046b0a9e00bb30a375" +dependencies = [ + "opaque-debug", + "polyval", +] + [[package]] name = "gimli" version = "0.27.2" @@ -1713,6 +2119,18 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" +[[package]] +name = "gloo-timers" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b995a66bb87bebce9a0f4a95aed01daca4872c050bfcb21653361c03bc35e5c" +dependencies = [ + "futures-channel", + "futures-core", + "js-sys", + "wasm-bindgen", +] + [[package]] name = "h2" version = "0.3.19" @@ -1784,7 +2202,7 @@ source = "git+https://github.com/japaric/heapless.git?rev=644653bf3b831c6bb4963b dependencies = [ "atomic-polyfill", "hash32", - "rustc_version", + "rustc_version 0.4.0", "spin 0.9.8", "stable_deref_trait", ] @@ -1826,13 +2244,33 @@ dependencies = [ "thiserror", ] +[[package]] +name = "hkdf" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51ab2f639c231793c5f6114bdb9bbe50a7dbbfcd7c7c6bd8475dec2d991e964f" +dependencies = [ + "digest 0.9.0", + "hmac 0.10.1", +] + +[[package]] +name = "hmac" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1441c6b1e930e2817404b5046f1f989899143a12bf92de603b69f4e0aee1e15" +dependencies = [ + "crypto-mac", + "digest 0.9.0", +] + [[package]] name = "hmac" version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" dependencies = [ - "digest", + "digest 0.10.7", ] [[package]] @@ -1868,6 +2306,28 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "http-types" +version = "2.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e9b187a72d63adbfba487f48095306ac823049cb504ee195541e91c7775f5ad" +dependencies = [ + "anyhow", + "async-channel", + "async-std", + "base64 0.13.1", + "cookie", + "futures-lite", + "infer", + "pin-project-lite", + "rand 0.7.3", + "serde", + "serde_json", + "serde_qs", + "serde_urlencoded", + "url", +] + [[package]] name = "httparse" version = "1.8.0" @@ -1947,6 +2407,19 @@ dependencies = [ "tokio-io-timeout", ] +[[package]] +name = "hyper-tls" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905" +dependencies = [ + "bytes", + "hyper", + "native-tls", + "tokio", + "tokio-native-tls", +] + [[package]] name = "hyper-tungstenite" version = "0.11.1" @@ -2010,6 +2483,12 @@ dependencies = [ "serde", ] +[[package]] +name = "infer" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64e9829a50b42bb782c1df523f78d332fe371b10c661e78b7a3c34b0198e9fac" + [[package]] name = "inotify" version = "0.9.6" @@ -2151,6 +2630,15 @@ dependencies = [ "libc", ] +[[package]] +name = "kv-log-macro" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de8b303297635ad57c9f5059fd9cee7a47f8e8daa09df0fcd07dd39fb22977f" +dependencies = [ + "log", +] + [[package]] name = "lazy_static" version = "1.4.0" @@ -2208,6 +2696,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" dependencies = [ "cfg-if", + "value-bag", ] [[package]] @@ -2237,7 +2726,7 @@ version = "0.10.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6365506850d44bff6e2fbcb5176cf63650e48bd45ef2fe2665ae1570e0f4b9ca" dependencies = [ - "digest", + "digest 0.10.7", ] [[package]] @@ -2329,7 +2818,7 @@ checksum = "5b9d9a46eff5b4ff64b45a9e316a6d1e0bc719ef429cbec4dc630684212bfdf9" dependencies = [ "libc", "log", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", "windows-sys 0.45.0", ] @@ -2490,6 +2979,12 @@ version = "11.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" +[[package]] +name = "opaque-debug" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" + [[package]] name = "openssl" version = "0.10.55" @@ -2629,7 +3124,7 @@ dependencies = [ "once_cell", "opentelemetry_api", "percent-encoding", - "rand", + "rand 0.8.5", "thiserror", "tokio", "tokio-stream", @@ -2716,7 +3211,7 @@ dependencies = [ "postgres_connection", "postgres_ffi", "pq_proto", - "rand", + "rand 0.8.5", "regex", "remote_storage", "reqwest", @@ -2766,6 +3261,12 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "parking" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e52c774a4c39359c1d1c52e43f73dd91a75a614652c825408eec30c95a9b2067" + [[package]] name = "parking_lot" version = "0.11.2" @@ -2821,20 +3322,26 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "346f04948ba92c43e8469c1ee6736c7563d71012b17d40745260fe106aac2166" dependencies = [ "base64ct", - "rand_core", + "rand_core 0.6.4", "subtle", ] +[[package]] +name = "paste" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" + [[package]] name = "pbkdf2" version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31" dependencies = [ - "digest", - "hmac", + "digest 0.10.7", + "hmac 0.12.1", "password-hash", - "sha2", + "sha2 0.10.6", ] [[package]] @@ -2928,6 +3435,17 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "piper" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "668d31b1c4eba19242f2088b2bf3316b82ca31082a8335764db4e083db7485d4" +dependencies = [ + "atomic-waker", + "fastrand 2.0.0", + "futures-io", +] + [[package]] name = "pkg-config" version = "0.3.27" @@ -2962,6 +3480,33 @@ dependencies = [ "plotters-backend", ] +[[package]] +name = "polling" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b2d323e8ca7996b3e23126511a523f7e62924d93ecd5ae73b333815b0eb3dce" +dependencies = [ + "autocfg", + "bitflags", + "cfg-if", + "concurrent-queue", + "libc", + "log", + "pin-project-lite", + "windows-sys 0.48.0", +] + +[[package]] +name = "polyval" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eebcc4aa140b9abd2bc40d9c3f7ccec842679cd79045ac3a7ac698c1a064b7cd" +dependencies = [ + "cpuid-bool", + "opaque-debug", + "universal-hash", +] + [[package]] name = "postgres" version = "0.19.4" @@ -2995,12 +3540,12 @@ dependencies = [ "byteorder", "bytes", "fallible-iterator", - "hmac", + "hmac 0.12.1", "lazy_static", "md-5", "memchr", - "rand", - "sha2", + "rand 0.8.5", + "sha2 0.10.6", "stringprep", ] @@ -3064,7 +3609,7 @@ dependencies = [ "memoffset 0.8.0", "once_cell", "postgres", - "rand", + "rand 0.8.5", "regex", "serde", "thiserror", @@ -3086,7 +3631,7 @@ dependencies = [ "bytes", "pin-project-lite", "postgres-protocol", - "rand", + "rand 0.8.5", "thiserror", "tokio", "tracing", @@ -3229,7 +3774,7 @@ dependencies = [ "hashbrown 0.13.2", "hashlink", "hex", - "hmac", + "hmac 0.12.1", "hostname", "humantime", "hyper", @@ -3247,7 +3792,7 @@ dependencies = [ "postgres_backend", "pq_proto", "prometheus", - "rand", + "rand 0.8.5", "rcgen", "regex", "reqwest", @@ -3262,7 +3807,7 @@ dependencies = [ "scopeguard", "serde", "serde_json", - "sha2", + "sha2 0.10.6", "socket2 0.5.3", "sync_wrapper", "thiserror", @@ -3284,6 +3829,16 @@ dependencies = [ "x509-parser", ] +[[package]] +name = "quick-xml" +version = "0.30.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eff6510e86862b57b210fd8cbe8ed3f0d7d600b9c2863cd4549a2e033c66e956" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "quote" version = "1.0.32" @@ -3293,6 +3848,19 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "rand" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" +dependencies = [ + "getrandom 0.1.16", + "libc", + "rand_chacha 0.2.2", + "rand_core 0.5.1", + "rand_hc", +] + [[package]] name = "rand" version = "0.8.5" @@ -3300,8 +3868,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc", - "rand_chacha", - "rand_core", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_chacha" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" +dependencies = [ + "ppv-lite86", + "rand_core 0.5.1", ] [[package]] @@ -3311,7 +3889,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", - "rand_core", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" +dependencies = [ + "getrandom 0.1.16", ] [[package]] @@ -3320,7 +3907,16 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom", + "getrandom 0.2.9", +] + +[[package]] +name = "rand_hc" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" +dependencies = [ + "rand_core 0.5.1", ] [[package]] @@ -3353,7 +3949,7 @@ checksum = "4954fbc00dcd4d8282c987710e50ba513d351400dbdd00e803a05172a90d8976" dependencies = [ "pem 2.0.1", "ring", - "time", + "time 0.3.21", "yasna", ] @@ -3424,13 +4020,20 @@ dependencies = [ "aws-sdk-s3", "aws-smithy-http", "aws-types", + "azure_core", + "azure_storage", + "azure_storage_blobs", + "bytes", "camino", "camino-tempfile", + "futures-util", + "http-types", "hyper", + "itertools", "metrics", "once_cell", "pin-project-lite", - "rand", + "rand 0.8.5", "scopeguard", "serde", "serde_json", @@ -3459,11 +4062,13 @@ dependencies = [ "http-body", "hyper", "hyper-rustls", + "hyper-tls", "ipnet", "js-sys", "log", "mime", "mime_guess", + "native-tls", "once_cell", "percent-encoding", "pin-project-lite", @@ -3473,11 +4078,14 @@ dependencies = [ "serde_json", "serde_urlencoded", "tokio", + "tokio-native-tls", "tokio-rustls", + "tokio-util", "tower-service", "url", "wasm-bindgen", "wasm-bindgen-futures", + "wasm-streams", "web-sys", "webpki-roots 0.25.2", "winreg", @@ -3508,7 +4116,7 @@ dependencies = [ "async-trait", "chrono", "futures", - "getrandom", + "getrandom 0.2.9", "http", "hyper", "parking_lot 0.11.2", @@ -3529,7 +4137,7 @@ checksum = "1b97ad83c2fc18113346b7158d79732242002427c30f620fa817c1f32901e0a8" dependencies = [ "anyhow", "async-trait", - "getrandom", + "getrandom 0.2.9", "matchit", "opentelemetry", "reqwest", @@ -3547,7 +4155,7 @@ checksum = "e09bbcb5003282bcb688f0bae741b278e9c7e8f378f561522c9806c58e075d9b" dependencies = [ "anyhow", "chrono", - "rand", + "rand 0.8.5", ] [[package]] @@ -3596,7 +4204,7 @@ dependencies = [ "futures", "futures-timer", "rstest_macros", - "rustc_version", + "rustc_version 0.4.0", ] [[package]] @@ -3611,7 +4219,7 @@ dependencies = [ "quote", "regex", "relative-path", - "rustc_version", + "rustc_version 0.4.0", "syn 2.0.28", "unicode-ident", ] @@ -3628,13 +4236,22 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" +[[package]] +name = "rustc_version" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" +dependencies = [ + "semver 0.9.0", +] + [[package]] name = "rustc_version" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" dependencies = [ - "semver", + "semver 1.0.17", ] [[package]] @@ -3760,7 +4377,7 @@ dependencies = [ "histogram", "itertools", "pageserver", - "rand", + "rand 0.8.5", "reqwest", "serde", "serde_json", @@ -3894,12 +4511,27 @@ dependencies = [ "libc", ] +[[package]] +name = "semver" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" +dependencies = [ + "semver-parser", +] + [[package]] name = "semver" version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed" +[[package]] +name = "semver-parser" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" + [[package]] name = "sentry" version = "0.31.6" @@ -3940,7 +4572,7 @@ dependencies = [ "hostname", "libc", "os_info", - "rustc_version", + "rustc_version 0.4.0", "sentry-core", "uname", ] @@ -3952,7 +4584,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8339474f587f36cb110fa1ed1b64229eea6d47b0b886375579297b7e47aeb055" dependencies = [ "once_cell", - "rand", + "rand 0.8.5", "sentry-types", "serde", "serde_json", @@ -3987,12 +4619,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "99dc599bd6646884fc403d593cdcb9816dd67c50cff3271c01ff123617908dcd" dependencies = [ "debugid", - "getrandom", + "getrandom 0.2.9", "hex", "serde", "serde_json", "thiserror", - "time", + "time 0.3.21", "url", "uuid", ] @@ -4038,6 +4670,17 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_qs" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7715380eec75f029a4ef7de39a9200e0a63823176b759d055b613f5a87df6a6" +dependencies = [ + "percent-encoding", + "serde", + "thiserror", +] + [[package]] name = "serde_spanned" version = "0.6.2" @@ -4072,7 +4715,7 @@ dependencies = [ "serde", "serde_json", "serde_with_macros", - "time", + "time 0.3.21", ] [[package]] @@ -4087,6 +4730,15 @@ dependencies = [ "syn 2.0.28", ] +[[package]] +name = "sha1" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1da05c97445caa12d05e848c4a4fcbbea29e748ac28f7e80e9b010392063770" +dependencies = [ + "sha1_smol", +] + [[package]] name = "sha1" version = "0.10.5" @@ -4095,7 +4747,26 @@ checksum = "f04293dc80c3993519f2d7f6f511707ee7094fe0c6d3406feb330cdb3540eba3" dependencies = [ "cfg-if", "cpufeatures", - "digest", + "digest 0.10.7", +] + +[[package]] +name = "sha1_smol" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012" + +[[package]] +name = "sha2" +version = "0.9.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d58a1e1bf39749807d89cf2d98ac2dfa0ff1cb3faa38fbb64dd88ac8013d800" +dependencies = [ + "block-buffer 0.9.0", + "cfg-if", + "cpufeatures", + "digest 0.9.0", + "opaque-debug", ] [[package]] @@ -4106,7 +4777,7 @@ checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0" dependencies = [ "cfg-if", "cpufeatures", - "digest", + "digest 0.10.7", ] [[package]] @@ -4163,7 +4834,7 @@ dependencies = [ "num-bigint", "num-traits", "thiserror", - "time", + "time 0.3.21", ] [[package]] @@ -4228,12 +4899,70 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" +[[package]] +name = "standback" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e113fb6f3de07a243d434a56ec6f186dfd51cb08448239fe7bcae73f87ff28ff" +dependencies = [ + "version_check", +] + [[package]] name = "static_assertions" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" +[[package]] +name = "stdweb" +version = "0.4.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d022496b16281348b52d0e30ae99e01a73d737b2f45d38fed4edf79f9325a1d5" +dependencies = [ + "discard", + "rustc_version 0.2.3", + "stdweb-derive", + "stdweb-internal-macros", + "stdweb-internal-runtime", + "wasm-bindgen", +] + +[[package]] +name = "stdweb-derive" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c87a60a40fccc84bef0652345bbbbbe20a605bf5d0ce81719fc476f5c03b50ef" +dependencies = [ + "proc-macro2", + "quote", + "serde", + "serde_derive", + "syn 1.0.109", +] + +[[package]] +name = "stdweb-internal-macros" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58fa5ff6ad0d98d1ffa8cb115892b6e69d67799f6763e162a1c9db421dc22e11" +dependencies = [ + "base-x", + "proc-macro2", + "quote", + "serde", + "serde_derive", + "serde_json", + "sha1 0.6.1", + "syn 1.0.109", +] + +[[package]] +name = "stdweb-internal-runtime" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "213701ba3370744dcd1a12960caa4843b3d68b4d1c0a5d575e0d65b2ee9d16c0" + [[package]] name = "storage_broker" version = "0.1.0" @@ -4467,6 +5196,21 @@ dependencies = [ "once_cell", ] +[[package]] +name = "time" +version = "0.2.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4752a97f8eebd6854ff91f1c1824cd6160626ac4bd44287f7f4ea2035a02a242" +dependencies = [ + "const_fn", + "libc", + "standback", + "stdweb", + "time-macros 0.1.1", + "version_check", + "winapi", +] + [[package]] name = "time" version = "0.3.21" @@ -4474,9 +5218,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f3403384eaacbca9923fa06940178ac13e4edb725486d70e8e15881d0c836cc" dependencies = [ "itoa", + "js-sys", "serde", "time-core", - "time-macros", + "time-macros 0.2.9", ] [[package]] @@ -4485,6 +5230,16 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb" +[[package]] +name = "time-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "957e9c6e26f12cb6d0dd7fc776bb67a706312e7299aed74c8dd5b17ebb27e2f1" +dependencies = [ + "proc-macro-hack", + "time-macros-impl", +] + [[package]] name = "time-macros" version = "0.2.9" @@ -4494,6 +5249,19 @@ dependencies = [ "time-core", ] +[[package]] +name = "time-macros-impl" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd3c141a1b43194f3f56a1411225df8646c55781d5f26db825b3d98507eb482f" +dependencies = [ + "proc-macro-hack", + "proc-macro2", + "quote", + "standback", + "syn 1.0.109", +] + [[package]] name = "tinytemplate" version = "1.2.1" @@ -4803,7 +5571,7 @@ dependencies = [ "indexmap", "pin-project", "pin-project-lite", - "rand", + "rand 0.8.5", "slab", "tokio", "tokio-util", @@ -4855,7 +5623,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09d48f71a791638519505cefafe162606f706c25592e4bde4d97600c0195312e" dependencies = [ "crossbeam-channel", - "time", + "time 0.3.21", "tracing-subscriber", ] @@ -4989,8 +5757,8 @@ dependencies = [ "http", "httparse", "log", - "rand", - "sha1", + "rand 0.8.5", + "sha1 0.10.5", "thiserror", "url", "utf-8", @@ -5053,6 +5821,16 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c" +[[package]] +name = "universal-hash" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8326b2c654932e3e4f9196e69d08fdf7cfd718e1dc6f66b347e6024a0c961402" +dependencies = [ + "generic-array", + "subtle", +] + [[package]] name = "untrusted" version = "0.7.1" @@ -5130,7 +5908,7 @@ dependencies = [ "pin-project-lite", "postgres_connection", "pq_proto", - "rand", + "rand 0.8.5", "regex", "routerify", "sentry", @@ -5158,7 +5936,7 @@ version = "1.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "345444e32442451b267fc254ae85a209c64be56d2890e601a0c37ff0c3c5ecd2" dependencies = [ - "getrandom", + "getrandom 0.2.9", "serde", ] @@ -5168,6 +5946,16 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" +[[package]] +name = "value-bag" +version = "1.0.0-alpha.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2209b78d1249f7e6f3293657c9779fe31ced465df091bbd433a1cf88e916ec55" +dependencies = [ + "ctor", + "version_check", +] + [[package]] name = "vcpkg" version = "0.2.15" @@ -5208,6 +5996,12 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" +[[package]] +name = "waker-fn" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3c4517f54858c779bbcbf228f4fca63d121bf85fbecb2dc578cdf4a39395690" + [[package]] name = "wal_craft" version = "0.1.0" @@ -5245,6 +6039,12 @@ dependencies = [ "try-lock", ] +[[package]] +name = "wasi" +version = "0.9.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" + [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" @@ -5317,6 +6117,19 @@ version = "0.2.86" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed9d5b4305409d1fc9482fee2d7f9bcbf24b3972bf59817ef757e23982242a93" +[[package]] +name = "wasm-streams" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4609d447824375f43e1ffbc051b50ad8f4b3ae8219680c94452ea05eb240ac7" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + [[package]] name = "wasm-timer" version = "0.2.5" @@ -5597,6 +6410,7 @@ dependencies = [ "futures-channel", "futures-core", "futures-executor", + "futures-io", "futures-sink", "futures-util", "hex", @@ -5610,7 +6424,7 @@ dependencies = [ "num-integer", "num-traits", "prost", - "rand", + "rand 0.8.5", "regex", "regex-syntax 0.7.2", "reqwest", @@ -5621,9 +6435,11 @@ dependencies = [ "serde_json", "smallvec", "socket2 0.4.9", + "standback", "syn 1.0.109", "syn 2.0.28", - "time", + "time 0.3.21", + "time-macros 0.2.9", "tokio", "tokio-rustls", "tokio-util", @@ -5651,7 +6467,7 @@ dependencies = [ "oid-registry", "rusticata-macros", "thiserror", - "time", + "time 0.3.21", ] [[package]] @@ -5675,7 +6491,7 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e17bb3549cc1321ae1296b9cdc2698e2b6cb1992adfa19a8c72e5b7a738f44cd" dependencies = [ - "time", + "time 0.3.21", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 2b9da977e5..4827652229 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -36,6 +36,9 @@ license = "Apache-2.0" [workspace.dependencies] anyhow = { version = "1.0", features = ["backtrace"] } async-compression = { version = "0.4.0", features = ["tokio", "gzip"] } +azure_core = "0.16.0" +azure_storage = "0.16" +azure_storage_blobs = "0.16.0" flate2 = "1.0.26" async-stream = "0.3" async-trait = "0.1" @@ -76,6 +79,7 @@ hex = "0.4" hex-literal = "0.4" hmac = "0.12.1" hostname = "0.3.1" +http-types = "2" humantime = "2.1" humantime-serde = "1.1.1" hyper = "0.14" diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 186d49fe8b..c0016ece17 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -86,7 +86,7 @@ where .stdout(process_log_file) .stderr(same_file_for_stderr) .args(args); - let filled_cmd = fill_aws_secrets_vars(fill_rust_env_vars(background_command)); + let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command)); filled_cmd.envs(envs); let pid_file_to_check = match initial_pid_file { @@ -238,11 +238,13 @@ fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command { filled_cmd } -fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command { +fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command { for env_key in [ "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_SESSION_TOKEN", + "AZURE_STORAGE_ACCOUNT", + "AZURE_STORAGE_ACCESS_KEY", ] { if let Ok(value) = std::env::var(env_key) { cmd = cmd.env(env_key, value); diff --git a/docs/pageserver-services.md b/docs/pageserver-services.md index fc259c8a5f..ba5d3c423e 100644 --- a/docs/pageserver-services.md +++ b/docs/pageserver-services.md @@ -96,6 +96,16 @@ prefix_in_bucket = '/test_prefix/' `AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` env variables can be used to specify the S3 credentials if needed. +or + +```toml +[remote_storage] +container_name = 'some-container-name' +container_region = 'us-east' +prefix_in_container = '/test-prefix/' +``` + +`AZURE_STORAGE_ACCOUNT` and `AZURE_STORAGE_ACCESS_KEY` env variables can be used to specify the azure credentials if needed. ## Repository background tasks diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index d938648750..5d3bda70af 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -13,6 +13,7 @@ aws-types.workspace = true aws-config.workspace = true aws-sdk-s3.workspace = true aws-credential-types.workspace = true +bytes.workspace = true camino.workspace = true hyper = { workspace = true, features = ["stream"] } serde.workspace = true @@ -26,6 +27,12 @@ metrics.workspace = true utils.workspace = true pin-project-lite.workspace = true workspace_hack.workspace = true +azure_core.workspace = true +azure_storage.workspace = true +azure_storage_blobs.workspace = true +futures-util.workspace = true +http-types.workspace = true +itertools.workspace = true [dev-dependencies] camino-tempfile.workspace = true diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs new file mode 100644 index 0000000000..ac2664b81a --- /dev/null +++ b/libs/remote_storage/src/azure_blob.rs @@ -0,0 +1,381 @@ +//! Azure Blob Storage wrapper + +use std::num::NonZeroU32; +use std::{borrow::Cow, collections::HashMap, io::Cursor}; + +use super::REMOTE_STORAGE_PREFIX_SEPARATOR; +use anyhow::Result; +use azure_core::request_options::{MaxResults, Metadata, Range}; +use azure_core::Header; +use azure_storage::StorageCredentials; +use azure_storage_blobs::prelude::ClientBuilder; +use azure_storage_blobs::{ + blob::operations::GetBlobBuilder, + prelude::{BlobClient, ContainerClient}, +}; +use futures_util::StreamExt; +use http_types::StatusCode; +use tokio::io::AsyncRead; +use tracing::debug; + +use crate::s3_bucket::RequestKind; +use crate::{ + AzureConfig, ConcurrencyLimiter, Download, DownloadError, RemotePath, RemoteStorage, + StorageMetadata, +}; + +pub struct AzureBlobStorage { + client: ContainerClient, + prefix_in_container: Option, + max_keys_per_list_response: Option, + concurrency_limiter: ConcurrencyLimiter, +} + +impl AzureBlobStorage { + pub fn new(azure_config: &AzureConfig) -> Result { + debug!( + "Creating azure remote storage for azure container {}", + azure_config.container_name + ); + + let account = + std::env::var("AZURE_STORAGE_ACCOUNT").expect("missing AZURE_STORAGE_ACCOUNT"); + let access_key = + std::env::var("AZURE_STORAGE_ACCESS_KEY").expect("missing AZURE_STORAGE_ACCESS_KEY"); + + let credentials = StorageCredentials::access_key(account.clone(), access_key); + + let builder = ClientBuilder::new(account, credentials); + + let client = builder.container_client(azure_config.container_name.to_owned()); + + let max_keys_per_list_response = + if let Some(limit) = azure_config.max_keys_per_list_response { + Some( + NonZeroU32::new(limit as u32) + .ok_or_else(|| anyhow::anyhow!("max_keys_per_list_response can't be 0"))?, + ) + } else { + None + }; + + Ok(AzureBlobStorage { + client, + prefix_in_container: azure_config.prefix_in_container.to_owned(), + max_keys_per_list_response, + concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()), + }) + } + + pub fn relative_path_to_name(&self, path: &RemotePath) -> String { + assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR); + let path_string = path + .get_path() + .as_str() + .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR); + match &self.prefix_in_container { + Some(prefix) => { + if prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { + prefix.clone() + path_string + } else { + format!("{prefix}{REMOTE_STORAGE_PREFIX_SEPARATOR}{path_string}") + } + } + None => path_string.to_string(), + } + } + + fn name_to_relative_path(&self, key: &str) -> RemotePath { + let relative_path = + match key.strip_prefix(self.prefix_in_container.as_deref().unwrap_or_default()) { + Some(stripped) => stripped, + // we rely on Azure to return properly prefixed paths + // for requests with a certain prefix + None => panic!( + "Key {key} does not start with container prefix {:?}", + self.prefix_in_container + ), + }; + RemotePath( + relative_path + .split(REMOTE_STORAGE_PREFIX_SEPARATOR) + .collect(), + ) + } + + async fn download_for_builder( + &self, + metadata: StorageMetadata, + builder: GetBlobBuilder, + ) -> Result { + let mut response = builder.into_stream(); + + // TODO give proper streaming response instead of buffering into RAM + // https://github.com/neondatabase/neon/issues/5563 + let mut buf = Vec::new(); + while let Some(part) = response.next().await { + let part = match part { + Ok(l) => l, + Err(e) => { + return Err(if let Some(http_err) = e.as_http_error() { + match http_err.status() { + StatusCode::NotFound => DownloadError::NotFound, + StatusCode::BadRequest => { + DownloadError::BadInput(anyhow::Error::new(e)) + } + _ => DownloadError::Other(anyhow::Error::new(e)), + } + } else { + DownloadError::Other(e.into()) + }); + } + }; + let data = part + .data + .collect() + .await + .map_err(|e| DownloadError::Other(e.into()))?; + buf.extend_from_slice(&data.slice(..)); + } + Ok(Download { + download_stream: Box::pin(Cursor::new(buf)), + metadata: Some(metadata), + }) + } + // TODO get rid of this function once we have metadata included in the response + // https://github.com/Azure/azure-sdk-for-rust/issues/1439 + async fn get_metadata( + &self, + blob_client: &BlobClient, + ) -> Result { + let builder = blob_client.get_metadata(); + + match builder.into_future().await { + Ok(r) => { + let mut map = HashMap::new(); + + for md in r.metadata.iter() { + map.insert( + md.name().as_str().to_string(), + md.value().as_str().to_string(), + ); + } + Ok(StorageMetadata(map)) + } + Err(e) => { + return Err(if let Some(http_err) = e.as_http_error() { + match http_err.status() { + StatusCode::NotFound => DownloadError::NotFound, + StatusCode::BadRequest => DownloadError::BadInput(anyhow::Error::new(e)), + _ => DownloadError::Other(anyhow::Error::new(e)), + } + } else { + DownloadError::Other(e.into()) + }); + } + } + } + + async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> { + self.concurrency_limiter + .acquire(kind) + .await + .expect("semaphore is never closed") + } +} + +fn to_azure_metadata(metadata: StorageMetadata) -> Metadata { + let mut res = Metadata::new(); + for (k, v) in metadata.0.into_iter() { + res.insert(k, v); + } + res +} + +#[async_trait::async_trait] +impl RemoteStorage for AzureBlobStorage { + async fn list_prefixes( + &self, + prefix: Option<&RemotePath>, + ) -> Result, DownloadError> { + // get the passed prefix or if it is not set use prefix_in_bucket value + let list_prefix = prefix + .map(|p| self.relative_path_to_name(p)) + .or_else(|| self.prefix_in_container.clone()) + .map(|mut p| { + // required to end with a separator + // otherwise request will return only the entry of a prefix + if !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) { + p.push(REMOTE_STORAGE_PREFIX_SEPARATOR); + } + p + }); + + let mut builder = self + .client + .list_blobs() + .delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()); + + if let Some(prefix) = list_prefix { + builder = builder.prefix(Cow::from(prefix.to_owned())); + } + + if let Some(limit) = self.max_keys_per_list_response { + builder = builder.max_results(MaxResults::new(limit)); + } + + let mut response = builder.into_stream(); + let mut res = Vec::new(); + while let Some(l) = response.next().await { + let entry = match l { + Ok(l) => l, + Err(e) => { + return Err(if let Some(http_err) = e.as_http_error() { + match http_err.status() { + StatusCode::NotFound => DownloadError::NotFound, + StatusCode::BadRequest => { + DownloadError::BadInput(anyhow::Error::new(e)) + } + _ => DownloadError::Other(anyhow::Error::new(e)), + } + } else { + DownloadError::Other(e.into()) + }); + } + }; + let name_iter = entry + .blobs + .prefixes() + .map(|prefix| self.name_to_relative_path(&prefix.name)); + res.extend(name_iter); + } + Ok(res) + } + + async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result> { + let folder_name = folder + .map(|p| self.relative_path_to_name(p)) + .or_else(|| self.prefix_in_container.clone()); + + let mut builder = self.client.list_blobs(); + + if let Some(folder_name) = folder_name { + builder = builder.prefix(Cow::from(folder_name.to_owned())); + } + + if let Some(limit) = self.max_keys_per_list_response { + builder = builder.max_results(MaxResults::new(limit)); + } + + let mut response = builder.into_stream(); + let mut res = Vec::new(); + while let Some(l) = response.next().await { + let entry = l.map_err(anyhow::Error::new)?; + let name_iter = entry + .blobs + .blobs() + .map(|bl| self.name_to_relative_path(&bl.name)); + res.extend(name_iter); + } + Ok(res) + } + + async fn upload( + &self, + mut from: impl AsyncRead + Unpin + Send + Sync + 'static, + data_size_bytes: usize, + to: &RemotePath, + metadata: Option, + ) -> anyhow::Result<()> { + let _permit = self.permit(RequestKind::Put).await; + let blob_client = self.client.blob_client(self.relative_path_to_name(to)); + + // TODO FIX THIS UGLY HACK and don't buffer the entire object + // into RAM here, but use the streaming interface. For that, + // we'd have to change the interface though... + // https://github.com/neondatabase/neon/issues/5563 + let mut buf = Vec::with_capacity(data_size_bytes); + tokio::io::copy(&mut from, &mut buf).await?; + let body = azure_core::Body::Bytes(buf.into()); + + let mut builder = blob_client.put_block_blob(body); + + if let Some(metadata) = metadata { + builder = builder.metadata(to_azure_metadata(metadata)); + } + + let _response = builder.into_future().await?; + + Ok(()) + } + + async fn download(&self, from: &RemotePath) -> Result { + let _permit = self.permit(RequestKind::Get).await; + let blob_client = self.client.blob_client(self.relative_path_to_name(from)); + + let metadata = self.get_metadata(&blob_client).await?; + + let builder = blob_client.get(); + + self.download_for_builder(metadata, builder).await + } + + async fn download_byte_range( + &self, + from: &RemotePath, + start_inclusive: u64, + end_exclusive: Option, + ) -> Result { + let _permit = self.permit(RequestKind::Get).await; + let blob_client = self.client.blob_client(self.relative_path_to_name(from)); + + let metadata = self.get_metadata(&blob_client).await?; + + let mut builder = blob_client.get(); + + if let Some(end_exclusive) = end_exclusive { + builder = builder.range(Range::new(start_inclusive, end_exclusive)); + } else { + // Open ranges are not supported by the SDK so we work around + // by setting the upper limit extremely high (but high enough + // to still be representable by signed 64 bit integers). + // TODO remove workaround once the SDK adds open range support + // https://github.com/Azure/azure-sdk-for-rust/issues/1438 + let end_exclusive = u64::MAX << 4; + builder = builder.range(Range::new(start_inclusive, end_exclusive)); + } + + self.download_for_builder(metadata, builder).await + } + + async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { + let _permit = self.permit(RequestKind::Delete).await; + let blob_client = self.client.blob_client(self.relative_path_to_name(path)); + + let builder = blob_client.delete(); + + match builder.into_future().await { + Ok(_response) => Ok(()), + Err(e) => { + if let Some(http_err) = e.as_http_error() { + if http_err.status() == StatusCode::NotFound { + return Ok(()); + } + } + Err(anyhow::Error::new(e)) + } + } + } + + async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> { + // Permit is already obtained by inner delete function + + // TODO batch requests are also not supported by the SDK + // https://github.com/Azure/azure-sdk-for-rust/issues/1068 + // https://github.com/Azure/azure-sdk-for-rust/issues/1249 + for path in paths { + self.delete(path).await?; + } + Ok(()) + } +} diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 3560c94c71..435364d83a 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -4,7 +4,10 @@ //! [`RemoteStorage`] trait a CRUD-like generic abstraction to use for adapting external storages with a few implementations: //! * [`local_fs`] allows to use local file system as an external storage //! * [`s3_bucket`] uses AWS S3 bucket as an external storage +//! * [`azure_blob`] allows to use Azure Blob storage as an external storage //! + +mod azure_blob; mod local_fs; mod s3_bucket; mod simulate_failures; @@ -21,11 +24,15 @@ use anyhow::{bail, Context}; use camino::{Utf8Path, Utf8PathBuf}; use serde::{Deserialize, Serialize}; -use tokio::io; +use tokio::{io, sync::Semaphore}; use toml_edit::Item; use tracing::info; -pub use self::{local_fs::LocalFs, s3_bucket::S3Bucket, simulate_failures::UnreliableWrapper}; +pub use self::{ + azure_blob::AzureBlobStorage, local_fs::LocalFs, s3_bucket::S3Bucket, + simulate_failures::UnreliableWrapper, +}; +use s3_bucket::RequestKind; /// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage. /// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency @@ -39,6 +46,11 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10; /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests /// pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; +/// We set this a little bit low as we currently buffer the entire file into RAM +/// +/// Here, a limit of max 20k concurrent connections was noted. +/// +pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 30; /// No limits on the client side, which currenltly means 1000 for AWS S3. /// pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option = None; @@ -217,6 +229,7 @@ impl std::error::Error for DownloadError {} pub enum GenericRemoteStorage { LocalFs(LocalFs), AwsS3(Arc), + AzureBlob(Arc), Unreliable(Arc), } @@ -228,6 +241,7 @@ impl GenericRemoteStorage { match self { Self::LocalFs(s) => s.list_files(folder).await, Self::AwsS3(s) => s.list_files(folder).await, + Self::AzureBlob(s) => s.list_files(folder).await, Self::Unreliable(s) => s.list_files(folder).await, } } @@ -242,6 +256,7 @@ impl GenericRemoteStorage { match self { Self::LocalFs(s) => s.list_prefixes(prefix).await, Self::AwsS3(s) => s.list_prefixes(prefix).await, + Self::AzureBlob(s) => s.list_prefixes(prefix).await, Self::Unreliable(s) => s.list_prefixes(prefix).await, } } @@ -256,6 +271,7 @@ impl GenericRemoteStorage { match self { Self::LocalFs(s) => s.upload(from, data_size_bytes, to, metadata).await, Self::AwsS3(s) => s.upload(from, data_size_bytes, to, metadata).await, + Self::AzureBlob(s) => s.upload(from, data_size_bytes, to, metadata).await, Self::Unreliable(s) => s.upload(from, data_size_bytes, to, metadata).await, } } @@ -264,6 +280,7 @@ impl GenericRemoteStorage { match self { Self::LocalFs(s) => s.download(from).await, Self::AwsS3(s) => s.download(from).await, + Self::AzureBlob(s) => s.download(from).await, Self::Unreliable(s) => s.download(from).await, } } @@ -283,6 +300,10 @@ impl GenericRemoteStorage { s.download_byte_range(from, start_inclusive, end_exclusive) .await } + Self::AzureBlob(s) => { + s.download_byte_range(from, start_inclusive, end_exclusive) + .await + } Self::Unreliable(s) => { s.download_byte_range(from, start_inclusive, end_exclusive) .await @@ -294,6 +315,7 @@ impl GenericRemoteStorage { match self { Self::LocalFs(s) => s.delete(path).await, Self::AwsS3(s) => s.delete(path).await, + Self::AzureBlob(s) => s.delete(path).await, Self::Unreliable(s) => s.delete(path).await, } } @@ -302,6 +324,7 @@ impl GenericRemoteStorage { match self { Self::LocalFs(s) => s.delete_objects(paths).await, Self::AwsS3(s) => s.delete_objects(paths).await, + Self::AzureBlob(s) => s.delete_objects(paths).await, Self::Unreliable(s) => s.delete_objects(paths).await, } } @@ -319,6 +342,11 @@ impl GenericRemoteStorage { s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint); Self::AwsS3(Arc::new(S3Bucket::new(s3_config)?)) } + RemoteStorageKind::AzureContainer(azure_config) => { + info!("Using azure container '{}' in region '{}' as a remote storage, prefix in container: '{:?}'", + azure_config.container_name, azure_config.container_region, azure_config.prefix_in_container); + Self::AzureBlob(Arc::new(AzureBlobStorage::new(azure_config)?)) + } }) } @@ -383,6 +411,9 @@ pub enum RemoteStorageKind { /// AWS S3 based storage, storing all files in the S3 bucket /// specified by the config AwsS3(S3Config), + /// Azure Blob based storage, storing all files in the container + /// specified by the config + AzureContainer(AzureConfig), } /// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write). @@ -422,11 +453,45 @@ impl Debug for S3Config { } } +/// Azure bucket coordinates and access credentials to manage the bucket contents (read and write). +#[derive(Clone, PartialEq, Eq)] +pub struct AzureConfig { + /// Name of the container to connect to. + pub container_name: String, + /// The region where the bucket is located at. + pub container_region: String, + /// A "subfolder" in the container, to use the same container separately by multiple remote storage users at once. + pub prefix_in_container: Option, + /// Azure has various limits on its API calls, we need not to exceed those. + /// See [`DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT`] for more details. + pub concurrency_limit: NonZeroUsize, + pub max_keys_per_list_response: Option, +} + +impl Debug for AzureConfig { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("AzureConfig") + .field("bucket_name", &self.container_name) + .field("bucket_region", &self.container_region) + .field("prefix_in_bucket", &self.prefix_in_container) + .field("concurrency_limit", &self.concurrency_limit) + .field( + "max_keys_per_list_response", + &self.max_keys_per_list_response, + ) + .finish() + } +} + impl RemoteStorageConfig { pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result> { let local_path = toml.get("local_path"); let bucket_name = toml.get("bucket_name"); let bucket_region = toml.get("bucket_region"); + let container_name = toml.get("container_name"); + let container_region = toml.get("container_region"); + + let use_azure = container_name.is_some() && container_region.is_some(); let max_concurrent_syncs = NonZeroUsize::new( parse_optional_integer("max_concurrent_syncs", toml)? @@ -440,9 +505,13 @@ impl RemoteStorageConfig { ) .context("Failed to parse 'max_sync_errors' as a positive integer")?; + let default_concurrency_limit = if use_azure { + DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT + } else { + DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT + }; let concurrency_limit = NonZeroUsize::new( - parse_optional_integer("concurrency_limit", toml)? - .unwrap_or(DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT), + parse_optional_integer("concurrency_limit", toml)?.unwrap_or(default_concurrency_limit), ) .context("Failed to parse 'concurrency_limit' as a positive integer")?; @@ -451,33 +520,70 @@ impl RemoteStorageConfig { .context("Failed to parse 'max_keys_per_list_response' as a positive integer")? .or(DEFAULT_MAX_KEYS_PER_LIST_RESPONSE); - let storage = match (local_path, bucket_name, bucket_region) { + let endpoint = toml + .get("endpoint") + .map(|endpoint| parse_toml_string("endpoint", endpoint)) + .transpose()?; + + let storage = match ( + local_path, + bucket_name, + bucket_region, + container_name, + container_region, + ) { // no 'local_path' nor 'bucket_name' options are provided, consider this remote storage disabled - (None, None, None) => return Ok(None), - (_, Some(_), None) => { + (None, None, None, None, None) => return Ok(None), + (_, Some(_), None, ..) => { bail!("'bucket_region' option is mandatory if 'bucket_name' is given ") } - (_, None, Some(_)) => { + (_, None, Some(_), ..) => { bail!("'bucket_name' option is mandatory if 'bucket_region' is given ") } - (None, Some(bucket_name), Some(bucket_region)) => RemoteStorageKind::AwsS3(S3Config { - bucket_name: parse_toml_string("bucket_name", bucket_name)?, - bucket_region: parse_toml_string("bucket_region", bucket_region)?, - prefix_in_bucket: toml - .get("prefix_in_bucket") - .map(|prefix_in_bucket| parse_toml_string("prefix_in_bucket", prefix_in_bucket)) - .transpose()?, - endpoint: toml - .get("endpoint") - .map(|endpoint| parse_toml_string("endpoint", endpoint)) - .transpose()?, - concurrency_limit, - max_keys_per_list_response, - }), - (Some(local_path), None, None) => RemoteStorageKind::LocalFs(Utf8PathBuf::from( - parse_toml_string("local_path", local_path)?, - )), - (Some(_), Some(_), _) => bail!("local_path and bucket_name are mutually exclusive"), + (None, Some(bucket_name), Some(bucket_region), ..) => { + RemoteStorageKind::AwsS3(S3Config { + bucket_name: parse_toml_string("bucket_name", bucket_name)?, + bucket_region: parse_toml_string("bucket_region", bucket_region)?, + prefix_in_bucket: toml + .get("prefix_in_bucket") + .map(|prefix_in_bucket| { + parse_toml_string("prefix_in_bucket", prefix_in_bucket) + }) + .transpose()?, + endpoint, + concurrency_limit, + max_keys_per_list_response, + }) + } + (_, _, _, Some(_), None) => { + bail!("'container_name' option is mandatory if 'container_region' is given ") + } + (_, _, _, None, Some(_)) => { + bail!("'container_name' option is mandatory if 'container_region' is given ") + } + (None, None, None, Some(container_name), Some(container_region)) => { + RemoteStorageKind::AzureContainer(AzureConfig { + container_name: parse_toml_string("container_name", container_name)?, + container_region: parse_toml_string("container_region", container_region)?, + prefix_in_container: toml + .get("prefix_in_container") + .map(|prefix_in_container| { + parse_toml_string("prefix_in_container", prefix_in_container) + }) + .transpose()?, + concurrency_limit, + max_keys_per_list_response, + }) + } + (Some(local_path), None, None, None, None) => RemoteStorageKind::LocalFs( + Utf8PathBuf::from(parse_toml_string("local_path", local_path)?), + ), + (Some(_), Some(_), ..) => { + bail!("'local_path' and 'bucket_name' are mutually exclusive") + } + (Some(_), _, _, Some(_), Some(_)) => { + bail!("local_path and 'container_name' are mutually exclusive") + } }; Ok(Some(RemoteStorageConfig { @@ -513,6 +619,46 @@ fn parse_toml_string(name: &str, item: &Item) -> anyhow::Result { Ok(s.to_string()) } +struct ConcurrencyLimiter { + // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded. + // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold. + // The helps to ensure we don't exceed the thresholds. + write: Arc, + read: Arc, +} + +impl ConcurrencyLimiter { + fn for_kind(&self, kind: RequestKind) -> &Arc { + match kind { + RequestKind::Get => &self.read, + RequestKind::Put => &self.write, + RequestKind::List => &self.read, + RequestKind::Delete => &self.write, + } + } + + async fn acquire( + &self, + kind: RequestKind, + ) -> Result, tokio::sync::AcquireError> { + self.for_kind(kind).acquire().await + } + + async fn acquire_owned( + &self, + kind: RequestKind, + ) -> Result { + Arc::clone(self.for_kind(kind)).acquire_owned().await + } + + fn new(limit: usize) -> ConcurrencyLimiter { + Self { + read: Arc::new(Semaphore::new(limit)), + write: Arc::new(Semaphore::new(limit)), + } + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index c63f24cf85..fc94281666 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -4,7 +4,7 @@ //! allowing multiple api users to independently work with the same S3 bucket, if //! their bucket prefixes are both specified and different. -use std::{borrow::Cow, sync::Arc}; +use std::borrow::Cow; use anyhow::Context; use aws_config::{ @@ -24,22 +24,20 @@ use aws_sdk_s3::{ use aws_smithy_http::body::SdkBody; use hyper::Body; use scopeguard::ScopeGuard; -use tokio::{ - io::{self, AsyncRead}, - sync::Semaphore, -}; +use tokio::io::{self, AsyncRead}; use tokio_util::io::ReaderStream; use tracing::debug; use super::StorageMetadata; use crate::{ - Download, DownloadError, RemotePath, RemoteStorage, S3Config, MAX_KEYS_PER_DELETE, - REMOTE_STORAGE_PREFIX_SEPARATOR, + ConcurrencyLimiter, Download, DownloadError, RemotePath, RemoteStorage, S3Config, + MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR, }; pub(super) mod metrics; -use self::metrics::{AttemptOutcome, RequestKind}; +use self::metrics::AttemptOutcome; +pub(super) use self::metrics::RequestKind; /// AWS S3 storage. pub struct S3Bucket { @@ -50,46 +48,6 @@ pub struct S3Bucket { concurrency_limiter: ConcurrencyLimiter, } -struct ConcurrencyLimiter { - // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded. - // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold. - // The helps to ensure we don't exceed the thresholds. - write: Arc, - read: Arc, -} - -impl ConcurrencyLimiter { - fn for_kind(&self, kind: RequestKind) -> &Arc { - match kind { - RequestKind::Get => &self.read, - RequestKind::Put => &self.write, - RequestKind::List => &self.read, - RequestKind::Delete => &self.write, - } - } - - async fn acquire( - &self, - kind: RequestKind, - ) -> Result, tokio::sync::AcquireError> { - self.for_kind(kind).acquire().await - } - - async fn acquire_owned( - &self, - kind: RequestKind, - ) -> Result { - Arc::clone(self.for_kind(kind)).acquire_owned().await - } - - fn new(limit: usize) -> ConcurrencyLimiter { - Self { - read: Arc::new(Semaphore::new(limit)), - write: Arc::new(Semaphore::new(limit)), - } - } -} - #[derive(Default)] struct GetObjectRequest { bucket: String, diff --git a/libs/remote_storage/src/s3_bucket/metrics.rs b/libs/remote_storage/src/s3_bucket/metrics.rs index 2068ca0e38..ea11edafa5 100644 --- a/libs/remote_storage/src/s3_bucket/metrics.rs +++ b/libs/remote_storage/src/s3_bucket/metrics.rs @@ -6,7 +6,7 @@ use once_cell::sync::Lazy; pub(super) static BUCKET_METRICS: Lazy = Lazy::new(Default::default); #[derive(Clone, Copy, Debug)] -pub(super) enum RequestKind { +pub(crate) enum RequestKind { Get = 0, Put = 1, Delete = 2, diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs new file mode 100644 index 0000000000..5ebbd9e95b --- /dev/null +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -0,0 +1,619 @@ +use std::collections::HashSet; +use std::env; +use std::num::{NonZeroU32, NonZeroUsize}; +use std::ops::ControlFlow; +use std::path::PathBuf; +use std::sync::Arc; +use std::time::UNIX_EPOCH; + +use anyhow::Context; +use camino::Utf8Path; +use once_cell::sync::OnceCell; +use remote_storage::{ + AzureConfig, Download, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, +}; +use test_context::{test_context, AsyncTestContext}; +use tokio::task::JoinSet; +use tracing::{debug, error, info}; + +static LOGGING_DONE: OnceCell<()> = OnceCell::new(); + +const ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_AZURE_REMOTE_STORAGE"; + +const BASE_PREFIX: &str = "test"; + +/// Tests that the Azure client can list all prefixes, even if the response comes paginated and requires multiple HTTP queries. +/// Uses real Azure and requires [`ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME`] and related Azure cred env vars specified. +/// See the client creation in [`create_azure_client`] for details on the required env vars. +/// If real Azure tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the +/// deafult test framework, see https://github.com/rust-lang/rust/issues/68007 for details. +/// +/// First, the test creates a set of Azure blobs with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_azure_data`] +/// where +/// * `random_prefix_part` is set for the entire Azure client during the Azure client creation in [`create_azure_client`], to avoid multiple test runs interference +/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket +/// +/// Then, verifies that the client does return correct prefixes when queried: +/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only +/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}` +/// +/// With the real Azure enabled and `#[cfg(test)]` Rust configuration used, the Azure client test adds a `max-keys` param to limit the response keys. +/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to Azure. +/// +/// Lastly, the test attempts to clean up and remove all uploaded Azure files. +/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished. +#[test_context(MaybeEnabledAzureWithTestBlobs)] +#[tokio::test] +async fn azure_pagination_should_work( + ctx: &mut MaybeEnabledAzureWithTestBlobs, +) -> anyhow::Result<()> { + let ctx = match ctx { + MaybeEnabledAzureWithTestBlobs::Enabled(ctx) => ctx, + MaybeEnabledAzureWithTestBlobs::Disabled => return Ok(()), + MaybeEnabledAzureWithTestBlobs::UploadsFailed(e, _) => { + anyhow::bail!("Azure init failed: {e:?}") + } + }; + + let test_client = Arc::clone(&ctx.enabled.client); + let expected_remote_prefixes = ctx.remote_prefixes.clone(); + + let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix)) + .context("common_prefix construction")?; + let root_remote_prefixes = test_client + .list_prefixes(None) + .await + .context("client list root prefixes failure")? + .into_iter() + .collect::>(); + assert_eq!( + root_remote_prefixes, HashSet::from([base_prefix.clone()]), + "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}" + ); + + let nested_remote_prefixes = test_client + .list_prefixes(Some(&base_prefix)) + .await + .context("client list nested prefixes failure")? + .into_iter() + .collect::>(); + let remote_only_prefixes = nested_remote_prefixes + .difference(&expected_remote_prefixes) + .collect::>(); + let missing_uploaded_prefixes = expected_remote_prefixes + .difference(&nested_remote_prefixes) + .collect::>(); + assert_eq!( + remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0, + "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}", + ); + + Ok(()) +} + +/// Tests that Azure client can list all files in a folder, even if the response comes paginated and requirees multiple Azure queries. +/// Uses real Azure and requires [`ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME`] and related Azure cred env vars specified. Test will skip real code and pass if env vars not set. +/// See `Azure_pagination_should_work` for more information. +/// +/// First, create a set of Azure objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_azure_data`] +/// Then performs the following queries: +/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt` +/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt` +#[test_context(MaybeEnabledAzureWithSimpleTestBlobs)] +#[tokio::test] +async fn azure_list_files_works( + ctx: &mut MaybeEnabledAzureWithSimpleTestBlobs, +) -> anyhow::Result<()> { + let ctx = match ctx { + MaybeEnabledAzureWithSimpleTestBlobs::Enabled(ctx) => ctx, + MaybeEnabledAzureWithSimpleTestBlobs::Disabled => return Ok(()), + MaybeEnabledAzureWithSimpleTestBlobs::UploadsFailed(e, _) => { + anyhow::bail!("Azure init failed: {e:?}") + } + }; + let test_client = Arc::clone(&ctx.enabled.client); + let base_prefix = + RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?; + let root_files = test_client + .list_files(None) + .await + .context("client list root files failure")? + .into_iter() + .collect::>(); + assert_eq!( + root_files, + ctx.remote_blobs.clone(), + "remote storage list_files on root mismatches with the uploads." + ); + let nested_remote_files = test_client + .list_files(Some(&base_prefix)) + .await + .context("client list nested files failure")? + .into_iter() + .collect::>(); + let trim_remote_blobs: HashSet<_> = ctx + .remote_blobs + .iter() + .map(|x| x.get_path()) + .filter(|x| x.starts_with("folder1")) + .map(|x| RemotePath::new(x).expect("must be valid path")) + .collect(); + assert_eq!( + nested_remote_files, trim_remote_blobs, + "remote storage list_files on subdirrectory mismatches with the uploads." + ); + Ok(()) +} + +#[test_context(MaybeEnabledAzure)] +#[tokio::test] +async fn azure_delete_non_exising_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> { + let ctx = match ctx { + MaybeEnabledAzure::Enabled(ctx) => ctx, + MaybeEnabledAzure::Disabled => return Ok(()), + }; + + let path = RemotePath::new(Utf8Path::new( + format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(), + )) + .with_context(|| "RemotePath conversion")?; + + ctx.client.delete(&path).await.expect("should succeed"); + + Ok(()) +} + +#[test_context(MaybeEnabledAzure)] +#[tokio::test] +async fn azure_delete_objects_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> { + let ctx = match ctx { + MaybeEnabledAzure::Enabled(ctx) => ctx, + MaybeEnabledAzure::Disabled => return Ok(()), + }; + + let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str())) + .with_context(|| "RemotePath conversion")?; + + let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str())) + .with_context(|| "RemotePath conversion")?; + + let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str())) + .with_context(|| "RemotePath conversion")?; + + let data1 = "remote blob data1".as_bytes(); + let data1_len = data1.len(); + let data2 = "remote blob data2".as_bytes(); + let data2_len = data2.len(); + let data3 = "remote blob data3".as_bytes(); + let data3_len = data3.len(); + ctx.client + .upload(std::io::Cursor::new(data1), data1_len, &path1, None) + .await?; + + ctx.client + .upload(std::io::Cursor::new(data2), data2_len, &path2, None) + .await?; + + ctx.client + .upload(std::io::Cursor::new(data3), data3_len, &path3, None) + .await?; + + ctx.client.delete_objects(&[path1, path2]).await?; + + let prefixes = ctx.client.list_prefixes(None).await?; + + assert_eq!(prefixes.len(), 1); + + ctx.client.delete_objects(&[path3]).await?; + + Ok(()) +} + +#[test_context(MaybeEnabledAzure)] +#[tokio::test] +async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> { + let MaybeEnabledAzure::Enabled(ctx) = ctx else { + return Ok(()); + }; + + let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str())) + .with_context(|| "RemotePath conversion")?; + + let data = "remote blob data here".as_bytes(); + let data_len = data.len() as u64; + + ctx.client + .upload(std::io::Cursor::new(data), data.len(), &path, None) + .await?; + + async fn download_and_compare(mut dl: Download) -> anyhow::Result> { + let mut buf = Vec::new(); + tokio::io::copy(&mut dl.download_stream, &mut buf).await?; + Ok(buf) + } + // Normal download request + let dl = ctx.client.download(&path).await?; + let buf = download_and_compare(dl).await?; + assert_eq!(buf, data); + + // Full range (end specified) + let dl = ctx + .client + .download_byte_range(&path, 0, Some(data_len)) + .await?; + let buf = download_and_compare(dl).await?; + assert_eq!(buf, data); + + // partial range (end specified) + let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?; + let buf = download_and_compare(dl).await?; + assert_eq!(buf, data[4..10]); + + // partial range (end beyond real end) + let dl = ctx + .client + .download_byte_range(&path, 8, Some(data_len * 100)) + .await?; + let buf = download_and_compare(dl).await?; + assert_eq!(buf, data[8..]); + + // Partial range (end unspecified) + let dl = ctx.client.download_byte_range(&path, 4, None).await?; + let buf = download_and_compare(dl).await?; + assert_eq!(buf, data[4..]); + + // Full range (end unspecified) + let dl = ctx.client.download_byte_range(&path, 0, None).await?; + let buf = download_and_compare(dl).await?; + assert_eq!(buf, data); + + Ok(()) +} + +fn ensure_logging_ready() { + LOGGING_DONE.get_or_init(|| { + utils::logging::init( + utils::logging::LogFormat::Test, + utils::logging::TracingErrorLayerEnablement::Disabled, + ) + .expect("logging init failed"); + }); +} + +struct EnabledAzure { + client: Arc, + base_prefix: &'static str, +} + +impl EnabledAzure { + async fn setup(max_keys_in_list_response: Option) -> Self { + let client = create_azure_client(max_keys_in_list_response) + .context("Azure client creation") + .expect("Azure client creation failed"); + + EnabledAzure { + client, + base_prefix: BASE_PREFIX, + } + } +} + +enum MaybeEnabledAzure { + Enabled(EnabledAzure), + Disabled, +} + +#[async_trait::async_trait] +impl AsyncTestContext for MaybeEnabledAzure { + async fn setup() -> Self { + ensure_logging_ready(); + + if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() { + info!( + "`{}` env variable is not set, skipping the test", + ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME + ); + return Self::Disabled; + } + + Self::Enabled(EnabledAzure::setup(None).await) + } +} + +enum MaybeEnabledAzureWithTestBlobs { + Enabled(AzureWithTestBlobs), + Disabled, + UploadsFailed(anyhow::Error, AzureWithTestBlobs), +} + +struct AzureWithTestBlobs { + enabled: EnabledAzure, + remote_prefixes: HashSet, + remote_blobs: HashSet, +} + +#[async_trait::async_trait] +impl AsyncTestContext for MaybeEnabledAzureWithTestBlobs { + async fn setup() -> Self { + ensure_logging_ready(); + if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() { + info!( + "`{}` env variable is not set, skipping the test", + ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME + ); + return Self::Disabled; + } + + let max_keys_in_list_response = 10; + let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap()); + + let enabled = EnabledAzure::setup(Some(max_keys_in_list_response)).await; + + match upload_azure_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await { + ControlFlow::Continue(uploads) => { + info!("Remote objects created successfully"); + + Self::Enabled(AzureWithTestBlobs { + enabled, + remote_prefixes: uploads.prefixes, + remote_blobs: uploads.blobs, + }) + } + ControlFlow::Break(uploads) => Self::UploadsFailed( + anyhow::anyhow!("One or multiple blobs failed to upload to Azure"), + AzureWithTestBlobs { + enabled, + remote_prefixes: uploads.prefixes, + remote_blobs: uploads.blobs, + }, + ), + } + } + + async fn teardown(self) { + match self { + Self::Disabled => {} + Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => { + cleanup(&ctx.enabled.client, ctx.remote_blobs).await; + } + } + } +} + +// NOTE: the setups for the list_prefixes test and the list_files test are very similar +// However, they are not idential. The list_prefixes function is concerned with listing prefixes, +// whereas the list_files function is concerned with listing files. +// See `RemoteStorage::list_files` documentation for more details +enum MaybeEnabledAzureWithSimpleTestBlobs { + Enabled(AzureWithSimpleTestBlobs), + Disabled, + UploadsFailed(anyhow::Error, AzureWithSimpleTestBlobs), +} +struct AzureWithSimpleTestBlobs { + enabled: EnabledAzure, + remote_blobs: HashSet, +} + +#[async_trait::async_trait] +impl AsyncTestContext for MaybeEnabledAzureWithSimpleTestBlobs { + async fn setup() -> Self { + ensure_logging_ready(); + if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() { + info!( + "`{}` env variable is not set, skipping the test", + ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME + ); + return Self::Disabled; + } + + let max_keys_in_list_response = 10; + let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap()); + + let enabled = EnabledAzure::setup(Some(max_keys_in_list_response)).await; + + match upload_simple_azure_data(&enabled.client, upload_tasks_count).await { + ControlFlow::Continue(uploads) => { + info!("Remote objects created successfully"); + + Self::Enabled(AzureWithSimpleTestBlobs { + enabled, + remote_blobs: uploads, + }) + } + ControlFlow::Break(uploads) => Self::UploadsFailed( + anyhow::anyhow!("One or multiple blobs failed to upload to Azure"), + AzureWithSimpleTestBlobs { + enabled, + remote_blobs: uploads, + }, + ), + } + } + + async fn teardown(self) { + match self { + Self::Disabled => {} + Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => { + cleanup(&ctx.enabled.client, ctx.remote_blobs).await; + } + } + } +} + +fn create_azure_client( + max_keys_per_list_response: Option, +) -> anyhow::Result> { + use rand::Rng; + + let remote_storage_azure_container = env::var("REMOTE_STORAGE_AZURE_CONTAINER").context( + "`REMOTE_STORAGE_AZURE_CONTAINER` env var is not set, but real Azure tests are enabled", + )?; + let remote_storage_azure_region = env::var("REMOTE_STORAGE_AZURE_REGION").context( + "`REMOTE_STORAGE_AZURE_REGION` env var is not set, but real Azure tests are enabled", + )?; + + // due to how time works, we've had test runners use the same nanos as bucket prefixes. + // millis is just a debugging aid for easier finding the prefix later. + let millis = std::time::SystemTime::now() + .duration_since(UNIX_EPOCH) + .context("random Azure test prefix part calculation")? + .as_millis(); + + // because nanos can be the same for two threads so can millis, add randomness + let random = rand::thread_rng().gen::(); + + let remote_storage_config = RemoteStorageConfig { + max_concurrent_syncs: NonZeroUsize::new(100).unwrap(), + max_sync_errors: NonZeroU32::new(5).unwrap(), + storage: RemoteStorageKind::AzureContainer(AzureConfig { + container_name: remote_storage_azure_container, + container_region: remote_storage_azure_region, + prefix_in_container: Some(format!("test_{millis}_{random:08x}/")), + concurrency_limit: NonZeroUsize::new(100).unwrap(), + max_keys_per_list_response, + }), + }; + Ok(Arc::new( + GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?, + )) +} + +struct Uploads { + prefixes: HashSet, + blobs: HashSet, +} + +async fn upload_azure_data( + client: &Arc, + base_prefix_str: &'static str, + upload_tasks_count: usize, +) -> ControlFlow { + info!("Creating {upload_tasks_count} Azure files"); + let mut upload_tasks = JoinSet::new(); + for i in 1..upload_tasks_count + 1 { + let task_client = Arc::clone(client); + upload_tasks.spawn(async move { + let prefix = format!("{base_prefix_str}/sub_prefix_{i}/"); + let blob_prefix = RemotePath::new(Utf8Path::new(&prefix)) + .with_context(|| format!("{prefix:?} to RemotePath conversion"))?; + let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}"))); + debug!("Creating remote item {i} at path {blob_path:?}"); + + let data = format!("remote blob data {i}").into_bytes(); + let data_len = data.len(); + task_client + .upload(std::io::Cursor::new(data), data_len, &blob_path, None) + .await?; + + Ok::<_, anyhow::Error>((blob_prefix, blob_path)) + }); + } + + let mut upload_tasks_failed = false; + let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count); + let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count); + while let Some(task_run_result) = upload_tasks.join_next().await { + match task_run_result + .context("task join failed") + .and_then(|task_result| task_result.context("upload task failed")) + { + Ok((upload_prefix, upload_path)) => { + uploaded_prefixes.insert(upload_prefix); + uploaded_blobs.insert(upload_path); + } + Err(e) => { + error!("Upload task failed: {e:?}"); + upload_tasks_failed = true; + } + } + } + + let uploads = Uploads { + prefixes: uploaded_prefixes, + blobs: uploaded_blobs, + }; + if upload_tasks_failed { + ControlFlow::Break(uploads) + } else { + ControlFlow::Continue(uploads) + } +} + +async fn cleanup(client: &Arc, objects_to_delete: HashSet) { + info!( + "Removing {} objects from the remote storage during cleanup", + objects_to_delete.len() + ); + let mut delete_tasks = JoinSet::new(); + for object_to_delete in objects_to_delete { + let task_client = Arc::clone(client); + delete_tasks.spawn(async move { + debug!("Deleting remote item at path {object_to_delete:?}"); + task_client + .delete(&object_to_delete) + .await + .with_context(|| format!("{object_to_delete:?} removal")) + }); + } + + while let Some(task_run_result) = delete_tasks.join_next().await { + match task_run_result { + Ok(task_result) => match task_result { + Ok(()) => {} + Err(e) => error!("Delete task failed: {e:?}"), + }, + Err(join_err) => error!("Delete task did not finish correctly: {join_err}"), + } + } +} + +// Uploads files `folder{j}/blob{i}.txt`. See test description for more details. +async fn upload_simple_azure_data( + client: &Arc, + upload_tasks_count: usize, +) -> ControlFlow, HashSet> { + info!("Creating {upload_tasks_count} Azure files"); + let mut upload_tasks = JoinSet::new(); + for i in 1..upload_tasks_count + 1 { + let task_client = Arc::clone(client); + upload_tasks.spawn(async move { + let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i)); + let blob_path = RemotePath::new( + Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"), + ) + .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?; + debug!("Creating remote item {i} at path {blob_path:?}"); + + let data = format!("remote blob data {i}").into_bytes(); + let data_len = data.len(); + task_client + .upload(std::io::Cursor::new(data), data_len, &blob_path, None) + .await?; + + Ok::<_, anyhow::Error>(blob_path) + }); + } + + let mut upload_tasks_failed = false; + let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count); + while let Some(task_run_result) = upload_tasks.join_next().await { + match task_run_result + .context("task join failed") + .and_then(|task_result| task_result.context("upload task failed")) + { + Ok(upload_path) => { + uploaded_blobs.insert(upload_path); + } + Err(e) => { + error!("Upload task failed: {e:?}"); + upload_tasks_failed = true; + } + } + } + + if upload_tasks_failed { + ControlFlow::Break(uploaded_blobs) + } else { + ControlFlow::Continue(uploaded_blobs) + } +} diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index b2303869f2..4b250822e5 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -31,13 +31,14 @@ futures = { version = "0.3" } futures-channel = { version = "0.3", features = ["sink"] } futures-core = { version = "0.3" } futures-executor = { version = "0.3" } +futures-io = { version = "0.3" } futures-sink = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } hex = { version = "0.4", features = ["serde"] } hyper = { version = "0.14", features = ["full"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits"] } -log = { version = "0.4", default-features = false, features = ["std"] } +log = { version = "0.4", default-features = false, features = ["kv_unstable", "std"] } memchr = { version = "2" } nom = { version = "7" } num-bigint = { version = "0.4" } @@ -47,7 +48,7 @@ prost = { version = "0.11" } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } regex-syntax = { version = "0.7" } -reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "multipart", "rustls-tls"] } +reqwest = { version = "0.11", default-features = false, features = ["blocking", "default-tls", "json", "multipart", "rustls-tls", "stream"] } ring = { version = "0.16", features = ["std"] } rustls = { version = "0.21", features = ["dangerous_configuration"] } scopeguard = { version = "1" } @@ -55,7 +56,8 @@ serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["raw_value"] } smallvec = { version = "1", default-features = false, features = ["write"] } socket2 = { version = "0.4", default-features = false, features = ["all"] } -time = { version = "0.3", features = ["formatting", "macros", "parsing"] } +standback = { version = "0.2", default-features = false, features = ["std"] } +time = { version = "0.3", features = ["macros", "serde-well-known"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } tokio-rustls = { version = "0.24" } tokio-util = { version = "0.7", features = ["codec", "io"] } @@ -75,14 +77,16 @@ cc = { version = "1", default-features = false, features = ["parallel"] } either = { version = "1" } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits"] } -log = { version = "0.4", default-features = false, features = ["std"] } +log = { version = "0.4", default-features = false, features = ["kv_unstable", "std"] } memchr = { version = "2" } nom = { version = "7" } prost = { version = "0.11" } regex = { version = "1" } regex-syntax = { version = "0.7" } serde = { version = "1", features = ["alloc", "derive"] } +standback = { version = "0.2", default-features = false, features = ["std"] } syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit"] } syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "full", "visit", "visit-mut"] } +time-macros = { version = "0.2", default-features = false, features = ["formatting", "parsing", "serde"] } ### END HAKARI SECTION From ded7f4856521dd0ab94d672e89bf9a583178dfe5 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 16 Oct 2023 18:21:37 +0100 Subject: [PATCH 15/18] pageserver: measure startup duration spent fetching remote indices (#5564) ## Problem Currently it's unclear how much of the `initial_tenant_load` period is in S3 objects, and therefore how impactful it is to make changes to remote operations during startup. ## Summary of changes - `Tenant::load` is refactored to load remote indices in parallel and to wait for all these remote downloads to finish before it proceeds to construct any `Timeline` objects. - `pageserver_startup_duration_seconds` gets a new `phase` value of `initial_tenant_load_remote` which counts the time from startup to when the last tenant finishes loading remote content. - `test_pageserver_restart` is extended to validate this phase. The previous version of the test was relying on order of dict entries, which stopped working when adding a phase, so this is refactored a bit. - `test_pageserver_restart` used to explicitly create a branch, now it uses the default initial_timeline. This avoids startup getting held up waiting for logical sizes, when one of the branches is not in use. --- pageserver/src/bin/pageserver.rs | 7 +- pageserver/src/lib.rs | 3 + pageserver/src/tenant.rs | 234 +++++++++++++----- pageserver/src/tenant/delete.rs | 2 +- .../regress/test_pageserver_restart.py | 55 ++-- 5 files changed, 219 insertions(+), 82 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 7e2b376212..827f74fcce 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -355,6 +355,7 @@ fn start_pageserver( // consumer side) will be dropped once we can start the background jobs. Currently it is behind // completing all initial logical size calculations (init_logical_size_done_rx) and a timeout // (background_task_maximum_delay). + let (init_remote_done_tx, init_remote_done_rx) = utils::completion::channel(); let (init_done_tx, init_done_rx) = utils::completion::channel(); let (init_logical_size_done_tx, init_logical_size_done_rx) = utils::completion::channel(); @@ -362,7 +363,8 @@ fn start_pageserver( let (background_jobs_can_start, background_jobs_barrier) = utils::completion::channel(); let order = pageserver::InitializationOrder { - initial_tenant_load: Some(init_done_tx), + initial_tenant_load_remote: Some(init_done_tx), + initial_tenant_load: Some(init_remote_done_tx), initial_logical_size_can_start: init_done_rx.clone(), initial_logical_size_attempt: Some(init_logical_size_done_tx), background_jobs_can_start: background_jobs_barrier.clone(), @@ -388,6 +390,9 @@ fn start_pageserver( // NOTE: unlike many futures in pageserver, this one is cancellation-safe let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed")); + init_remote_done_rx.wait().await; + startup_checkpoint("initial_tenant_load_remote", "Remote part of initial load completed"); + init_done_rx.wait().await; startup_checkpoint("initial_tenant_load", "Initial load completed"); STARTUP_IS_LOADING.set(0); diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 8199cd38e6..32d43053bd 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -173,6 +173,9 @@ fn is_walkdir_io_not_found(e: &walkdir::Error) -> bool { /// delaying is needed. #[derive(Clone)] pub struct InitializationOrder { + /// Each initial tenant load task carries this until it is done loading timelines from remote storage + pub initial_tenant_load_remote: Option, + /// Each initial tenant load task carries this until completion. pub initial_tenant_load: Option, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 8f7710ffcf..88b2423eec 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -23,12 +23,14 @@ use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::*; use utils::completion; +use utils::completion::Completion; use utils::crashsafe::path_with_suffix_extension; use std::cmp::min; use std::collections::hash_map::Entry; use std::collections::BTreeSet; use std::collections::HashMap; +use std::collections::HashSet; use std::fmt::Debug; use std::fmt::Display; use std::fs; @@ -185,6 +187,11 @@ impl AttachedTenantConf { } } } +struct TimelinePreload { + timeline_id: TimelineId, + client: RemoteTimelineClient, + index_part: Result, +} /// /// Tenant consists of multiple timelines. Keep them in a hash table. @@ -962,6 +969,9 @@ impl Tenant { let _completion = init_order .as_mut() .and_then(|x| x.initial_tenant_load.take()); + let remote_load_completion = init_order + .as_mut() + .and_then(|x| x.initial_tenant_load_remote.take()); // Dont block pageserver startup on figuring out deletion status let pending_deletion = { @@ -986,6 +996,7 @@ impl Tenant { // as we are no longer loading, signal completion by dropping // the completion while we resume deletion drop(_completion); + drop(remote_load_completion); // do not hold to initial_logical_size_attempt as it will prevent loading from proceeding without timeout let _ = init_order .as_mut() @@ -1011,7 +1022,10 @@ impl Tenant { let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start); - match tenant_clone.load(init_order.as_ref(), &ctx).await { + match tenant_clone + .load(init_order.as_ref(), remote_load_completion, &ctx) + .await + { Ok(()) => { debug!("load finished"); @@ -1175,6 +1189,52 @@ impl Tenant { }) } + async fn load_timeline_metadata( + self: &Arc, + timeline_ids: HashSet, + remote_storage: &GenericRemoteStorage, + ) -> anyhow::Result> { + let mut part_downloads = JoinSet::new(); + for timeline_id in timeline_ids { + let client = RemoteTimelineClient::new( + remote_storage.clone(), + self.deletion_queue_client.clone(), + self.conf, + self.tenant_id, + timeline_id, + self.generation, + ); + part_downloads.spawn( + async move { + debug!("starting index part download"); + + let index_part = client.download_index_file().await; + + debug!("finished index part download"); + + Result::<_, anyhow::Error>::Ok(TimelinePreload { + client, + timeline_id, + index_part, + }) + } + .map(move |res| { + res.with_context(|| format!("download index part for timeline {timeline_id}")) + }) + .instrument(info_span!("download_index_part", %timeline_id)), + ); + } + + let mut timeline_preloads: HashMap = HashMap::new(); + while let Some(result) = part_downloads.join_next().await { + let preload_result = result.context("join preload task")?; + let preload = preload_result?; + timeline_preloads.insert(preload.timeline_id, preload); + } + + Ok(timeline_preloads) + } + /// /// Background task to load in-memory data structures for this tenant, from /// files on disk. Used at pageserver startup. @@ -1183,14 +1243,13 @@ impl Tenant { async fn load( self: &Arc, init_order: Option<&InitializationOrder>, + remote_completion: Option, ctx: &RequestContext, ) -> anyhow::Result<()> { span::debug_assert_current_span_has_tenant_id(); debug!("loading tenant task"); - crate::failpoint_support::sleep_millis_async!("before-loading-tenant"); - // Load in-memory state to reflect the local files on disk // // Scan the directory, peek into the metadata file of each timeline, and @@ -1209,10 +1268,38 @@ impl Tenant { // FIXME original collect_timeline_files contained one more check: // 1. "Timeline has no ancestor and no layer files" + // Load remote content for timelines in this tenant + let all_timeline_ids = scan + .sorted_timelines_to_load + .iter() + .map(|i| i.0) + .chain(scan.timelines_to_resume_deletion.iter().map(|i| i.0)) + .collect(); + let mut preload = if let Some(remote_storage) = &self.remote_storage { + Some( + self.load_timeline_metadata(all_timeline_ids, remote_storage) + .await?, + ) + } else { + None + }; + + drop(remote_completion); + + crate::failpoint_support::sleep_millis_async!("before-loading-tenant"); + // Process loadable timelines first for (timeline_id, local_metadata) in scan.sorted_timelines_to_load { + let timeline_preload = preload.as_mut().map(|p| p.remove(&timeline_id).unwrap()); if let Err(e) = self - .load_local_timeline(timeline_id, local_metadata, init_order, ctx, false) + .load_local_timeline( + timeline_id, + local_metadata, + timeline_preload, + init_order, + ctx, + false, + ) .await { match e { @@ -1245,8 +1332,17 @@ impl Tenant { } } Some(local_metadata) => { + let timeline_preload = + preload.as_mut().map(|p| p.remove(&timeline_id).unwrap()); if let Err(e) = self - .load_local_timeline(timeline_id, local_metadata, init_order, ctx, true) + .load_local_timeline( + timeline_id, + local_metadata, + timeline_preload, + init_order, + ctx, + true, + ) .await { match e { @@ -1274,11 +1370,12 @@ impl Tenant { /// Subroutine of `load_tenant`, to load an individual timeline /// /// NB: The parent is assumed to be already loaded! - #[instrument(skip(self, local_metadata, init_order, ctx))] + #[instrument(skip(self, local_metadata, init_order, preload, ctx))] async fn load_local_timeline( self: &Arc, timeline_id: TimelineId, local_metadata: TimelineMetadata, + preload: Option, init_order: Option<&InitializationOrder>, ctx: &RequestContext, found_delete_mark: bool, @@ -1287,74 +1384,81 @@ impl Tenant { let mut resources = self.build_timeline_resources(timeline_id); - let (remote_startup_data, remote_client) = match resources.remote_client { - Some(remote_client) => match remote_client.download_index_file().await { - Ok(index_part) => { - let index_part = match index_part { - MaybeDeletedIndexPart::IndexPart(index_part) => index_part, - MaybeDeletedIndexPart::Deleted(index_part) => { - // TODO: we won't reach here if remote storage gets de-configured after start of the deletion operation. - // Example: - // start deletion operation - // finishes upload of index part - // pageserver crashes - // remote storage gets de-configured - // pageserver starts - // - // We don't really anticipate remote storage to be de-configured, so, for now, this is fine. - // Also, maybe we'll remove that option entirely in the future, see https://github.com/neondatabase/neon/issues/4099. - info!("is_deleted is set on remote, resuming removal of timeline data originally done by timeline deletion handler"); + let (remote_startup_data, remote_client) = match preload { + Some(preload) => { + let TimelinePreload { + index_part, + client: remote_client, + timeline_id: _timeline_id, + } = preload; + match index_part { + Ok(index_part) => { + let index_part = match index_part { + MaybeDeletedIndexPart::IndexPart(index_part) => index_part, + MaybeDeletedIndexPart::Deleted(index_part) => { + // TODO: we won't reach here if remote storage gets de-configured after start of the deletion operation. + // Example: + // start deletion operation + // finishes upload of index part + // pageserver crashes + // remote storage gets de-configured + // pageserver starts + // + // We don't really anticipate remote storage to be de-configured, so, for now, this is fine. + // Also, maybe we'll remove that option entirely in the future, see https://github.com/neondatabase/neon/issues/4099. + info!("is_deleted is set on remote, resuming removal of timeline data originally done by timeline deletion handler"); - remote_client - .init_upload_queue_stopped_to_continue_deletion(&index_part) - .context("init queue stopped") + remote_client + .init_upload_queue_stopped_to_continue_deletion(&index_part) + .context("init queue stopped") + .map_err(LoadLocalTimelineError::ResumeDeletion)?; + + DeleteTimelineFlow::resume_deletion( + Arc::clone(self), + timeline_id, + &local_metadata, + Some(remote_client), + self.deletion_queue_client.clone(), + init_order, + ) + .await + .context("resume deletion") .map_err(LoadLocalTimelineError::ResumeDeletion)?; - DeleteTimelineFlow::resume_deletion( - Arc::clone(self), + return Ok(()); + } + }; + + let remote_metadata = index_part.metadata.clone(); + ( + Some(RemoteStartupData { + index_part, + remote_metadata, + }), + Some(remote_client), + ) + } + Err(DownloadError::NotFound) => { + info!("no index file was found on the remote, found_delete_mark: {found_delete_mark}"); + + if found_delete_mark { + // We could've resumed at a point where remote index was deleted, but metadata file wasnt. + // Cleanup: + return DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces( + self, timeline_id, - &local_metadata, - Some(remote_client), - self.deletion_queue_client.clone(), - init_order, ) .await - .context("resume deletion") - .map_err(LoadLocalTimelineError::ResumeDeletion)?; - - return Ok(()); + .context("cleanup_remaining_timeline_fs_traces") + .map_err(LoadLocalTimelineError::ResumeDeletion); } - }; - let remote_metadata = index_part.metadata.clone(); - ( - Some(RemoteStartupData { - index_part, - remote_metadata, - }), - Some(remote_client), - ) - } - Err(DownloadError::NotFound) => { - info!("no index file was found on the remote, found_delete_mark: {found_delete_mark}"); - - if found_delete_mark { - // We could've resumed at a point where remote index was deleted, but metadata file wasnt. - // Cleanup: - return DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces( - self, - timeline_id, - ) - .await - .context("cleanup_remaining_timeline_fs_traces") - .map_err(LoadLocalTimelineError::ResumeDeletion); + // We're loading fresh timeline that didnt yet make it into remote. + (None, Some(remote_client)) } - - // We're loading fresh timeline that didnt yet make it into remote. - (None, Some(remote_client)) + Err(e) => return Err(LoadLocalTimelineError::Load(anyhow::Error::new(e))), } - Err(e) => return Err(LoadLocalTimelineError::Load(anyhow::Error::new(e))), - }, + } None => { // No remote client if found_delete_mark { @@ -3594,7 +3698,7 @@ pub mod harness { self.deletion_queue.new_client(), )); tenant - .load(None, ctx) + .load(None, None, ctx) .instrument(info_span!("try_load", tenant_id=%self.tenant_id)) .await?; diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index 3cb58aec6b..0db6213bbf 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -432,7 +432,7 @@ impl DeleteTenantFlow { // Tenant may not be loadable if we fail late in cleanup_remaining_fs_traces (e g remove timelines dir) let timelines_path = tenant.conf.timelines_path(&tenant.tenant_id); if timelines_path.exists() { - tenant.load(init_order, ctx).await.context("load")?; + tenant.load(init_order, None, ctx).await.context("load")?; } Self::background( diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index f7dc80a6d8..b49a1a40dd 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -4,6 +4,7 @@ import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.remote_storage import s3_storage +from fixtures.utils import wait_until # Test restarting page server, while safekeeper and compute node keep @@ -16,8 +17,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder, generations: bool) env = neon_env_builder.init_start() - env.neon_cli.create_branch("test_pageserver_restart") - endpoint = env.endpoints.create_start("test_pageserver_restart") + endpoint = env.endpoints.create_start("main") pageserver_http = env.pageserver.http_client() pg_conn = endpoint.connect() @@ -75,27 +75,52 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder, generations: bool) cur.execute("SELECT count(*) FROM foo") assert cur.fetchone() == (100000,) - # Validate startup time metrics - metrics = pageserver_http.get_metrics() + # Wait for metrics to indicate startup complete, so that we can know all + # startup phases will be reflected in the subsequent checks + def assert_complete(): + for sample in pageserver_http.get_metrics().query_all( + "pageserver_startup_duration_seconds" + ): + labels = dict(sample.labels) + log.info(f"metric {labels['phase']}={sample.value}") + if labels["phase"] == "complete" and sample.value > 0: + return + + raise AssertionError("No 'complete' metric yet") + + wait_until(30, 1.0, assert_complete) # Expectation callbacks: arg t is sample value, arg p is the previous phase's sample value - expectations = { - "initial": lambda t, p: True, # make no assumptions about the initial time point, it could be 0 in theory + expectations = [ + ( + "initial", + lambda t, p: True, + ), # make no assumptions about the initial time point, it could be 0 in theory + # Remote phase of initial_tenant_load should happen before overall phase is complete + ("initial_tenant_load_remote", lambda t, p: t >= 0.0 and t >= p), # Initial tenant load should reflect the delay we injected - "initial_tenant_load": lambda t, p: t >= (tenant_load_delay_ms / 1000.0) and t >= p, + ("initial_tenant_load", lambda t, p: t >= (tenant_load_delay_ms / 1000.0) and t >= p), # Subsequent steps should occur in expected order - "initial_logical_sizes": lambda t, p: t > 0 and t >= p, - "background_jobs_can_start": lambda t, p: t > 0 and t >= p, - "complete": lambda t, p: t > 0 and t >= p, - } + ("initial_logical_sizes", lambda t, p: t > 0 and t >= p), + ("background_jobs_can_start", lambda t, p: t > 0 and t >= p), + ("complete", lambda t, p: t > 0 and t >= p), + ] + # Accumulate the runtime of each startup phase + values = {} + metrics = pageserver_http.get_metrics() prev_value = None for sample in metrics.query_all("pageserver_startup_duration_seconds"): - labels = dict(sample.labels) - phase = labels["phase"] + phase = sample.labels["phase"] log.info(f"metric {phase}={sample.value}") - assert phase in expectations, f"Unexpected phase {phase}" - assert expectations[phase]( + assert phase in [e[0] for e in expectations], f"Unexpected phase {phase}" + values[phase] = sample + + # Apply expectations to the metrics retrieved + for phase, expectation in expectations: + assert phase in values, f"No data for phase {phase}" + sample = values[phase] + assert expectation( sample.value, prev_value ), f"Unexpected value for {phase}: {sample.value}" prev_value = sample.value From 0ca342260c734055e6e5b47c5edf954f424a8279 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Mon, 16 Oct 2023 20:46:45 +0200 Subject: [PATCH 16/18] [compute_ctl+pgxn] Handle invalid databases after failed drop (#5561) ## Problem In 89275f6c1e7b we fixed an issue, when we were dropping db in Postgres even though cplane request failed. Yet, it introduced a new problem that we now de-register db in cplane even if we didn't actually drop it in Postgres. ## Summary of changes Here we revert extension change, so we now again may leave db in invalid state after failed drop. Instead, `compute_ctl` is now responsible for cleaning up invalid databases during full configuration. Thus, there are two ways of recovering from failed DROP DATABASE: 1. User can just repeat DROP DATABASE, same as in Vanilla Postgres. 2. If they didn't, then on next full configuration (dbs / roles changes in the API; password reset; or data availability check) invalid db will be cleaned up in the Postgres and re-created by `compute_ctl`. So again it follows pretty much the same semantics as Vanilla Postgres -- you need to drop it again after failed drop. That way, we have a recovery trajectory for both problems. See this commit for info about `invalid` db state: https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9 According to it: > An invalid database cannot be connected to anymore, but can still be dropped. While on it, this commit also fixes another issue, when `compute_ctl` was trying to connect to databases with `ALLOW CONNECTIONS false`. Now it will just skip them. Fixes #5435 --- compute_tools/src/compute.rs | 6 +- compute_tools/src/pg_helpers.rs | 28 +++++-- compute_tools/src/spec.rs | 77 ++++++++++++++--- libs/compute_api/src/spec.rs | 6 ++ pgxn/neon/control_plane_connector.c | 7 -- test_runner/regress/test_ddl_forwarding.py | 96 +++++++++++++++++++++- 6 files changed, 191 insertions(+), 29 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 62e541ebce..e67430d061 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -692,10 +692,11 @@ impl ComputeNode { // Proceed with post-startup configuration. Note, that order of operations is important. let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec; create_neon_superuser(spec, &mut client)?; + cleanup_instance(&mut client)?; handle_roles(spec, &mut client)?; handle_databases(spec, &mut client)?; handle_role_deletions(spec, self.connstr.as_str(), &mut client)?; - handle_grants(spec, self.connstr.as_str())?; + handle_grants(spec, &mut client, self.connstr.as_str())?; handle_extensions(spec, &mut client)?; create_availability_check_data(&mut client)?; @@ -731,10 +732,11 @@ impl ComputeNode { // Disable DDL forwarding because control plane already knows about these roles/databases. if spec.mode == ComputeMode::Primary { client.simple_query("SET neon.forward_ddl = false")?; + cleanup_instance(&mut client)?; handle_roles(&spec, &mut client)?; handle_databases(&spec, &mut client)?; handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?; - handle_grants(&spec, self.connstr.as_str())?; + handle_grants(&spec, &mut client, self.connstr.as_str())?; handle_extensions(&spec, &mut client)?; } diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index b94a97a126..b79e516650 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -1,3 +1,4 @@ +use std::collections::HashMap; use std::fmt::Write; use std::fs; use std::fs::File; @@ -205,22 +206,37 @@ pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result> { } /// Build a list of existing Postgres databases -pub fn get_existing_dbs(client: &mut Client) -> Result> { - let postgres_dbs = client +pub fn get_existing_dbs(client: &mut Client) -> Result> { + // `pg_database.datconnlimit = -2` means that the database is in the + // invalid state. See: + // https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9 + let postgres_dbs: Vec = client .query( - "SELECT datname, datdba::regrole::text as owner - FROM pg_catalog.pg_database;", + "SELECT + datname AS name, + datdba::regrole::text AS owner, + NOT datallowconn AS restrict_conn, + datconnlimit = - 2 AS invalid + FROM + pg_catalog.pg_database;", &[], )? .iter() .map(|row| Database { - name: row.get("datname"), + name: row.get("name"), owner: row.get("owner"), + restrict_conn: row.get("restrict_conn"), + invalid: row.get("invalid"), options: None, }) .collect(); - Ok(postgres_dbs) + let dbs_map = postgres_dbs + .iter() + .map(|db| (db.name.clone(), db.clone())) + .collect::>(); + + Ok(dbs_map) } /// Wait for Postgres to become ready to accept connections. It's ready to diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 74e7796be7..591cbe90c0 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -13,7 +13,7 @@ use crate::params::PG_HBA_ALL_MD5; use crate::pg_helpers::*; use compute_api::responses::{ControlPlaneComputeStatus, ControlPlaneSpecResponse}; -use compute_api::spec::{ComputeSpec, Database, PgIdent, Role}; +use compute_api::spec::{ComputeSpec, PgIdent, Role}; // Do control plane request and return response if any. In case of error it // returns a bool flag indicating whether it makes sense to retry the request @@ -161,6 +161,38 @@ pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> { Ok(()) } +/// Compute could be unexpectedly shut down, for example, during the +/// database dropping. This leaves the database in the invalid state, +/// which prevents new db creation with the same name. This function +/// will clean it up before proceeding with catalog updates. All +/// possible future cleanup operations may go here too. +#[instrument(skip_all)] +pub fn cleanup_instance(client: &mut Client) -> Result<()> { + let existing_dbs = get_existing_dbs(client)?; + + for (_, db) in existing_dbs { + if db.invalid { + // After recent commit in Postgres, interrupted DROP DATABASE + // leaves the database in the invalid state. According to the + // commit message, the only option for user is to drop it again. + // See: + // https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9 + // + // Postgres Neon extension is done the way, that db is de-registered + // in the control plane metadata only after it is dropped. So there is + // a chance that it still thinks that db should exist. This means + // that it will be re-created by `handle_databases()`. Yet, it's fine + // as user can just repeat drop (in vanilla Postgres they would need + // to do the same, btw). + let query = format!("DROP DATABASE IF EXISTS {}", db.name.pg_quote()); + info!("dropping invalid database {}", db.name); + client.execute(query.as_str(), &[])?; + } + } + + Ok(()) +} + /// Given a cluster spec json and open transaction it handles roles creation, /// deletion and update. #[instrument(skip_all)] @@ -379,13 +411,13 @@ fn reassign_owned_objects(spec: &ComputeSpec, connstr: &str, role_name: &PgIdent /// which together provide us idempotency. #[instrument(skip_all)] pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { - let existing_dbs: Vec = get_existing_dbs(client)?; + let existing_dbs = get_existing_dbs(client)?; // Print a list of existing Postgres databases (only in debug mode) if span_enabled!(Level::INFO) { info!("postgres databases:"); - for r in &existing_dbs { - info!(" {}:{}", r.name, r.owner); + for (dbname, db) in &existing_dbs { + info!(" {}:{}", dbname, db.owner); } } @@ -439,8 +471,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { "rename_db" => { let new_name = op.new_name.as_ref().unwrap(); - // XXX: with a limited number of roles it is fine, but consider making it a HashMap - if existing_dbs.iter().any(|r| r.name == op.name) { + if existing_dbs.get(&op.name).is_some() { let query: String = format!( "ALTER DATABASE {} RENAME TO {}", op.name.pg_quote(), @@ -457,14 +488,12 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { } // Refresh Postgres databases info to handle possible renames - let existing_dbs: Vec = get_existing_dbs(client)?; + let existing_dbs = get_existing_dbs(client)?; info!("cluster spec databases:"); for db in &spec.cluster.databases { let name = &db.name; - - // XXX: with a limited number of databases it is fine, but consider making it a HashMap - let pg_db = existing_dbs.iter().find(|r| r.name == *name); + let pg_db = existing_dbs.get(name); enum DatabaseAction { None, @@ -530,13 +559,32 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { /// Grant CREATE ON DATABASE to the database owner and do some other alters and grants /// to allow users creating trusted extensions and re-creating `public` schema, for example. #[instrument(skip_all)] -pub fn handle_grants(spec: &ComputeSpec, connstr: &str) -> Result<()> { - info!("cluster spec grants:"); +pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) -> Result<()> { + info!("modifying database permissions"); + let existing_dbs = get_existing_dbs(client)?; // Do some per-database access adjustments. We'd better do this at db creation time, // but CREATE DATABASE isn't transactional. So we cannot create db + do some grants // atomically. for db in &spec.cluster.databases { + match existing_dbs.get(&db.name) { + Some(pg_db) => { + if pg_db.restrict_conn || pg_db.invalid { + info!( + "skipping grants for db {} (invalid: {}, connections not allowed: {})", + db.name, pg_db.invalid, pg_db.restrict_conn + ); + continue; + } + } + None => { + bail!( + "database {} doesn't exist in Postgres after handle_databases()", + db.name + ); + } + } + let mut conf = Config::from_str(connstr)?; conf.dbname(&db.name); @@ -575,6 +623,11 @@ pub fn handle_grants(spec: &ComputeSpec, connstr: &str) -> Result<()> { // Explicitly grant CREATE ON SCHEMA PUBLIC to the web_access user. // This is needed because since postgres 15 this privilege is removed by default. + // TODO: web_access isn't created for almost 1 year. It could be that we have + // active users of 1 year old projects, but hopefully not, so check it and + // remove this code if possible. The worst thing that could happen is that + // user won't be able to use public schema in NEW databases created in the + // very OLD project. let grant_query = "DO $$\n\ BEGIN\n\ IF EXISTS(\n\ diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index b41ca8c9cf..cfbd50d38a 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -200,6 +200,12 @@ pub struct Database { pub name: PgIdent, pub owner: PgIdent, pub options: GenericOptions, + // These are derived flags, not present in the spec file. + // They are never set by the control plane. + #[serde(skip_deserializing, default)] + pub restrict_conn: bool, + #[serde(skip_deserializing, default)] + pub invalid: bool, } /// Common type representing both SQL statement params with or without value, diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c index 8b0035b8e8..debbbce117 100644 --- a/pgxn/neon/control_plane_connector.c +++ b/pgxn/neon/control_plane_connector.c @@ -741,13 +741,6 @@ NeonProcessUtility( break; case T_DropdbStmt: HandleDropDb(castNode(DropdbStmt, parseTree)); - /* - * We do this here to hack around the fact that Postgres performs the drop - * INSIDE of standard_ProcessUtility, which means that if we try to - * abort the drop normally it'll be too late. DROP DATABASE can't be inside - * of a transaction block anyway, so this should be fine to do. - */ - NeonXactCallback(XACT_EVENT_PRE_COMMIT, NULL); break; case T_CreateRoleStmt: HandleCreateRole(castNode(CreateRoleStmt, parseTree)); diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py index d4cf1b4739..6bd09c7030 100644 --- a/test_runner/regress/test_ddl_forwarding.py +++ b/test_runner/regress/test_ddl_forwarding.py @@ -4,7 +4,7 @@ from typing import Any, Dict, List, Optional, Tuple, Type import psycopg2 import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import VanillaPostgres +from fixtures.neon_fixtures import NeonEnv, VanillaPostgres from pytest_httpserver import HTTPServer from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response @@ -205,6 +205,10 @@ def test_ddl_forwarding(ddl: DdlForwardingContext): ddl.wait() assert ddl.dbs == {"stork": "cork"} + cur.execute("DROP DATABASE stork") + ddl.wait() + assert ddl.dbs == {} + with pytest.raises(psycopg2.InternalError): ddl.failures(True) cur.execute("CREATE DATABASE failure WITH OWNER=cork") @@ -217,6 +221,94 @@ def test_ddl_forwarding(ddl: DdlForwardingContext): ddl.failures(True) cur.execute("DROP DATABASE failure") ddl.wait() - ddl.pg.connect(dbname="failure") # Ensure we can connect after a failed drop + assert ddl.dbs == {"failure": "cork"} + ddl.failures(False) + + # Check that db is still in the Postgres after failure + cur.execute("SELECT datconnlimit FROM pg_database WHERE datname = 'failure'") + result = cur.fetchone() + if not result: + raise AssertionError("Database 'failure' not found") + # -2 means invalid database + # It should be invalid because cplane request failed + assert result[0] == -2, "Database 'failure' is not invalid" + + # Check that repeated drop succeeds + cur.execute("DROP DATABASE failure") + ddl.wait() + assert ddl.dbs == {} + + # DB should be absent in the Postgres + cur.execute("SELECT count(*) FROM pg_database WHERE datname = 'failure'") + result = cur.fetchone() + if not result: + raise AssertionError("Could not count databases") + assert result[0] == 0, "Database 'failure' still exists after drop" conn.close() + + +# Assert that specified database has a specific connlimit, throwing an AssertionError otherwise +# -2 means invalid database +# -1 means no specific per-db limit (default) +def assert_db_connlimit(endpoint: Any, db_name: str, connlimit: int, msg: str): + with endpoint.cursor() as cur: + cur.execute("SELECT datconnlimit FROM pg_database WHERE datname = %s", (db_name,)) + result = cur.fetchone() + if not result: + raise AssertionError(f"Database '{db_name}' not found") + assert result[0] == connlimit, msg + + +# Test that compute_ctl can deal with invalid databases (drop them). +# If Postgres extension cannot reach cplane, then DROP will be aborted +# and database will be marked as invalid. Then there are two recovery +# flows: +# 1. User can just repeat DROP DATABASE command until it succeeds +# 2. User can ignore, then compute_ctl will drop invalid databases +# automatically during full configuration +# Here we test the latter. The first one is tested in test_ddl_forwarding +def test_ddl_forwarding_invalid_db(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_ddl_forwarding_invalid_db", "empty") + endpoint = env.endpoints.create_start( + "test_ddl_forwarding_invalid_db", + # Some non-existent url + config_lines=["neon.console_url=http://localhost:9999/unknown/api/v0/roles_and_databases"], + ) + log.info("postgres is running on 'test_ddl_forwarding_invalid_db' branch") + + with endpoint.cursor() as cur: + cur.execute("SET neon.forward_ddl = false") + cur.execute("CREATE DATABASE failure") + cur.execute("COMMIT") + + assert_db_connlimit( + endpoint, "failure", -1, "Database 'failure' doesn't have a valid connlimit" + ) + + with pytest.raises(psycopg2.InternalError): + with endpoint.cursor() as cur: + cur.execute("DROP DATABASE failure") + cur.execute("COMMIT") + + # Should be invalid after failed drop + assert_db_connlimit(endpoint, "failure", -2, "Database 'failure' ins't invalid") + + endpoint.stop() + endpoint.start() + + # Still invalid after restart without full configuration + assert_db_connlimit(endpoint, "failure", -2, "Database 'failure' ins't invalid") + + endpoint.stop() + endpoint.respec(skip_pg_catalog_updates=False) + endpoint.start() + + # Should be cleaned up by compute_ctl during full configuration + with endpoint.cursor() as cur: + cur.execute("SELECT count(*) FROM pg_database WHERE datname = 'failure'") + result = cur.fetchone() + if not result: + raise AssertionError("Could not count databases") + assert result[0] == 0, "Database 'failure' still exists after restart" From 3666df6342b59efa235cba994aaaba19ec85e71b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 16 Oct 2023 20:52:07 +0200 Subject: [PATCH 17/18] azure_blob.rs: use division instead of left shift (#5572) Should have been a right shift but I did a left shift. It's constant folded anyways so we just use a shift. --- libs/remote_storage/src/azure_blob.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index ac2664b81a..4f7c8ce6f4 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -341,7 +341,7 @@ impl RemoteStorage for AzureBlobStorage { // to still be representable by signed 64 bit integers). // TODO remove workaround once the SDK adds open range support // https://github.com/Azure/azure-sdk-for-rust/issues/1438 - let end_exclusive = u64::MAX << 4; + let end_exclusive = u64::MAX / 4; builder = builder.range(Range::new(start_inclusive, end_exclusive)); } From a6b2f4e54ef97333c35834788bf555fc10e3f4bb Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 17 Oct 2023 11:29:48 +0200 Subject: [PATCH 18/18] limit imitate accesses concurrency, using same semaphore as compactions (#5578) Before this PR, when we restarted pageserver, we'd see a rush of `$number_of_tenants` concurrent eviction tasks starting to do imitate accesses building up in the period of `[init_order allows activations, $random_access_delay + EvictionPolicyLayerAccessThreshold::period]`. We simply cannot handle that degree of concurrent IO. We already solved the problem for compactions by adding a semaphore. So, this PR shares that semaphore for use by evictions. Part of https://github.com/neondatabase/neon/issues/5479 Which is again part of https://github.com/neondatabase/neon/issues/4743 Risks / Changes In System Behavior ================================== * we don't do evictions as timely as we currently do * we log a bunch of warnings about eviction taking too long * imitate accesses and compactions compete for the same concurrency limit, so, they'll slow each other down through this shares semaphore Changes ======= - Move the `CONCURRENT_COMPACTIONS` semaphore into `tasks.rs` - Rename it to `CONCURRENT_BACKGROUND_TASKS` - Use it also for the eviction imitate accesses: - Imitate acceses are both per-TIMELINE and per-TENANT - The per-TENANT is done through coalescing all the per-TIMELINE tasks via a tokio mutex `eviction_task_tenant_state`. - We acquire the CONCURRENT_BACKGROUND_TASKS permit early, at the beginning of the eviction iteration, much before the imitate acesses start (and they may not even start at all in the given iteration, as they happen only every $threshold). - Acquiring early is **sub-optimal** because when the per-timline tasks coalesce on the `eviction_task_tenant_state` mutex, they are already holding a CONCURRENT_BACKGROUND_TASKS permit. - It's also unfair because tenants with many timelines win the CONCURRENT_BACKGROUND_TASKS more often. - I don't think there's another way though, without refactoring more of the imitate accesses logic, e.g, making it all per-tenant. - Add metrics for queue depth behind the semaphore. I found these very useful to understand what work is queued in the system. - The metrics are tagged by the new `BackgroundLoopKind`. - On a green slate, I would have used `TaskKind`, but we already had pre-existing labels whose names didn't map exactly to task kind. Also the task kind is kind of a lower-level detail, so, I think it's fine to have a separate enum to identify background work kinds. Future Work =========== I guess I could move the eviction tasks from a ticker to "sleep for $period". The benefit would be that the semaphore automatically "smears" the eviction task scheduling over time, so, we only have the rush on restart but a smeared-out rush afterward. The downside is that this perverts the meaning of "$period", as we'd actually not run the eviction at a fixed period. It also means the the "took to long" warning & metric becomes meaningless. Then again, that is already the case for the compaction and gc tasks, which do sleep for `$period` instead of using a ticker. (cherry picked from commit 9256788273d5661ced0b2661a8751e2aa86fbb59) --- pageserver/src/consumption_metrics.rs | 10 ++- pageserver/src/metrics.rs | 20 +++++ pageserver/src/tenant/tasks.rs | 81 +++++++++++++++++-- pageserver/src/tenant/timeline.rs | 39 +++------ .../src/tenant/timeline/eviction_task.rs | 18 ++++- 5 files changed, 131 insertions(+), 37 deletions(-) diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 72a2099d92..13f7977946 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -2,6 +2,7 @@ //! and push them to a HTTP endpoint. use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; +use crate::tenant::tasks::BackgroundLoopKind; use crate::tenant::{mgr, LogicalSizeCalculationCause}; use camino::Utf8PathBuf; use consumption_metrics::EventType; @@ -143,7 +144,7 @@ pub async fn collect_metrics( crate::tenant::tasks::warn_when_period_overrun( tick_at.elapsed(), metric_collection_interval, - "consumption_metrics_collect_metrics", + BackgroundLoopKind::ConsumptionMetricsCollectMetrics, ); } } @@ -268,6 +269,11 @@ async fn calculate_synthetic_size_worker( } if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await { + // TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks? + // We can put in some prioritization for consumption metrics. + // Same for the loop that fetches computed metrics. + // By using the same limiter, we centralize metrics collection for "start" and "finished" counters, + // which turns out is really handy to understand the system. if let Err(e) = tenant.calculate_synthetic_size(cause, ctx).await { error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}"); } @@ -277,7 +283,7 @@ async fn calculate_synthetic_size_worker( crate::tenant::tasks::warn_when_period_overrun( tick_at.elapsed(), synthetic_size_calculation_interval, - "consumption_metrics_synthetic_size_worker", + BackgroundLoopKind::ConsumptionMetricsSyntheticSizeWorker, ); } } diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index c154b4a4ca..eea3de0583 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1067,6 +1067,26 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy = Lazy::new(|| { .expect("Failed to register tenant_task_events metric") }); +pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT: Lazy = + Lazy::new(|| { + register_int_counter_vec!( + "pageserver_background_loop_semaphore_wait_start_count", + "Counter for background loop concurrency-limiting semaphore acquire calls started", + &["task"], + ) + .unwrap() + }); + +pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT: Lazy = + Lazy::new(|| { + register_int_counter_vec!( + "pageserver_background_loop_semaphore_wait_finish_count", + "Counter for background loop concurrency-limiting semaphore acquire calls finished", + &["task"], + ) + .unwrap() + }); + pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_background_loop_period_overrun_count", diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index df3ffd08d3..00c8ced68a 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -14,6 +14,73 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::completion; +static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy = + once_cell::sync::Lazy::new(|| { + let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS; + let permits = usize::max( + 1, + // while a lot of the work is done on spawn_blocking, we still do + // repartitioning in the async context. this should give leave us some workers + // unblocked to be blocked on other work, hopefully easing any outside visible + // effects of restarts. + // + // 6/8 is a guess; previously we ran with unlimited 8 and more from + // spawn_blocking. + (total_threads * 3).checked_div(4).unwrap_or(0), + ); + assert_ne!(permits, 0, "we will not be adding in permits later"); + assert!( + permits < total_threads, + "need threads avail for shorter work" + ); + tokio::sync::Semaphore::new(permits) + }); + +#[derive(Debug, PartialEq, Eq, Clone, Copy, strum_macros::IntoStaticStr)] +#[strum(serialize_all = "snake_case")] +pub(crate) enum BackgroundLoopKind { + Compaction, + Gc, + Eviction, + ConsumptionMetricsCollectMetrics, + ConsumptionMetricsSyntheticSizeWorker, +} + +impl BackgroundLoopKind { + fn as_static_str(&self) -> &'static str { + let s: &'static str = self.into(); + s + } +} + +pub(crate) enum RateLimitError { + Cancelled, +} + +pub(crate) async fn concurrent_background_tasks_rate_limit( + loop_kind: BackgroundLoopKind, + _ctx: &RequestContext, + cancel: &CancellationToken, +) -> Result { + crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT + .with_label_values(&[loop_kind.as_static_str()]) + .inc(); + scopeguard::defer!( + crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT.with_label_values(&[loop_kind.as_static_str()]).inc(); + ); + tokio::select! { + permit = CONCURRENT_BACKGROUND_TASKS.acquire() => { + match permit { + Ok(permit) => Ok(permit), + Err(_closed) => unreachable!("we never close the semaphore"), + } + }, + _ = cancel.cancelled() => { + Err(RateLimitError::Cancelled) + } + } +} + /// Start per tenant background loops: compaction and gc. pub fn start_background_loops( tenant: &Arc, @@ -116,7 +183,7 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { } }; - warn_when_period_overrun(started_at.elapsed(), period, "compaction"); + warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction); // Sleep if tokio::time::timeout(sleep_duration, cancel.cancelled()) @@ -184,7 +251,7 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { } }; - warn_when_period_overrun(started_at.elapsed(), period, "gc"); + warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc); // Sleep if tokio::time::timeout(sleep_duration, cancel.cancelled()) @@ -258,7 +325,11 @@ pub(crate) async fn random_init_delay( } /// Attention: the `task` and `period` beocme labels of a pageserver-wide prometheus metric. -pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task: &str) { +pub(crate) fn warn_when_period_overrun( + elapsed: Duration, + period: Duration, + task: BackgroundLoopKind, +) { // Duration::ZERO will happen because it's the "disable [bgtask]" value. if elapsed >= period && period != Duration::ZERO { // humantime does no significant digits clamping whereas Duration's debug is a bit more @@ -267,11 +338,11 @@ pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task warn!( ?elapsed, period = %humantime::format_duration(period), - task, + ?task, "task iteration took longer than the configured period" ); crate::metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT - .with_label_values(&[task, &format!("{}", period.as_secs())]) + .with_label_values(&[task.as_static_str(), &format!("{}", period.as_secs())]) .inc(); } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 537f776176..a6f92646a9 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -44,6 +44,7 @@ use crate::tenant::storage_layer::delta_layer::DeltaEntry; use crate::tenant::storage_layer::{ DeltaLayerWriter, ImageLayerWriter, InMemoryLayer, LayerAccessStats, LayerFileName, RemoteLayer, }; +use crate::tenant::tasks::{BackgroundLoopKind, RateLimitError}; use crate::tenant::timeline::logical_size::CurrentLogicalSize; use crate::tenant::{ layer_map::{LayerMap, SearchResult}, @@ -684,37 +685,17 @@ impl Timeline { ) -> anyhow::Result<()> { const ROUNDS: usize = 2; - static CONCURRENT_COMPACTIONS: once_cell::sync::Lazy = - once_cell::sync::Lazy::new(|| { - let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS; - let permits = usize::max( - 1, - // while a lot of the work is done on spawn_blocking, we still do - // repartitioning in the async context. this should give leave us some workers - // unblocked to be blocked on other work, hopefully easing any outside visible - // effects of restarts. - // - // 6/8 is a guess; previously we ran with unlimited 8 and more from - // spawn_blocking. - (total_threads * 3).checked_div(4).unwrap_or(0), - ); - assert_ne!(permits, 0, "we will not be adding in permits later"); - assert!( - permits < total_threads, - "need threads avail for shorter work" - ); - tokio::sync::Semaphore::new(permits) - }); - // this wait probably never needs any "long time spent" logging, because we already nag if // compaction task goes over it's period (20s) which is quite often in production. - let _permit = tokio::select! { - permit = CONCURRENT_COMPACTIONS.acquire() => { - permit - }, - _ = cancel.cancelled() => { - return Ok(()); - } + let _permit = match super::tasks::concurrent_background_tasks_rate_limit( + BackgroundLoopKind::Compaction, + ctx, + cancel, + ) + .await + { + Ok(permit) => permit, + Err(RateLimitError::Cancelled) => return Ok(()), }; let last_record_lsn = self.get_last_record_lsn(); diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 9bf31d85d4..38da993deb 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -30,6 +30,7 @@ use crate::{ tenant::{ config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold}, storage_layer::PersistentLayer, + tasks::{BackgroundLoopKind, RateLimitError}, timeline::EvictionError, LogicalSizeCalculationCause, Tenant, }, @@ -129,7 +130,11 @@ impl Timeline { ControlFlow::Continue(()) => (), } let elapsed = start.elapsed(); - crate::tenant::tasks::warn_when_period_overrun(elapsed, p.period, "eviction"); + crate::tenant::tasks::warn_when_period_overrun( + elapsed, + p.period, + BackgroundLoopKind::Eviction, + ); crate::metrics::EVICTION_ITERATION_DURATION .get_metric_with_label_values(&[ &format!("{}", p.period.as_secs()), @@ -150,6 +155,17 @@ impl Timeline { ) -> ControlFlow<()> { let now = SystemTime::now(); + let _permit = match crate::tenant::tasks::concurrent_background_tasks_rate_limit( + BackgroundLoopKind::Eviction, + ctx, + cancel, + ) + .await + { + Ok(permit) => permit, + Err(RateLimitError::Cancelled) => return ControlFlow::Break(()), + }; + // If we evict layers but keep cached values derived from those layers, then // we face a storm of on-demand downloads after pageserver restart. // The reason is that the restart empties the caches, and so, the values