From d696c41807306333dab3568da523937963f7a116 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 30 Sep 2024 09:20:52 +0300 Subject: [PATCH 01/27] Bump default neon extension version to 1.5 (#9188) Commit 263dfba6ee introduced neon extension version 1.5, which included some new functions and views for metrics. It didn't bump the default neon extension number yet, so that we could still safely roll back to the old binary if necessary. This bumps the default version. --- pgxn/neon/neon.control | 4 +--- test_runner/regress/test_neon_extension.py | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/pgxn/neon/neon.control b/pgxn/neon/neon.control index 0b36bdbb65..af69116e21 100644 --- a/pgxn/neon/neon.control +++ b/pgxn/neon/neon.control @@ -1,8 +1,6 @@ # neon extension comment = 'cloud storage for PostgreSQL' -# TODO: bump default version to 1.5, after we are certain that we don't -# need to rollback the compute image -default_version = '1.4' +default_version = '1.5' module_pathname = '$libdir/neon' relocatable = true trusted = true diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py index 22a6013225..619fd83c9b 100644 --- a/test_runner/regress/test_neon_extension.py +++ b/test_runner/regress/test_neon_extension.py @@ -24,7 +24,7 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder): # IMPORTANT: # If the version has changed, the test should be updated. # Ensure that the default version is also updated in the neon.control file - assert cur.fetchone() == ("1.4",) + assert cur.fetchone() == ("1.5",) cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE") res = cur.fetchall() log.info(res) @@ -48,7 +48,7 @@ def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder): # IMPORTANT: # If the version has changed, the test should be updated. # Ensure that the default version is also updated in the neon.control file - assert cur.fetchone() == ("1.4",) + assert cur.fetchone() == ("1.5",) cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE") all_versions = ["1.5", "1.4", "1.3", "1.2", "1.1", "1.0"] current_version = "1.5" From 7cfd116856d07ab34a63be6c221cc7f9ffd92a58 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 30 Sep 2024 09:27:28 +0100 Subject: [PATCH 02/27] pageserver: refactor immediate_gc into TenantManager (#9183) ## Problem Legacy functions that were called as `mgr::` and relied on the static TENANTS, see #5796 ## Summary of changes - Move the last stray function (immediate_gc) into TenantManager Closes: https://github.com/neondatabase/neon/issues/5796 --- pageserver/src/http/routes.rs | 9 +- pageserver/src/tenant/mgr.rs | 155 +++++++++++++++++----------------- 2 files changed, 85 insertions(+), 79 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 6f0402e7b0..1cc5502bd6 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -56,6 +56,7 @@ use utils::http::endpoint::request_span; use utils::http::request::must_parse_query_param; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; +use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::deletion_queue::DeletionQueueClient; use crate::pgdatadir_mapping::LsnForTimestamp; @@ -80,7 +81,6 @@ use crate::tenant::timeline::CompactionError; use crate::tenant::timeline::Timeline; use crate::tenant::GetTimelineError; use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; -use crate::{config::PageServerConf, tenant::mgr}; use crate::{disk_usage_eviction_task, tenant}; use pageserver_api::models::{ StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest, @@ -1719,8 +1719,13 @@ async fn timeline_gc_handler( let gc_req: TimelineGcRequest = json_request(&mut request).await?; + let state = get_state(&request); + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let gc_result = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?; + let gc_result = state + .tenant_manager + .immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx) + .await?; json_response(StatusCode::OK, gc_result) } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index c7212e89ba..9d9852c525 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -2197,6 +2197,82 @@ impl TenantManager { Ok((wanted_bytes, shard_count as u32)) } + + #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))] + pub(crate) async fn immediate_gc( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + gc_req: TimelineGcRequest, + cancel: CancellationToken, + ctx: &RequestContext, + ) -> Result { + let tenant = { + let guard = self.tenants.read().unwrap(); + guard + .get(&tenant_shard_id) + .cloned() + .with_context(|| format!("tenant {tenant_shard_id}")) + .map_err(|e| ApiError::NotFound(e.into()))? + }; + + let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon()); + // Use tenant's pitr setting + let pitr = tenant.get_pitr_interval(); + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + + // Run in task_mgr to avoid race with tenant_detach operation + let ctx: RequestContext = + ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download); + + let _gate_guard = tenant.gate.enter().map_err(|_| ApiError::ShuttingDown)?; + + fail::fail_point!("immediate_gc_task_pre"); + + #[allow(unused_mut)] + let mut result = tenant + .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx) + .await; + // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it + // better once the types support it. + + #[cfg(feature = "testing")] + { + // we need to synchronize with drop completion for python tests without polling for + // log messages + if let Ok(result) = result.as_mut() { + let mut js = tokio::task::JoinSet::new(); + for layer in std::mem::take(&mut result.doomed_layers) { + js.spawn(layer.wait_drop()); + } + tracing::info!( + total = js.len(), + "starting to wait for the gc'd layers to be dropped" + ); + while let Some(res) = js.join_next().await { + res.expect("wait_drop should not panic"); + } + } + + let timeline = tenant.get_timeline(timeline_id, false).ok(); + let rtc = timeline.as_ref().map(|x| &x.remote_client); + + if let Some(rtc) = rtc { + // layer drops schedule actions on remote timeline client to actually do the + // deletions; don't care about the shutdown error, just exit fast + drop(rtc.wait_completion().await); + } + } + + result.map_err(|e| match e { + GcError::TenantCancelled | GcError::TimelineCancelled => ApiError::ShuttingDown, + GcError::TimelineNotFound => { + ApiError::NotFound(anyhow::anyhow!("Timeline not found").into()) + } + other => ApiError::InternalServerError(anyhow::anyhow!(other)), + }) + } } #[derive(Debug, thiserror::Error)] @@ -2341,7 +2417,7 @@ enum TenantSlotDropError { /// Errors that can happen any time we are walking the tenant map to try and acquire /// the TenantSlot for a particular tenant. #[derive(Debug, thiserror::Error)] -pub enum TenantMapError { +pub(crate) enum TenantMapError { // Tried to read while initializing #[error("tenant map is still initializing")] StillInitializing, @@ -2371,7 +2447,7 @@ pub enum TenantMapError { /// The `old_value` may be dropped before the SlotGuard is dropped, by calling /// `drop_old_value`. It is an error to call this without shutting down /// the conents of `old_value`. -pub struct SlotGuard { +pub(crate) struct SlotGuard { tenant_shard_id: TenantShardId, old_value: Option, upserted: bool, @@ -2764,81 +2840,6 @@ use { utils::http::error::ApiError, }; -#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))] -pub(crate) async fn immediate_gc( - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, - gc_req: TimelineGcRequest, - cancel: CancellationToken, - ctx: &RequestContext, -) -> Result { - let tenant = { - let guard = TENANTS.read().unwrap(); - guard - .get(&tenant_shard_id) - .cloned() - .with_context(|| format!("tenant {tenant_shard_id}")) - .map_err(|e| ApiError::NotFound(e.into()))? - }; - - let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon()); - // Use tenant's pitr setting - let pitr = tenant.get_pitr_interval(); - - tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; - - // Run in task_mgr to avoid race with tenant_detach operation - let ctx: RequestContext = - ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download); - - let _gate_guard = tenant.gate.enter().map_err(|_| ApiError::ShuttingDown)?; - - fail::fail_point!("immediate_gc_task_pre"); - - #[allow(unused_mut)] - let mut result = tenant - .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx) - .await; - // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it - // better once the types support it. - - #[cfg(feature = "testing")] - { - // we need to synchronize with drop completion for python tests without polling for - // log messages - if let Ok(result) = result.as_mut() { - let mut js = tokio::task::JoinSet::new(); - for layer in std::mem::take(&mut result.doomed_layers) { - js.spawn(layer.wait_drop()); - } - tracing::info!( - total = js.len(), - "starting to wait for the gc'd layers to be dropped" - ); - while let Some(res) = js.join_next().await { - res.expect("wait_drop should not panic"); - } - } - - let timeline = tenant.get_timeline(timeline_id, false).ok(); - let rtc = timeline.as_ref().map(|x| &x.remote_client); - - if let Some(rtc) = rtc { - // layer drops schedule actions on remote timeline client to actually do the - // deletions; don't care about the shutdown error, just exit fast - drop(rtc.wait_completion().await); - } - } - - result.map_err(|e| match e { - GcError::TenantCancelled | GcError::TimelineCancelled => ApiError::ShuttingDown, - GcError::TimelineNotFound => { - ApiError::NotFound(anyhow::anyhow!("Timeline not found").into()) - } - other => ApiError::InternalServerError(anyhow::anyhow!(other)), - }) -} - #[cfg(test)] mod tests { use std::collections::BTreeMap; From 5dc68e4e6ae13a486113af001af4445f6adb4f93 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Mon, 30 Sep 2024 16:37:14 +0200 Subject: [PATCH 03/27] test_compatibility: fix the regexes detecting the version (#9205) ## Problem The Neon components, built locally and by the GitHub workflow have slightly different version prefixes (git: vs git-env:) This does not allow running tests against local builds correctly. ## Summary of changes The regular expressions were changed to work with both prefixes. --- test_runner/regress/test_compatibility.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index b559be5f18..0669105625 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -252,7 +252,7 @@ def test_forward_compatibility( # not using env.pageserver.version because it was initialized before prev_pageserver_version_str = env.get_binary_version("pageserver") prev_pageserver_version_match = re.search( - "Neon page server git-env:(.*) failpoints: (.*), features: (.*)", + "Neon page server git(?:-env)?:(.*) failpoints: (.*), features: (.*)", prev_pageserver_version_str, ) if prev_pageserver_version_match is not None: @@ -263,12 +263,12 @@ def test_forward_compatibility( ) # does not include logs from previous runs - assert not env.pageserver.log_contains("git-env:" + prev_pageserver_version) + assert not env.pageserver.log_contains(f"git(-env)?:{prev_pageserver_version}") env.start() # ensure the specified pageserver is running - assert env.pageserver.log_contains("git-env:" + prev_pageserver_version) + assert env.pageserver.log_contains(f"git(-env)?:{prev_pageserver_version}") check_neon_works( env, From 7424e7269cf31066e8cedd220cee543e6b19f14f Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 30 Sep 2024 15:46:07 +0100 Subject: [PATCH 04/27] tests: longer timeout in `test_delete_timeline_client_hangup` (#9161) ## Problem This test waits for a request to finish, and then expects deletion to complete almost immediately. The request completes, but it's a 202, the timeline is still deleting in the background: we need to be more patient. ## Summary of changes - Adjust iterations from 2 to 10 when waiting for deletion --- test_runner/regress/test_timeline_delete.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 711fcd5016..edb32cd2b4 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -638,7 +638,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): wait_until(50, 0.1, first_request_finished) # check that the timeline is gone - wait_timeline_detail_404(ps_http, env.initial_tenant, child_timeline_id, iterations=2) + wait_timeline_detail_404(ps_http, env.initial_tenant, child_timeline_id, iterations=10) def test_timeline_delete_works_for_remote_smoke( From 4dc9cb7cf9d12a52bc3e664f39c559e746cc93f6 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 30 Sep 2024 17:56:37 +0300 Subject: [PATCH 05/27] tests: Remove some spurious list_timelines calls These calls seem really out of place. We know what the initial tenant and branch are in these tests, just like in all other tests. --- test_runner/regress/test_neon_cli.py | 4 +--- test_runner/regress/test_tenant_size.py | 17 +++++++---------- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py index b65430ff49..96543f1ef5 100644 --- a/test_runner/regress/test_neon_cli.py +++ b/test_runner/regress/test_neon_cli.py @@ -31,9 +31,7 @@ def helper_compare_timeline_list( ) ) - timelines_cli = env.neon_cli.list_timelines() - assert timelines_cli == env.neon_cli.list_timelines(initial_tenant) - + timelines_cli = env.neon_cli.list_timelines(initial_tenant) cli_timeline_ids = sorted([timeline_id for (_, timeline_id) in timelines_cli]) assert timelines_api == cli_timeline_ids diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index f872116a1c..609987ab0c 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -27,20 +27,15 @@ def test_empty_tenant_size(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_configs() env.start() - (tenant_id, _) = env.neon_cli.create_tenant() + (tenant_id, timeline_id) = env.neon_cli.create_tenant() http_client = env.pageserver.http_client() initial_size = http_client.tenant_size(tenant_id) # we should never have zero, because there should be the initdb "changes" assert initial_size > 0, "initial implementation returns ~initdb tenant_size" - main_branch_name = "main" - - branch_name, main_timeline_id = env.neon_cli.list_timelines(tenant_id)[0] - assert branch_name == main_branch_name - endpoint = env.endpoints.create_start( - main_branch_name, + "main", tenant_id=tenant_id, config_lines=["autovacuum=off", "checkpoint_timeout=10min"], ) @@ -54,7 +49,7 @@ def test_empty_tenant_size(neon_env_builder: NeonEnvBuilder): # The transaction above will make the compute generate a checkpoint. # In turn, the pageserver persists the checkpoint. This should only be # one key with a size of a couple hundred bytes. - wait_for_last_flush_lsn(env, endpoint, tenant_id, main_timeline_id) + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) size = http_client.tenant_size(tenant_id) assert size >= initial_size and size - initial_size < 1024 @@ -306,7 +301,8 @@ def test_single_branch_get_tenant_size_grows( env = neon_env_builder.init_start(initial_tenant_conf=tenant_config) tenant_id = env.initial_tenant - branch_name, timeline_id = env.neon_cli.list_timelines(tenant_id)[0] + timeline_id = env.initial_timeline + branch_name = "main" http_client = env.pageserver.http_client() @@ -516,7 +512,8 @@ def test_get_tenant_size_with_multiple_branches( env.pageserver.allowed_errors.append(".*InternalServerError\\(No such file or directory.*") tenant_id = env.initial_tenant - main_branch_name, main_timeline_id = env.neon_cli.list_timelines(tenant_id)[0] + main_timeline_id = env.initial_timeline + main_branch_name = "main" http_client = env.pageserver.http_client() From 69ea2776e92ef39a5f6453b6df42ab9c3aa2f957 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 30 Sep 2024 17:56:40 +0300 Subject: [PATCH 06/27] tests: Remove creation of extra timelines in some tests neon_cli.create_tenant() creates a new tenant *and* a timeline on the tenant, with name "main". In most tests, there's no need to create another timeline on the same tenant. There are some more tests that do that, but in the remaining cases, I wasn't be 100% if the presence of extra root timelines affect what the tests test, so I left them alone. --- test_runner/regress/test_branch_and_gc.py | 7 +++---- test_runner/regress/test_pageserver_restart.py | 3 +-- test_runner/regress/test_tenants_with_remote_storage.py | 3 +-- test_runner/regress/test_truncate.py | 3 +-- 4 files changed, 6 insertions(+), 10 deletions(-) diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index d7c4cf059a..43140c05ff 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -53,7 +53,7 @@ def test_branch_and_gc(neon_simple_env: NeonEnv, build_type: str): env = neon_simple_env pageserver_http_client = env.pageserver.http_client() - tenant, _ = env.neon_cli.create_tenant( + tenant, timeline_main = env.neon_cli.create_tenant( conf={ # disable background GC "gc_period": "0s", @@ -70,8 +70,7 @@ def test_branch_and_gc(neon_simple_env: NeonEnv, build_type: str): } ) - timeline_main = env.neon_cli.create_timeline("test_main", tenant_id=tenant) - endpoint_main = env.endpoints.create_start("test_main", tenant_id=tenant) + endpoint_main = env.endpoints.create_start("main", tenant_id=tenant) main_cur = endpoint_main.connect().cursor() @@ -92,7 +91,7 @@ def test_branch_and_gc(neon_simple_env: NeonEnv, build_type: str): pageserver_http_client.timeline_gc(tenant, timeline_main, lsn2 - lsn1 + 1024) env.neon_cli.create_branch( - "test_branch", "test_main", tenant_id=tenant, ancestor_start_lsn=lsn1 + "test_branch", ancestor_branch_name="main", ancestor_start_lsn=lsn1, tenant_id=tenant ) endpoint_branch = env.endpoints.create_start("test_branch", tenant_id=tenant) diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index bbf82fea4c..bd47a30428 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -174,8 +174,7 @@ def test_pageserver_chaos( "checkpoint_distance": "5000000", } ) - env.neon_cli.create_timeline("test_pageserver_chaos", tenant_id=tenant) - endpoint = env.endpoints.create_start("test_pageserver_chaos", tenant_id=tenant) + endpoint = env.endpoints.create_start("main", tenant_id=tenant) # Create table, and insert some rows. Make it big enough that it doesn't fit in # shared_buffers, otherwise the SELECT after restart will just return answer diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 168876b711..6ecc903192 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -71,10 +71,9 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder): "checkpoint_distance": "5000000", } ) - env.neon_cli.create_timeline("test_tenants_many", tenant_id=tenant) endpoint = env.endpoints.create_start( - "test_tenants_many", + "main", tenant_id=tenant, ) tenants_endpoints.append((tenant, endpoint)) diff --git a/test_runner/regress/test_truncate.py b/test_runner/regress/test_truncate.py index 52f125ce0b..bfa9ce5db7 100644 --- a/test_runner/regress/test_truncate.py +++ b/test_runner/regress/test_truncate.py @@ -26,8 +26,7 @@ def test_truncate(neon_env_builder: NeonEnvBuilder, zenbenchmark): } ) - env.neon_cli.create_timeline("test_truncate", tenant_id=tenant) - endpoint = env.endpoints.create_start("test_truncate", tenant_id=tenant) + endpoint = env.endpoints.create_start("main", tenant_id=tenant) cur = endpoint.connect().cursor() cur.execute("create table t1(x integer)") cur.execute(f"insert into t1 values (generate_series(1,{n_records}))") From 0a567acdb9152f781d736a3edfe42902f93d2b91 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 30 Sep 2024 17:56:43 +0300 Subject: [PATCH 07/27] tests: Move comment to more appropriate place There is no 'pg_bin' in NeonEnv. --- test_runner/fixtures/neon_fixtures.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 70a038c960..f5019e39dc 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -950,9 +950,6 @@ class NeonEnv: safekeepers - An array containing objects representing the safekeepers - pg_bin - pg_bin.run() can be used to execute Postgres client binaries, - like psql or pg_dump - initial_tenant - tenant ID of the initial tenant created in the repository neon_cli - can be used to run the 'neon' CLI tool @@ -3300,6 +3297,8 @@ class PgBin: @pytest.fixture(scope="function") def pg_bin(test_output_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion) -> PgBin: + """pg_bin.run() can be used to execute Postgres client binaries, like psql or pg_dump""" + return PgBin(test_output_dir, pg_distrib_dir, pg_version) From a2e2362ee9bcc1a4ccabae6be9a4d159f40d1681 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 30 Sep 2024 19:11:50 +0100 Subject: [PATCH 08/27] add proxy-protocol header disable option (#9203) resolves https://github.com/neondatabase/cloud/issues/18026 --- proxy/src/bin/local_proxy.rs | 2 +- proxy/src/bin/proxy.rs | 11 +++++++---- proxy/src/config.rs | 13 ++++++++++++- proxy/src/proxy.rs | 11 ++++++++--- 4 files changed, 28 insertions(+), 9 deletions(-) diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index 1b3f465686..a7bdac910f 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -274,7 +274,7 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig rate_limit_ip_subnet: 64, ip_allowlist_check_enabled: true, }, - require_client_ip: false, + proxy_protocol_v2: config::ProxyProtocolV2::Rejected, handshake_timeout: Duration::from_secs(10), region: "local".into(), wake_compute_retry_config: RetryConfig::parse(RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)?, diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 141005788d..50d204fca6 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -17,6 +17,7 @@ use proxy::config::AuthenticationConfig; use proxy::config::CacheOptions; use proxy::config::HttpConfig; use proxy::config::ProjectInfoCacheOptions; +use proxy::config::ProxyProtocolV2; use proxy::console; use proxy::context::parquet::ParquetUploadArgs; use proxy::http; @@ -144,9 +145,6 @@ struct ProxyCliArgs { /// size of the threadpool for password hashing #[clap(long, default_value_t = 4)] scram_thread_pool_size: u8, - /// Require that all incoming requests have a Proxy Protocol V2 packet **and** have an IP address associated. - #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] - require_client_ip: bool, /// Disable dynamic rate limiter and store the metrics to ensure its production behaviour. #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] disable_dynamic_rate_limiter: bool, @@ -229,6 +227,11 @@ struct ProxyCliArgs { /// Configure if this is a private access proxy for the POC: In that case the proxy will ignore the IP allowlist #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] is_private_access_proxy: bool, + + /// Configure whether all incoming requests have a Proxy Protocol V2 packet. + // TODO(conradludgate): switch default to rejected or required once we've updated all deployments + #[clap(value_enum, long, default_value_t = ProxyProtocolV2::Supported)] + proxy_protocol_v2: ProxyProtocolV2, } #[derive(clap::Args, Clone, Copy, Debug)] @@ -704,7 +707,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { allow_self_signed_compute: args.allow_self_signed_compute, http_config, authentication_config, - require_client_ip: args.require_client_ip, + proxy_protocol_v2: args.proxy_protocol_v2, handshake_timeout: args.handshake_timeout, region: args.region.clone(), wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?, diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 373e4cf650..a66d4773a3 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -7,6 +7,7 @@ use crate::{ Host, }; use anyhow::{bail, ensure, Context, Ok}; +use clap::ValueEnum; use itertools::Itertools; use remote_storage::RemoteStorageConfig; use rustls::{ @@ -30,7 +31,7 @@ pub struct ProxyConfig { pub allow_self_signed_compute: bool, pub http_config: HttpConfig, pub authentication_config: AuthenticationConfig, - pub require_client_ip: bool, + pub proxy_protocol_v2: ProxyProtocolV2, pub region: String, pub handshake_timeout: Duration, pub wake_compute_retry_config: RetryConfig, @@ -38,6 +39,16 @@ pub struct ProxyConfig { pub connect_to_compute_retry_config: RetryConfig, } +#[derive(Copy, Clone, Debug, ValueEnum, PartialEq)] +pub enum ProxyProtocolV2 { + /// Connection will error if PROXY protocol v2 header is missing + Required, + /// Connection will parse PROXY protocol v2 header, but accept the connection if it's missing. + Supported, + /// Connection will error if PROXY protocol v2 header is provided + Rejected, +} + #[derive(Debug)] pub struct MetricCollectionConfig { pub endpoint: reqwest::Url, diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index ff199ac701..7003af2aba 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -10,6 +10,7 @@ pub(crate) mod wake_compute; pub use copy_bidirectional::copy_bidirectional_client_compute; pub use copy_bidirectional::ErrorSource; +use crate::config::ProxyProtocolV2; use crate::{ auth, cancellation::{self, CancellationHandlerMain, CancellationHandlerMainInternal}, @@ -93,15 +94,19 @@ pub async fn task_main( connections.spawn(async move { let (socket, peer_addr) = match read_proxy_protocol(socket).await { - Ok((socket, Some(addr))) => (socket, addr.ip()), Err(e) => { error!("per-client task finished with an error: {e:#}"); return; } - Ok((_socket, None)) if config.require_client_ip => { - error!("missing required client IP"); + Ok((_socket, None)) if config.proxy_protocol_v2 == ProxyProtocolV2::Required => { + error!("missing required proxy protocol header"); return; } + Ok((_socket, Some(_))) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => { + error!("proxy protocol header not supported"); + return; + } + Ok((socket, Some(addr))) => (socket, addr.ip()), Ok((socket, None)) => (socket, peer_addr.ip()), }; From c07cea80bd7a462c108ad04e72fc9af60f31f82b Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Mon, 30 Sep 2024 19:18:42 +0100 Subject: [PATCH 09/27] Bump vm-builder v0.29.3 -> v0.35.0 (#9208) We haven't updated it for a while. Now I need the update to add quotas support to compute images (https://github.com/neondatabase/cloud/issues/13127). Previous update: https://github.com/neondatabase/neon/pull/7849 --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index ba5d139553..8cb49d5d76 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -773,7 +773,7 @@ jobs: matrix: version: [ v14, v15, v16, v17 ] env: - VM_BUILDER_VERSION: v0.29.3 + VM_BUILDER_VERSION: v0.35.0 steps: - uses: actions/checkout@v4 From 94a5ca281702e1b94c4a6320d2b52fdb5eb71966 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 30 Sep 2024 20:43:45 +0100 Subject: [PATCH 10/27] proxy: auth broker (#8855) Opens http2 connection to local-proxy and forwards requests over with all headers and body closes https://github.com/neondatabase/cloud/issues/16039 --- proxy/src/auth/backend.rs | 5 +- proxy/src/auth/backend/jwt.rs | 74 +++++- proxy/src/auth/backend/local.rs | 4 +- proxy/src/bin/local_proxy.rs | 8 +- proxy/src/bin/proxy.rs | 57 +++-- proxy/src/config.rs | 38 ++- proxy/src/serverless.rs | 31 ++- proxy/src/serverless/backend.rs | 236 +++++++++++++++-- proxy/src/serverless/http_conn_pool.rs | 342 +++++++++++++++++++++++++ proxy/src/serverless/http_util.rs | 21 +- proxy/src/serverless/sql_over_http.rs | 176 +++++++++++-- 11 files changed, 894 insertions(+), 98 deletions(-) create mode 100644 proxy/src/serverless/http_conn_pool.rs diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 5dbfa5cc09..52ddfd90fb 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -565,7 +565,7 @@ mod tests { stream::{PqStream, Stream}, }; - use super::{auth_quirks, AuthRateLimiter}; + use super::{auth_quirks, jwt::JwkCache, AuthRateLimiter}; struct Auth { ips: Vec, @@ -611,12 +611,15 @@ mod tests { } static CONFIG: Lazy = Lazy::new(|| AuthenticationConfig { + jwks_cache: JwkCache::default(), thread_pool: ThreadPool::new(1), scram_protocol_timeout: std::time::Duration::from_secs(5), rate_limiter_enabled: true, rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET), rate_limit_ip_subnet: 64, ip_allowlist_check_enabled: true, + is_auth_broker: false, + accept_jwts: false, }); async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage { diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index ab848551a9..38dd30ce92 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -8,7 +8,7 @@ use anyhow::{bail, ensure, Context}; use arc_swap::ArcSwapOption; use dashmap::DashMap; use jose_jwk::crypto::KeyInfo; -use serde::{Deserialize, Deserializer}; +use serde::{de::Visitor, Deserialize, Deserializer}; use signature::Verifier; use tokio::time::Instant; @@ -311,13 +311,11 @@ impl JwkCacheEntryLock { tracing::debug!(?payload, "JWT signature valid with claims"); - match (expected_audience, payload.audience) { - // check the audience matches - (Some(aud1), Some(aud2)) => ensure!(aud1 == aud2, "invalid JWT token audience"), - // the audience is expected but is missing - (Some(_), None) => bail!("invalid JWT token audience"), - // we don't care for the audience field - (None, _) => {} + if let Some(aud) = expected_audience { + ensure!( + payload.audience.0.iter().any(|s| s == aud), + "invalid JWT token audience" + ); } let now = SystemTime::now(); @@ -420,11 +418,12 @@ struct JwtHeader<'a> { } /// -#[derive(serde::Deserialize, serde::Serialize, Debug)] +#[derive(serde::Deserialize, Debug)] +#[allow(dead_code)] struct JwtPayload<'a> { /// Audience - Recipient for which the JWT is intended - #[serde(rename = "aud")] - audience: Option<&'a str>, + #[serde(rename = "aud", default)] + audience: OneOrMany, /// Expiration - Time after which the JWT expires #[serde(deserialize_with = "numeric_date_opt", rename = "exp", default)] expiration: Option, @@ -447,6 +446,59 @@ struct JwtPayload<'a> { session_id: Option<&'a str>, } +/// `OneOrMany` supports parsing either a single item or an array of items. +/// +/// Needed for +/// +/// > The "aud" (audience) claim identifies the recipients that the JWT is +/// > intended for. Each principal intended to process the JWT MUST +/// > identify itself with a value in the audience claim. If the principal +/// > processing the claim does not identify itself with a value in the +/// > "aud" claim when this claim is present, then the JWT MUST be +/// > rejected. In the general case, the "aud" value is **an array of case- +/// > sensitive strings**, each containing a StringOrURI value. In the +/// > special case when the JWT has one audience, the "aud" value MAY be a +/// > **single case-sensitive string** containing a StringOrURI value. The +/// > interpretation of audience values is generally application specific. +/// > Use of this claim is OPTIONAL. +#[derive(Default, Debug)] +struct OneOrMany(Vec); + +impl<'de> Deserialize<'de> for OneOrMany { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + struct OneOrManyVisitor; + impl<'de> Visitor<'de> for OneOrManyVisitor { + type Value = OneOrMany; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + formatter.write_str("a single string or an array of strings") + } + + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, + { + Ok(OneOrMany(vec![v.to_owned()])) + } + + fn visit_seq(self, mut seq: A) -> Result + where + A: serde::de::SeqAccess<'de>, + { + let mut v = vec![]; + while let Some(s) = seq.next_element()? { + v.push(s); + } + Ok(OneOrMany(v)) + } + } + deserializer.deserialize_any(OneOrManyVisitor) + } +} + fn numeric_date_opt<'de, D: Deserializer<'de>>(d: D) -> Result, D::Error> { let d = >::deserialize(d)?; Ok(d.map(|n| SystemTime::UNIX_EPOCH + Duration::from_secs(n))) diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index 2ab53f2c6a..f56b0a0a6d 100644 --- a/proxy/src/auth/backend/local.rs +++ b/proxy/src/auth/backend/local.rs @@ -14,17 +14,15 @@ use crate::{ EndpointId, }; -use super::jwt::{AuthRule, FetchAuthRules, JwkCache}; +use super::jwt::{AuthRule, FetchAuthRules}; pub struct LocalBackend { - pub(crate) jwks_cache: JwkCache, pub(crate) node_info: NodeInfo, } impl LocalBackend { pub fn new(postgres_addr: SocketAddr) -> Self { LocalBackend { - jwks_cache: JwkCache::default(), node_info: NodeInfo { config: { let mut cfg = ConnCfg::new(); diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index a7bdac910f..49887576c7 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -6,7 +6,10 @@ use compute_api::spec::LocalProxySpec; use dashmap::DashMap; use futures::future::Either; use proxy::{ - auth::backend::local::{LocalBackend, JWKS_ROLE_MAP}, + auth::backend::{ + jwt::JwkCache, + local::{LocalBackend, JWKS_ROLE_MAP}, + }, cancellation::CancellationHandlerMain, config::{self, AuthenticationConfig, HttpConfig, ProxyConfig, RetryConfig}, console::{ @@ -267,12 +270,15 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig allow_self_signed_compute: false, http_config, authentication_config: AuthenticationConfig { + jwks_cache: JwkCache::default(), thread_pool: ThreadPool::new(0), scram_protocol_timeout: Duration::from_secs(10), rate_limiter_enabled: false, rate_limiter: BucketRateLimiter::new(vec![]), rate_limit_ip_subnet: 64, ip_allowlist_check_enabled: true, + is_auth_broker: false, + accept_jwts: true, }, proxy_protocol_v2: config::ProxyProtocolV2::Rejected, handshake_timeout: Duration::from_secs(10), diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 50d204fca6..fa4fb264f2 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -8,6 +8,7 @@ use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; use aws_config::Region; use futures::future::Either; use proxy::auth; +use proxy::auth::backend::jwt::JwkCache; use proxy::auth::backend::AuthRateLimiter; use proxy::auth::backend::MaybeOwned; use proxy::cancellation::CancelMap; @@ -103,6 +104,9 @@ struct ProxyCliArgs { default_value = "http://localhost:3000/authenticate_proxy_request/" )] auth_endpoint: String, + /// if this is not local proxy, this toggles whether we accept jwt or passwords for http + #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] + is_auth_broker: bool, /// path to TLS key for client postgres connections /// /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir @@ -385,9 +389,27 @@ async fn main() -> anyhow::Result<()> { info!("Starting mgmt on {mgmt_address}"); let mgmt_listener = TcpListener::bind(mgmt_address).await?; - let proxy_address: SocketAddr = args.proxy.parse()?; - info!("Starting proxy on {proxy_address}"); - let proxy_listener = TcpListener::bind(proxy_address).await?; + let proxy_listener = if !args.is_auth_broker { + let proxy_address: SocketAddr = args.proxy.parse()?; + info!("Starting proxy on {proxy_address}"); + + Some(TcpListener::bind(proxy_address).await?) + } else { + None + }; + + // TODO: rename the argument to something like serverless. + // It now covers more than just websockets, it also covers SQL over HTTP. + let serverless_listener = if let Some(serverless_address) = args.wss { + let serverless_address: SocketAddr = serverless_address.parse()?; + info!("Starting wss on {serverless_address}"); + Some(TcpListener::bind(serverless_address).await?) + } else if args.is_auth_broker { + bail!("wss arg must be present for auth-broker") + } else { + None + }; + let cancellation_token = CancellationToken::new(); let cancel_map = CancelMap::default(); @@ -433,21 +455,17 @@ async fn main() -> anyhow::Result<()> { // client facing tasks. these will exit on error or on cancellation // cancellation returns Ok(()) let mut client_tasks = JoinSet::new(); - client_tasks.spawn(proxy::proxy::task_main( - config, - proxy_listener, - cancellation_token.clone(), - cancellation_handler.clone(), - endpoint_rate_limiter.clone(), - )); - - // TODO: rename the argument to something like serverless. - // It now covers more than just websockets, it also covers SQL over HTTP. - if let Some(serverless_address) = args.wss { - let serverless_address: SocketAddr = serverless_address.parse()?; - info!("Starting wss on {serverless_address}"); - let serverless_listener = TcpListener::bind(serverless_address).await?; + if let Some(proxy_listener) = proxy_listener { + client_tasks.spawn(proxy::proxy::task_main( + config, + proxy_listener, + cancellation_token.clone(), + cancellation_handler.clone(), + endpoint_rate_limiter.clone(), + )); + } + if let Some(serverless_listener) = serverless_listener { client_tasks.spawn(serverless::task_main( config, serverless_listener, @@ -677,7 +695,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { )?; let http_config = HttpConfig { - accept_websockets: true, + accept_websockets: !args.is_auth_broker, pool_options: GlobalConnPoolOptions { max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint, gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch, @@ -692,12 +710,15 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes, }; let authentication_config = AuthenticationConfig { + jwks_cache: JwkCache::default(), thread_pool, scram_protocol_timeout: args.scram_protocol_timeout, rate_limiter_enabled: args.auth_rate_limit_enabled, rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()), rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet, ip_allowlist_check_enabled: !args.is_private_access_proxy, + is_auth_broker: args.is_auth_broker, + accept_jwts: args.is_auth_broker, }; let config = Box::leak(Box::new(ProxyConfig { diff --git a/proxy/src/config.rs b/proxy/src/config.rs index a66d4773a3..7d86ef4348 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,5 +1,8 @@ use crate::{ - auth::{self, backend::AuthRateLimiter}, + auth::{ + self, + backend::{jwt::JwkCache, AuthRateLimiter}, + }, console::locks::ApiLocks, rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig}, scram::threadpool::ThreadPool, @@ -78,6 +81,9 @@ pub struct AuthenticationConfig { pub rate_limiter: AuthRateLimiter, pub rate_limit_ip_subnet: u8, pub ip_allowlist_check_enabled: bool, + pub jwks_cache: JwkCache, + pub is_auth_broker: bool, + pub accept_jwts: bool, } impl TlsConfig { @@ -261,18 +267,26 @@ impl CertResolver { let common_name = pem.subject().to_string(); - // We only use non-wildcard certificates in web auth proxy so it seems okay to treat them the same as - // wildcard ones as we don't use SNI there. That treatment only affects certificate selection, so - // verify-full will still check wildcard match. Old coding here just ignored non-wildcard common names - // and passed None instead, which blows up number of cases downstream code should handle. Proper coding - // here should better avoid Option for common_names, and do wildcard-based certificate selection instead - // of cutting off '*.' parts. - let common_name = if common_name.starts_with("CN=*.") { - common_name.strip_prefix("CN=*.").map(|s| s.to_string()) + // We need to get the canonical name for this certificate so we can match them against any domain names + // seen within the proxy codebase. + // + // In scram-proxy we use wildcard certificates only, with the database endpoint as the wildcard subdomain, taken from SNI. + // We need to remove the wildcard prefix for the purposes of certificate selection. + // + // auth-broker does not use SNI and instead uses the Neon-Connection-String header. + // Auth broker has the subdomain `apiauth` we need to remove for the purposes of validating the Neon-Connection-String. + // + // Console Web proxy does not use any wildcard domains and does not need any certificate selection or conn string + // validation, so let's we can continue with any common-name + let common_name = if let Some(s) = common_name.strip_prefix("CN=*.") { + s.to_string() + } else if let Some(s) = common_name.strip_prefix("CN=apiauth.") { + s.to_string() + } else if let Some(s) = common_name.strip_prefix("CN=") { + s.to_string() } else { - common_name.strip_prefix("CN=").map(|s| s.to_string()) - } - .context("Failed to parse common name from certificate")?; + bail!("Failed to parse common name from certificate") + }; let cert = Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key)); diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index 84f98cb8ad..a7e3fa709b 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -5,6 +5,7 @@ mod backend; pub mod cancel_set; mod conn_pool; +mod http_conn_pool; mod http_util; mod json; mod sql_over_http; @@ -19,7 +20,8 @@ use anyhow::Context; use futures::future::{select, Either}; use futures::TryFutureExt; use http::{Method, Response, StatusCode}; -use http_body_util::Full; +use http_body_util::combinators::BoxBody; +use http_body_util::{BodyExt, Empty}; use hyper1::body::Incoming; use hyper_util::rt::TokioExecutor; use hyper_util::server::conn::auto::Builder; @@ -81,7 +83,28 @@ pub async fn task_main( } }); + let http_conn_pool = http_conn_pool::GlobalConnPool::new(&config.http_config); + { + let http_conn_pool = Arc::clone(&http_conn_pool); + tokio::spawn(async move { + http_conn_pool.gc_worker(StdRng::from_entropy()).await; + }); + } + + // shutdown the connection pool + tokio::spawn({ + let cancellation_token = cancellation_token.clone(); + let http_conn_pool = http_conn_pool.clone(); + async move { + cancellation_token.cancelled().await; + tokio::task::spawn_blocking(move || http_conn_pool.shutdown()) + .await + .unwrap(); + } + }); + let backend = Arc::new(PoolingBackend { + http_conn_pool: Arc::clone(&http_conn_pool), pool: Arc::clone(&conn_pool), config, endpoint_rate_limiter: Arc::clone(&endpoint_rate_limiter), @@ -342,7 +365,7 @@ async fn request_handler( // used to cancel in-flight HTTP requests. not used to cancel websockets http_cancellation_token: CancellationToken, endpoint_rate_limiter: Arc, -) -> Result>, ApiError> { +) -> Result>, ApiError> { let host = request .headers() .get("host") @@ -386,7 +409,7 @@ async fn request_handler( ); // Return the response so the spawned future can continue. - Ok(response.map(|_: http_body_util::Empty| Full::new(Bytes::new()))) + Ok(response.map(|b| b.map_err(|x| match x {}).boxed())) } else if request.uri().path() == "/sql" && *request.method() == Method::POST { let ctx = RequestMonitoring::new( session_id, @@ -409,7 +432,7 @@ async fn request_handler( ) .header("Access-Control-Max-Age", "86400" /* 24 hours */) .status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code - .body(Full::new(Bytes::new())) + .body(Empty::new().map_err(|x| match x {}).boxed()) .map_err(|e| ApiError::InternalServerError(e.into())) } else { json_response(StatusCode::BAD_REQUEST, "query is not supported") diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 607eb0caf6..89eeec3e6f 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -1,6 +1,8 @@ -use std::{sync::Arc, time::Duration}; +use std::{io, sync::Arc, time::Duration}; use async_trait::async_trait; +use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer}; +use tokio::net::{lookup_host, TcpStream}; use tracing::{field::display, info}; use crate::{ @@ -27,9 +29,13 @@ use crate::{ Host, }; -use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}; +use super::{ + conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}, + http_conn_pool::{self, poll_http2_client}, +}; pub(crate) struct PoolingBackend { + pub(crate) http_conn_pool: Arc, pub(crate) pool: Arc>, pub(crate) config: &'static ProxyConfig, pub(crate) endpoint_rate_limiter: Arc, @@ -103,32 +109,44 @@ impl PoolingBackend { pub(crate) async fn authenticate_with_jwt( &self, ctx: &RequestMonitoring, + config: &AuthenticationConfig, user_info: &ComputeUserInfo, - jwt: &str, - ) -> Result { + jwt: String, + ) -> Result<(), AuthError> { match &self.config.auth_backend { - crate::auth::Backend::Console(_, ()) => { - Err(AuthError::auth_failed("JWT login is not yet supported")) + crate::auth::Backend::Console(console, ()) => { + config + .jwks_cache + .check_jwt( + ctx, + user_info.endpoint.clone(), + &user_info.user, + &**console, + &jwt, + ) + .await + .map_err(|e| AuthError::auth_failed(e.to_string()))?; + + Ok(()) } crate::auth::Backend::Web(_, ()) => Err(AuthError::auth_failed( "JWT login over web auth proxy is not supported", )), - crate::auth::Backend::Local(cache) => { - cache + crate::auth::Backend::Local(_) => { + config .jwks_cache .check_jwt( ctx, user_info.endpoint.clone(), &user_info.user, &StaticAuthRules, - jwt, + &jwt, ) .await .map_err(|e| AuthError::auth_failed(e.to_string()))?; - Ok(ComputeCredentials { - info: user_info.clone(), - keys: crate::auth::backend::ComputeCredentialKeys::None, - }) + + // todo: rewrite JWT signature with key shared somehow between local proxy and postgres + Ok(()) } } } @@ -174,14 +192,55 @@ impl PoolingBackend { ) .await } + + // Wake up the destination if needed + #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)] + pub(crate) async fn connect_to_local_proxy( + &self, + ctx: &RequestMonitoring, + conn_info: ConnInfo, + ) -> Result { + info!("pool: looking for an existing connection"); + if let Some(client) = self.http_conn_pool.get(ctx, &conn_info) { + return Ok(client); + } + + let conn_id = uuid::Uuid::new_v4(); + tracing::Span::current().record("conn_id", display(conn_id)); + info!(%conn_id, "pool: opening a new connection '{conn_info}'"); + let backend = self + .config + .auth_backend + .as_ref() + .map(|()| ComputeCredentials { + info: conn_info.user_info.clone(), + keys: crate::auth::backend::ComputeCredentialKeys::None, + }); + crate::proxy::connect_compute::connect_to_compute( + ctx, + &HyperMechanism { + conn_id, + conn_info, + pool: self.http_conn_pool.clone(), + locks: &self.config.connect_compute_locks, + }, + &backend, + false, // do not allow self signed compute for http flow + self.config.wake_compute_retry_config, + self.config.connect_to_compute_retry_config, + ) + .await + } } #[derive(Debug, thiserror::Error)] pub(crate) enum HttpConnError { #[error("pooled connection closed at inconsistent state")] ConnectionClosedAbruptly(#[from] tokio::sync::watch::error::SendError), - #[error("could not connection to compute")] - ConnectionError(#[from] tokio_postgres::Error), + #[error("could not connection to postgres in compute")] + PostgresConnectionError(#[from] tokio_postgres::Error), + #[error("could not connection to local-proxy in compute")] + LocalProxyConnectionError(#[from] LocalProxyConnError), #[error("could not get auth info")] GetAuthInfo(#[from] GetAuthInfoError), @@ -193,11 +252,20 @@ pub(crate) enum HttpConnError { TooManyConnectionAttempts(#[from] ApiLockError), } +#[derive(Debug, thiserror::Error)] +pub(crate) enum LocalProxyConnError { + #[error("error with connection to local-proxy")] + Io(#[source] std::io::Error), + #[error("could not establish h2 connection")] + H2(#[from] hyper1::Error), +} + impl ReportableError for HttpConnError { fn get_error_kind(&self) -> ErrorKind { match self { HttpConnError::ConnectionClosedAbruptly(_) => ErrorKind::Compute, - HttpConnError::ConnectionError(p) => p.get_error_kind(), + HttpConnError::PostgresConnectionError(p) => p.get_error_kind(), + HttpConnError::LocalProxyConnectionError(_) => ErrorKind::Compute, HttpConnError::GetAuthInfo(a) => a.get_error_kind(), HttpConnError::AuthError(a) => a.get_error_kind(), HttpConnError::WakeCompute(w) => w.get_error_kind(), @@ -210,7 +278,8 @@ impl UserFacingError for HttpConnError { fn to_string_client(&self) -> String { match self { HttpConnError::ConnectionClosedAbruptly(_) => self.to_string(), - HttpConnError::ConnectionError(p) => p.to_string(), + HttpConnError::PostgresConnectionError(p) => p.to_string(), + HttpConnError::LocalProxyConnectionError(p) => p.to_string(), HttpConnError::GetAuthInfo(c) => c.to_string_client(), HttpConnError::AuthError(c) => c.to_string_client(), HttpConnError::WakeCompute(c) => c.to_string_client(), @@ -224,7 +293,8 @@ impl UserFacingError for HttpConnError { impl CouldRetry for HttpConnError { fn could_retry(&self) -> bool { match self { - HttpConnError::ConnectionError(e) => e.could_retry(), + HttpConnError::PostgresConnectionError(e) => e.could_retry(), + HttpConnError::LocalProxyConnectionError(e) => e.could_retry(), HttpConnError::ConnectionClosedAbruptly(_) => false, HttpConnError::GetAuthInfo(_) => false, HttpConnError::AuthError(_) => false, @@ -236,7 +306,7 @@ impl CouldRetry for HttpConnError { impl ShouldRetryWakeCompute for HttpConnError { fn should_retry_wake_compute(&self) -> bool { match self { - HttpConnError::ConnectionError(e) => e.should_retry_wake_compute(), + HttpConnError::PostgresConnectionError(e) => e.should_retry_wake_compute(), // we never checked cache validity HttpConnError::TooManyConnectionAttempts(_) => false, _ => true, @@ -244,6 +314,38 @@ impl ShouldRetryWakeCompute for HttpConnError { } } +impl ReportableError for LocalProxyConnError { + fn get_error_kind(&self) -> ErrorKind { + match self { + LocalProxyConnError::Io(_) => ErrorKind::Compute, + LocalProxyConnError::H2(_) => ErrorKind::Compute, + } + } +} + +impl UserFacingError for LocalProxyConnError { + fn to_string_client(&self) -> String { + "Could not establish HTTP connection to the database".to_string() + } +} + +impl CouldRetry for LocalProxyConnError { + fn could_retry(&self) -> bool { + match self { + LocalProxyConnError::Io(_) => false, + LocalProxyConnError::H2(_) => false, + } + } +} +impl ShouldRetryWakeCompute for LocalProxyConnError { + fn should_retry_wake_compute(&self) -> bool { + match self { + LocalProxyConnError::Io(_) => false, + LocalProxyConnError::H2(_) => false, + } + } +} + struct TokioMechanism { pool: Arc>, conn_info: ConnInfo, @@ -293,3 +395,99 @@ impl ConnectMechanism for TokioMechanism { fn update_connect_config(&self, _config: &mut compute::ConnCfg) {} } + +struct HyperMechanism { + pool: Arc, + conn_info: ConnInfo, + conn_id: uuid::Uuid, + + /// connect_to_compute concurrency lock + locks: &'static ApiLocks, +} + +#[async_trait] +impl ConnectMechanism for HyperMechanism { + type Connection = http_conn_pool::Client; + type ConnectError = HttpConnError; + type Error = HttpConnError; + + async fn connect_once( + &self, + ctx: &RequestMonitoring, + node_info: &CachedNodeInfo, + timeout: Duration, + ) -> Result { + let host = node_info.config.get_host()?; + let permit = self.locks.get_permit(&host).await?; + + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); + + // let port = node_info.config.get_ports().first().unwrap_or_else(10432); + let res = connect_http2(&host, 10432, timeout).await; + drop(pause); + let (client, connection) = permit.release_result(res)?; + + Ok(poll_http2_client( + self.pool.clone(), + ctx, + &self.conn_info, + client, + connection, + self.conn_id, + node_info.aux.clone(), + )) + } + + fn update_connect_config(&self, _config: &mut compute::ConnCfg) {} +} + +async fn connect_http2( + host: &str, + port: u16, + timeout: Duration, +) -> Result<(http_conn_pool::Send, http_conn_pool::Connect), LocalProxyConnError> { + // assumption: host is an ip address so this should not actually perform any requests. + // todo: add that assumption as a guarantee in the control-plane API. + let mut addrs = lookup_host((host, port)) + .await + .map_err(LocalProxyConnError::Io)?; + + let mut last_err = None; + + let stream = loop { + let Some(addr) = addrs.next() else { + return Err(last_err.unwrap_or_else(|| { + LocalProxyConnError::Io(io::Error::new( + io::ErrorKind::InvalidInput, + "could not resolve any addresses", + )) + })); + }; + + match tokio::time::timeout(timeout, TcpStream::connect(addr)).await { + Ok(Ok(stream)) => { + stream.set_nodelay(true).map_err(LocalProxyConnError::Io)?; + break stream; + } + Ok(Err(e)) => { + last_err = Some(LocalProxyConnError::Io(e)); + } + Err(e) => { + last_err = Some(LocalProxyConnError::Io(io::Error::new( + io::ErrorKind::TimedOut, + e, + ))); + } + }; + }; + + let (client, connection) = hyper1::client::conn::http2::Builder::new(TokioExecutor::new()) + .timer(TokioTimer::new()) + .keep_alive_interval(Duration::from_secs(20)) + .keep_alive_while_idle(true) + .keep_alive_timeout(Duration::from_secs(5)) + .handshake(TokioIo::new(stream)) + .await?; + + Ok((client, connection)) +} diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs new file mode 100644 index 0000000000..b31ed22a7c --- /dev/null +++ b/proxy/src/serverless/http_conn_pool.rs @@ -0,0 +1,342 @@ +use dashmap::DashMap; +use hyper1::client::conn::http2; +use hyper_util::rt::{TokioExecutor, TokioIo}; +use parking_lot::RwLock; +use rand::Rng; +use std::collections::VecDeque; +use std::sync::atomic::{self, AtomicUsize}; +use std::{sync::Arc, sync::Weak}; +use tokio::net::TcpStream; + +use crate::console::messages::{ColdStartInfo, MetricsAuxInfo}; +use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; +use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; +use crate::{context::RequestMonitoring, EndpointCacheKey}; + +use tracing::{debug, error}; +use tracing::{info, info_span, Instrument}; + +use super::conn_pool::ConnInfo; + +pub(crate) type Send = http2::SendRequest; +pub(crate) type Connect = + http2::Connection, hyper1::body::Incoming, TokioExecutor>; + +#[derive(Clone)] +struct ConnPoolEntry { + conn: Send, + conn_id: uuid::Uuid, + aux: MetricsAuxInfo, +} + +// Per-endpoint connection pool +// Number of open connections is limited by the `max_conns_per_endpoint`. +pub(crate) struct EndpointConnPool { + // TODO(conrad): + // either we should open more connections depending on stream count + // (not exposed by hyper, need our own counter) + // or we can change this to an Option rather than a VecDeque. + // + // Opening more connections to the same db because we run out of streams + // seems somewhat redundant though. + // + // Probably we should run a semaphore and just the single conn. TBD. + conns: VecDeque, + _guard: HttpEndpointPoolsGuard<'static>, + global_connections_count: Arc, +} + +impl EndpointConnPool { + fn get_conn_entry(&mut self) -> Option { + let Self { conns, .. } = self; + + loop { + let conn = conns.pop_front()?; + if !conn.conn.is_closed() { + conns.push_back(conn.clone()); + return Some(conn); + } + } + } + + fn remove_conn(&mut self, conn_id: uuid::Uuid) -> bool { + let Self { + conns, + global_connections_count, + .. + } = self; + + let old_len = conns.len(); + conns.retain(|conn| conn.conn_id != conn_id); + let new_len = conns.len(); + let removed = old_len - new_len; + if removed > 0 { + global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(removed as i64); + } + removed > 0 + } +} + +impl Drop for EndpointConnPool { + fn drop(&mut self) { + if !self.conns.is_empty() { + self.global_connections_count + .fetch_sub(self.conns.len(), atomic::Ordering::Relaxed); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(self.conns.len() as i64); + } + } +} + +pub(crate) struct GlobalConnPool { + // endpoint -> per-endpoint connection pool + // + // That should be a fairly conteded map, so return reference to the per-endpoint + // pool as early as possible and release the lock. + global_pool: DashMap>>, + + /// Number of endpoint-connection pools + /// + /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each. + /// That seems like far too much effort, so we're using a relaxed increment counter instead. + /// It's only used for diagnostics. + global_pool_size: AtomicUsize, + + /// Total number of connections in the pool + global_connections_count: Arc, + + config: &'static crate::config::HttpConfig, +} + +impl GlobalConnPool { + pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc { + let shards = config.pool_options.pool_shards; + Arc::new(Self { + global_pool: DashMap::with_shard_amount(shards), + global_pool_size: AtomicUsize::new(0), + config, + global_connections_count: Arc::new(AtomicUsize::new(0)), + }) + } + + pub(crate) fn shutdown(&self) { + // drops all strong references to endpoint-pools + self.global_pool.clear(); + } + + pub(crate) async fn gc_worker(&self, mut rng: impl Rng) { + let epoch = self.config.pool_options.gc_epoch; + let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32); + loop { + interval.tick().await; + + let shard = rng.gen_range(0..self.global_pool.shards().len()); + self.gc(shard); + } + } + + fn gc(&self, shard: usize) { + debug!(shard, "pool: performing epoch reclamation"); + + // acquire a random shard lock + let mut shard = self.global_pool.shards()[shard].write(); + + let timer = Metrics::get() + .proxy + .http_pool_reclaimation_lag_seconds + .start_timer(); + let current_len = shard.len(); + let mut clients_removed = 0; + shard.retain(|endpoint, x| { + // if the current endpoint pool is unique (no other strong or weak references) + // then it is currently not in use by any connections. + if let Some(pool) = Arc::get_mut(x.get_mut()) { + let EndpointConnPool { conns, .. } = pool.get_mut(); + + let old_len = conns.len(); + + conns.retain(|conn| !conn.conn.is_closed()); + + let new_len = conns.len(); + let removed = old_len - new_len; + clients_removed += removed; + + // we only remove this pool if it has no active connections + if conns.is_empty() { + info!("pool: discarding pool for endpoint {endpoint}"); + return false; + } + } + + true + }); + + let new_len = shard.len(); + drop(shard); + timer.observe(); + + // Do logging outside of the lock. + if clients_removed > 0 { + let size = self + .global_connections_count + .fetch_sub(clients_removed, atomic::Ordering::Relaxed) + - clients_removed; + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(clients_removed as i64); + info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}"); + } + let removed = current_len - new_len; + + if removed > 0 { + let global_pool_size = self + .global_pool_size + .fetch_sub(removed, atomic::Ordering::Relaxed) + - removed; + info!("pool: performed global pool gc. size now {global_pool_size}"); + } + } + + pub(crate) fn get( + self: &Arc, + ctx: &RequestMonitoring, + conn_info: &ConnInfo, + ) -> Option { + let endpoint = conn_info.endpoint_cache_key()?; + let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint); + let client = endpoint_pool.write().get_conn_entry()?; + + tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id)); + info!( + cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), + "pool: reusing connection '{conn_info}'" + ); + ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); + ctx.success(); + Some(Client::new(client.conn, client.aux)) + } + + fn get_or_create_endpoint_pool( + self: &Arc, + endpoint: &EndpointCacheKey, + ) -> Arc> { + // fast path + if let Some(pool) = self.global_pool.get(endpoint) { + return pool.clone(); + } + + // slow path + let new_pool = Arc::new(RwLock::new(EndpointConnPool { + conns: VecDeque::new(), + _guard: Metrics::get().proxy.http_endpoint_pools.guard(), + global_connections_count: self.global_connections_count.clone(), + })); + + // find or create a pool for this endpoint + let mut created = false; + let pool = self + .global_pool + .entry(endpoint.clone()) + .or_insert_with(|| { + created = true; + new_pool + }) + .clone(); + + // log new global pool size + if created { + let global_pool_size = self + .global_pool_size + .fetch_add(1, atomic::Ordering::Relaxed) + + 1; + info!( + "pool: created new pool for '{endpoint}', global pool size now {global_pool_size}" + ); + } + + pool + } +} + +pub(crate) fn poll_http2_client( + global_pool: Arc, + ctx: &RequestMonitoring, + conn_info: &ConnInfo, + client: Send, + connection: Connect, + conn_id: uuid::Uuid, + aux: MetricsAuxInfo, +) -> Client { + let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol()); + let session_id = ctx.session_id(); + + let span = info_span!(parent: None, "connection", %conn_id); + let cold_start_info = ctx.cold_start_info(); + span.in_scope(|| { + info!(cold_start_info = cold_start_info.as_str(), %conn_info, %session_id, "new connection"); + }); + + let pool = match conn_info.endpoint_cache_key() { + Some(endpoint) => { + let pool = global_pool.get_or_create_endpoint_pool(&endpoint); + + pool.write().conns.push_back(ConnPoolEntry { + conn: client.clone(), + conn_id, + aux: aux.clone(), + }); + + Arc::downgrade(&pool) + } + None => Weak::new(), + }; + + tokio::spawn( + async move { + let _conn_gauge = conn_gauge; + let res = connection.await; + match res { + Ok(()) => info!("connection closed"), + Err(e) => error!(%session_id, "connection error: {}", e), + } + + // remove from connection pool + if let Some(pool) = pool.clone().upgrade() { + if pool.write().remove_conn(conn_id) { + info!("closed connection removed"); + } + } + } + .instrument(span), + ); + + Client::new(client, aux) +} + +pub(crate) struct Client { + pub(crate) inner: Send, + aux: MetricsAuxInfo, +} + +impl Client { + pub(self) fn new(inner: Send, aux: MetricsAuxInfo) -> Self { + Self { inner, aux } + } + + pub(crate) fn metrics(&self) -> Arc { + USAGE_METRICS.register(Ids { + endpoint_id: self.aux.endpoint_id, + branch_id: self.aux.branch_id, + }) + } +} diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs index abf0ffe290..d766a46577 100644 --- a/proxy/src/serverless/http_util.rs +++ b/proxy/src/serverless/http_util.rs @@ -5,13 +5,13 @@ use bytes::Bytes; use anyhow::Context; use http::{Response, StatusCode}; -use http_body_util::Full; +use http_body_util::{combinators::BoxBody, BodyExt, Full}; use serde::Serialize; use utils::http::error::ApiError; /// Like [`ApiError::into_response`] -pub(crate) fn api_error_into_response(this: ApiError) -> Response> { +pub(crate) fn api_error_into_response(this: ApiError) -> Response> { match this { ApiError::BadRequest(err) => HttpErrorBody::response_from_msg_and_status( format!("{err:#?}"), // use debug printing so that we give the cause @@ -64,17 +64,24 @@ struct HttpErrorBody { impl HttpErrorBody { /// Same as [`utils::http::error::HttpErrorBody::response_from_msg_and_status`] - fn response_from_msg_and_status(msg: String, status: StatusCode) -> Response> { + fn response_from_msg_and_status( + msg: String, + status: StatusCode, + ) -> Response> { HttpErrorBody { msg }.to_response(status) } /// Same as [`utils::http::error::HttpErrorBody::to_response`] - fn to_response(&self, status: StatusCode) -> Response> { + fn to_response(&self, status: StatusCode) -> Response> { Response::builder() .status(status) .header(http::header::CONTENT_TYPE, "application/json") // we do not have nested maps with non string keys so serialization shouldn't fail - .body(Full::new(Bytes::from(serde_json::to_string(self).unwrap()))) + .body( + Full::new(Bytes::from(serde_json::to_string(self).unwrap())) + .map_err(|x| match x {}) + .boxed(), + ) .unwrap() } } @@ -83,14 +90,14 @@ impl HttpErrorBody { pub(crate) fn json_response( status: StatusCode, data: T, -) -> Result>, ApiError> { +) -> Result>, ApiError> { let json = serde_json::to_string(&data) .context("Failed to serialize JSON response") .map_err(ApiError::InternalServerError)?; let response = Response::builder() .status(status) .header(http::header::CONTENT_TYPE, "application/json") - .body(Full::new(Bytes::from(json))) + .body(Full::new(Bytes::from(json)).map_err(|x| match x {}).boxed()) .map_err(|e| ApiError::InternalServerError(e.into()))?; Ok(response) } diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 7c78439a0a..f3a7ed9329 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -8,6 +8,8 @@ use futures::future::Either; use futures::StreamExt; use futures::TryFutureExt; use http::header::AUTHORIZATION; +use http::Method; +use http_body_util::combinators::BoxBody; use http_body_util::BodyExt; use http_body_util::Full; use hyper1::body::Body; @@ -38,9 +40,11 @@ use url::Url; use urlencoding; use utils::http::error::ApiError; +use crate::auth::backend::ComputeCredentials; use crate::auth::backend::ComputeUserInfo; use crate::auth::endpoint_sni; use crate::auth::ComputeUserInfoParseError; +use crate::config::AuthenticationConfig; use crate::config::ProxyConfig; use crate::config::TlsConfig; use crate::context::RequestMonitoring; @@ -56,6 +60,7 @@ use crate::usage_metrics::MetricCounterRecorder; use crate::DbName; use crate::RoleName; +use super::backend::LocalProxyConnError; use super::backend::PoolingBackend; use super::conn_pool::AuthData; use super::conn_pool::Client; @@ -123,8 +128,8 @@ pub(crate) enum ConnInfoError { MissingUsername, #[error("invalid username: {0}")] InvalidUsername(#[from] std::string::FromUtf8Error), - #[error("missing password")] - MissingPassword, + #[error("missing authentication credentials: {0}")] + MissingCredentials(Credentials), #[error("missing hostname")] MissingHostname, #[error("invalid hostname: {0}")] @@ -133,6 +138,14 @@ pub(crate) enum ConnInfoError { MalformedEndpoint, } +#[derive(Debug, thiserror::Error)] +pub(crate) enum Credentials { + #[error("required password")] + Password, + #[error("required authorization bearer token in JWT format")] + BearerJwt, +} + impl ReportableError for ConnInfoError { fn get_error_kind(&self) -> ErrorKind { ErrorKind::User @@ -146,6 +159,7 @@ impl UserFacingError for ConnInfoError { } fn get_conn_info( + config: &'static AuthenticationConfig, ctx: &RequestMonitoring, headers: &HeaderMap, tls: Option<&TlsConfig>, @@ -181,21 +195,32 @@ fn get_conn_info( ctx.set_user(username.clone()); let auth = if let Some(auth) = headers.get(&AUTHORIZATION) { + if !config.accept_jwts { + return Err(ConnInfoError::MissingCredentials(Credentials::Password)); + } + let auth = auth .to_str() .map_err(|_| ConnInfoError::InvalidHeader(&AUTHORIZATION))?; AuthData::Jwt( auth.strip_prefix("Bearer ") - .ok_or(ConnInfoError::MissingPassword)? + .ok_or(ConnInfoError::MissingCredentials(Credentials::BearerJwt))? .into(), ) } else if let Some(pass) = connection_url.password() { + // wrong credentials provided + if config.accept_jwts { + return Err(ConnInfoError::MissingCredentials(Credentials::BearerJwt)); + } + AuthData::Password(match urlencoding::decode_binary(pass.as_bytes()) { std::borrow::Cow::Borrowed(b) => b.into(), std::borrow::Cow::Owned(b) => b.into(), }) + } else if config.accept_jwts { + return Err(ConnInfoError::MissingCredentials(Credentials::BearerJwt)); } else { - return Err(ConnInfoError::MissingPassword); + return Err(ConnInfoError::MissingCredentials(Credentials::Password)); }; let endpoint = match connection_url.host() { @@ -247,7 +272,7 @@ pub(crate) async fn handle( request: Request, backend: Arc, cancel: CancellationToken, -) -> Result>, ApiError> { +) -> Result>, ApiError> { let result = handle_inner(cancel, config, &ctx, request, backend).await; let mut response = match result { @@ -279,7 +304,7 @@ pub(crate) async fn handle( let mut message = e.to_string_client(); let db_error = match &e { - SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(e)) + SqlOverHttpError::ConnectCompute(HttpConnError::PostgresConnectionError(e)) | SqlOverHttpError::Postgres(e) => e.as_db_error(), _ => None, }; @@ -504,7 +529,7 @@ async fn handle_inner( ctx: &RequestMonitoring, request: Request, backend: Arc, -) -> Result>, SqlOverHttpError> { +) -> Result>, SqlOverHttpError> { let _requeset_gauge = Metrics::get() .proxy .connection_requests @@ -514,18 +539,50 @@ async fn handle_inner( "handling interactive connection from client" ); - // - // Determine the destination and connection params - // - let headers = request.headers(); - - // TLS config should be there. - let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref())?; + let conn_info = get_conn_info( + &config.authentication_config, + ctx, + request.headers(), + config.tls_config.as_ref(), + )?; info!( user = conn_info.conn_info.user_info.user.as_str(), "credentials" ); + match conn_info.auth { + AuthData::Jwt(jwt) if config.authentication_config.is_auth_broker => { + handle_auth_broker_inner(config, ctx, request, conn_info.conn_info, jwt, backend).await + } + auth => { + handle_db_inner( + cancel, + config, + ctx, + request, + conn_info.conn_info, + auth, + backend, + ) + .await + } + } +} + +async fn handle_db_inner( + cancel: CancellationToken, + config: &'static ProxyConfig, + ctx: &RequestMonitoring, + request: Request, + conn_info: ConnInfo, + auth: AuthData, + backend: Arc, +) -> Result>, SqlOverHttpError> { + // + // Determine the destination and connection params + // + let headers = request.headers(); + // Allow connection pooling only if explicitly requested // or if we have decided that http pool is no longer opt-in let allow_pool = !config.http_config.pool_options.opt_in @@ -563,26 +620,36 @@ async fn handle_inner( let authenticate_and_connect = Box::pin( async { - let keys = match &conn_info.auth { + let keys = match auth { AuthData::Password(pw) => { backend .authenticate_with_password( ctx, &config.authentication_config, - &conn_info.conn_info.user_info, - pw, + &conn_info.user_info, + &pw, ) .await? } AuthData::Jwt(jwt) => { backend - .authenticate_with_jwt(ctx, &conn_info.conn_info.user_info, jwt) - .await? + .authenticate_with_jwt( + ctx, + &config.authentication_config, + &conn_info.user_info, + jwt, + ) + .await?; + + ComputeCredentials { + info: conn_info.user_info.clone(), + keys: crate::auth::backend::ComputeCredentialKeys::None, + } } }; let client = backend - .connect_to_compute(ctx, conn_info.conn_info, keys, !allow_pool) + .connect_to_compute(ctx, conn_info, keys, !allow_pool) .await?; // not strictly necessary to mark success here, // but it's just insurance for if we forget it somewhere else @@ -640,7 +707,11 @@ async fn handle_inner( let len = json_output.len(); let response = response - .body(Full::new(Bytes::from(json_output))) + .body( + Full::new(Bytes::from(json_output)) + .map_err(|x| match x {}) + .boxed(), + ) // only fails if invalid status code or invalid header/values are given. // these are not user configurable so it cannot fail dynamically .expect("building response payload should not fail"); @@ -656,6 +727,65 @@ async fn handle_inner( Ok(response) } +static HEADERS_TO_FORWARD: &[&HeaderName] = &[ + &AUTHORIZATION, + &CONN_STRING, + &RAW_TEXT_OUTPUT, + &ARRAY_MODE, + &TXN_ISOLATION_LEVEL, + &TXN_READ_ONLY, + &TXN_DEFERRABLE, +]; + +async fn handle_auth_broker_inner( + config: &'static ProxyConfig, + ctx: &RequestMonitoring, + request: Request, + conn_info: ConnInfo, + jwt: String, + backend: Arc, +) -> Result>, SqlOverHttpError> { + backend + .authenticate_with_jwt( + ctx, + &config.authentication_config, + &conn_info.user_info, + jwt, + ) + .await + .map_err(HttpConnError::from)?; + + let mut client = backend.connect_to_local_proxy(ctx, conn_info).await?; + + let local_proxy_uri = ::http::Uri::from_static("http://proxy.local/sql"); + + let (mut parts, body) = request.into_parts(); + let mut req = Request::builder().method(Method::POST).uri(local_proxy_uri); + + // todo(conradludgate): maybe auth-broker should parse these and re-serialize + // these instead just to ensure they remain normalised. + for &h in HEADERS_TO_FORWARD { + if let Some(hv) = parts.headers.remove(h) { + req = req.header(h, hv); + } + } + + let req = req + .body(body) + .expect("all headers and params received via hyper should be valid for request"); + + // todo: map body to count egress + let _metrics = client.metrics(); + + Ok(client + .inner + .send_request(req) + .await + .map_err(LocalProxyConnError::from) + .map_err(HttpConnError::from)? + .map(|b| b.boxed())) +} + impl QueryData { async fn process( self, @@ -705,7 +835,9 @@ impl QueryData { // query failed or was cancelled. Ok(Err(error)) => { let db_error = match &error { - SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(e)) + SqlOverHttpError::ConnectCompute( + HttpConnError::PostgresConnectionError(e), + ) | SqlOverHttpError::Postgres(e) => e.as_db_error(), _ => None, }; From 65bda19051c604edfdbdb94dabef8bc009c2c13f Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 1 Oct 2024 01:07:43 +0300 Subject: [PATCH 11/27] Remove unnecessary dev package from compute image (#9210) libcurl4-openssl-dev is needed to build pgxn/, but libcurl4 is enough at runtime. --- compute/Dockerfile.compute-node | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node index 2c647a669c..8febf0aad6 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/Dockerfile.compute-node @@ -1258,7 +1258,7 @@ RUN apt update && \ libxml2 \ libxslt1.1 \ libzstd1 \ - libcurl4-openssl-dev \ + libcurl4 \ locales \ procps \ ca-certificates \ From 651ae44569cf31a47b69b004f5444c79aa12f9cb Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 1 Oct 2024 08:57:22 +0100 Subject: [PATCH 12/27] storage controller: drop out of blocking compute notification loop if migration origin becomes unavailable (#9147) ## Problem The live migration code waits forever for the compute notification hook, on the basis that until it succeeds, the compute is probably using the old location and we shouldn't detach it. However, if a pageserver stops or restarts in the background, then this original location might no longer be available, so there is no point waiting. Waiting is also actively harmful, because it prevents other reconciliations happening for the tenant shard, such as during an upgrade where a stuck "drain" migration might prevent the later "fill" migration from moving the shard back to its original location. ## Summary of changes - Refactor the notification wait loop into a function - Add a checks during the loop, for the origin node's cancellation token and an explicit HTTP request to the origin node to confirm the shard is still attached there. Closes: https://github.com/neondatabase/neon/issues/8901 --- storage_controller/src/reconciler.rs | 136 ++++++++++++++--- storage_controller/src/service.rs | 7 +- .../regress/test_storage_controller.py | 143 ++++++++++++++++++ 3 files changed, 261 insertions(+), 25 deletions(-) diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 2c42da4043..1e7d7adffe 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -572,30 +572,7 @@ impl Reconciler { // During a live migration it is unhelpful to proceed if we couldn't notify compute: if we detach // the origin without notifying compute, we will render the tenant unavailable. - let mut notify_attempts = 0; - while let Err(e) = self.compute_notify().await { - match e { - NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)), - NotifyError::ShuttingDown => return Err(ReconcileError::Cancel), - _ => { - tracing::warn!( - "Live migration blocked by compute notification error, retrying: {e}" - ); - } - } - - exponential_backoff( - notify_attempts, - // Generous waits: control plane operations which might be blocking us usually complete on the order - // of hundreds to thousands of milliseconds, so no point busy polling. - 1.0, - 10.0, - &self.cancel, - ) - .await; - notify_attempts += 1; - } - + self.compute_notify_blocking(&origin_ps).await?; pausable_failpoint!("reconciler-live-migrate-post-notify"); // Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Attached(0), then @@ -869,6 +846,117 @@ impl Reconciler { Ok(()) } } + + /// Keep trying to notify the compute indefinitely, only dropping out if: + /// - the node `origin` becomes unavailable -> Ok(()) + /// - the node `origin` no longer has our tenant shard attached -> Ok(()) + /// - our cancellation token fires -> Err(ReconcileError::Cancelled) + /// + /// This is used during live migration, where we do not wish to detach + /// an origin location until the compute definitely knows about the new + /// location. + /// + /// In cases where the origin node becomes unavailable, we return success, indicating + /// to the caller that they should continue irrespective of whether the compute was notified, + /// because the origin node is unusable anyway. Notification will be retried later via the + /// [`Self::compute_notify_failure`] flag. + async fn compute_notify_blocking(&mut self, origin: &Node) -> Result<(), ReconcileError> { + let mut notify_attempts = 0; + while let Err(e) = self.compute_notify().await { + match e { + NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)), + NotifyError::ShuttingDown => return Err(ReconcileError::Cancel), + _ => { + tracing::warn!( + "Live migration blocked by compute notification error, retrying: {e}" + ); + } + } + + // Did the origin pageserver become unavailable? + if !origin.is_available() { + tracing::info!("Giving up on compute notification because {origin} is unavailable"); + break; + } + + // Does the origin pageserver still host the shard we are interested in? We should only + // continue waiting for compute notification to be acked if the old location is still usable. + let tenant_shard_id = self.tenant_shard_id; + match origin + .with_client_retries( + |client| async move { client.get_location_config(tenant_shard_id).await }, + &self.service_config.jwt_token, + 1, + 3, + Duration::from_secs(5), + &self.cancel, + ) + .await + { + Some(Ok(Some(location_conf))) => { + if matches!( + location_conf.mode, + LocationConfigMode::AttachedMulti + | LocationConfigMode::AttachedSingle + | LocationConfigMode::AttachedStale + ) { + tracing::debug!( + "Still attached to {origin}, will wait & retry compute notification" + ); + } else { + tracing::info!( + "Giving up on compute notification because {origin} is in state {:?}", + location_conf.mode + ); + return Ok(()); + } + // Fall through + } + Some(Ok(None)) => { + tracing::info!( + "No longer attached to {origin}, giving up on compute notification" + ); + return Ok(()); + } + Some(Err(e)) => { + match e { + mgmt_api::Error::Cancelled => { + tracing::info!( + "Giving up on compute notification because {origin} is unavailable" + ); + return Ok(()); + } + mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _) => { + tracing::info!( + "No longer attached to {origin}, giving up on compute notification" + ); + return Ok(()); + } + e => { + // Other API errors are unexpected here. + tracing::warn!("Unexpected error checking location on {origin}: {e}"); + + // Fall through, we will retry compute notification. + } + } + } + None => return Err(ReconcileError::Cancel), + }; + + exponential_backoff( + notify_attempts, + // Generous waits: control plane operations which might be blocking us usually complete on the order + // of hundreds to thousands of milliseconds, so no point busy polling. + 1.0, + 10.0, + &self.cancel, + ) + .await; + notify_attempts += 1; + } + + Ok(()) + } } /// We tweak the externally-set TenantConfig while configuring diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index a5e0129684..851db97310 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -4974,7 +4974,12 @@ impl Service { { let mut nodes_mut = (**nodes).clone(); - nodes_mut.remove(&node_id); + if let Some(mut removed_node) = nodes_mut.remove(&node_id) { + // Ensure that any reconciler holding an Arc<> to this node will + // drop out when trying to RPC to it (setting Offline state sets the + // cancellation token on the Node object). + removed_node.set_availability(NodeAvailability::Offline); + } *nodes = Arc::new(nodes_mut); } } diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 3861f0b822..789623cb27 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -567,6 +567,149 @@ def test_storage_controller_compute_hook( env.storage_controller.consistency_check() +def test_storage_controller_stuck_compute_hook( + httpserver: HTTPServer, + neon_env_builder: NeonEnvBuilder, + httpserver_listen_address, +): + """ + Test the migration process's behavior when the compute hook does not enable it to proceed + """ + + neon_env_builder.num_pageservers = 2 + (host, port) = httpserver_listen_address + neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + + handle_params = {"status": 200} + + notifications = [] + + def handler(request: Request): + status = handle_params["status"] + log.info(f"Notify request[{status}]: {request}") + notifications.append(request.json) + return Response(status=status) + + httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) + + # Start running + env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"}) + + # Initial notification from tenant creation + assert len(notifications) == 1 + expect: Dict[str, Union[List[Dict[str, int]], str, None, int]] = { + "tenant_id": str(env.initial_tenant), + "stripe_size": None, + "shards": [{"node_id": int(env.pageservers[0].id), "shard_number": 0}], + } + assert notifications[0] == expect + + # Do a migration while the compute hook is returning 423 status + tenant_id = env.initial_tenant + origin_pageserver = env.get_tenant_pageserver(tenant_id) + dest_ps_id = [p.id for p in env.pageservers if p.id != origin_pageserver.id][0] + dest_pageserver = env.get_pageserver(dest_ps_id) + shard_0_id = TenantShardId(tenant_id, 0, 0) + + NOTIFY_BLOCKED_LOG = ".*Live migration blocked.*" + env.storage_controller.allowed_errors.extend( + [ + NOTIFY_BLOCKED_LOG, + ".*Failed to notify compute.*", + ".*Reconcile error.*Cancelled", + ".*Reconcile error.*Control plane tenant busy", + ] + ) + + with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: + # We expect the controller to hit the 423 (locked) and retry. Migration shouldn't complete until that + # status is cleared. + handle_params["status"] = 423 + migrate_fut = executor.submit( + env.storage_controller.tenant_shard_migrate, shard_0_id, dest_ps_id + ) + + def logged_stuck(): + env.storage_controller.assert_log_contains(NOTIFY_BLOCKED_LOG) + + wait_until(10, 0.25, logged_stuck) + contains_r = env.storage_controller.log_contains(NOTIFY_BLOCKED_LOG) + assert contains_r is not None # Appease mypy + (_, log_cursor) = contains_r + assert migrate_fut.running() + + # Permit the compute hook to proceed + handle_params["status"] = 200 + migrate_fut.result(timeout=10) + + # Advance log cursor past the last 'stuck' message (we already waited for one, but + # there could be more than one) + while True: + contains_r = env.storage_controller.log_contains(NOTIFY_BLOCKED_LOG, offset=log_cursor) + if contains_r is None: + break + else: + (_, log_cursor) = contains_r + + # Now, do a migration in the opposite direction + handle_params["status"] = 423 + migrate_fut = executor.submit( + env.storage_controller.tenant_shard_migrate, shard_0_id, origin_pageserver.id + ) + + def logged_stuck_again(): + env.storage_controller.assert_log_contains(NOTIFY_BLOCKED_LOG, offset=log_cursor) + + wait_until(10, 0.25, logged_stuck_again) + assert migrate_fut.running() + + # This time, the compute hook remains stuck, but we mark the origin node offline: this should + # also allow the migration to complete -- we only wait for the compute hook as long as we think + # the old location is still usable for computes. + # This is a regression test for issue https://github.com/neondatabase/neon/issues/8901 + dest_pageserver.stop() + env.storage_controller.node_configure(dest_ps_id, {"availability": "Offline"}) + + try: + migrate_fut.result(timeout=10) + except StorageControllerApiException as e: + # The reconciler will fail because it can't detach from the origin: the important + # thing is that it finishes, rather than getting stuck in the compute notify loop. + assert "Reconcile error" in str(e) + + # A later background reconciliation will clean up and leave things in a neat state, even + # while the compute hook is still blocked + try: + env.storage_controller.reconcile_all() + except StorageControllerApiException as e: + # We expect that the reconciler will do its work, but be unable to fully succeed + # because it can't send a compute notification. It will complete, but leave + # the internal flag set for "retry compute notification later" + assert "Control plane tenant busy" in str(e) + + # Confirm that we are AttachedSingle on the node we last called the migrate API for + loc = origin_pageserver.http_client().tenant_get_location(shard_0_id) + assert loc["mode"] == "AttachedSingle" + + # When the origin node comes back, it should get cleaned up + dest_pageserver.start() + try: + env.storage_controller.reconcile_all() + except StorageControllerApiException as e: + # Compute hook is still blocked: reconciler will configure PS but not fully succeed + assert "Control plane tenant busy" in str(e) + + with pytest.raises(PageserverApiException, match="Tenant shard not found"): + dest_pageserver.http_client().tenant_get_location(shard_0_id) + + # Once the compute hook is unblocked, we should be able to get into a totally + # quiescent state again + handle_params["status"] = 200 + env.storage_controller.reconcile_until_idle() + + env.storage_controller.consistency_check() + + def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): """ Verify that occasional-use debug APIs work as expected. This is a lightweight test From 2e508b1ff9ea1519e405a74ddb35705fe76eb696 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Tue, 1 Oct 2024 11:02:54 +0200 Subject: [PATCH 13/27] Upgrade OpenTelemetry and other tracing crates (#9200) * tracing-utils now returns a `Layer` impl. Removes the need for crates to import OTel crates. * Drop the /v1/traces URI check. Verified that the code does the right thing. * Leave a TODO to hook in an error handler for OTel to log errors to when it assumes the regular pipeline cannot be used/is broken. --- Cargo.lock | 270 +++++++++++++++------------ Cargo.toml | 13 +- compute_tools/Cargo.toml | 1 + compute_tools/src/bin/compute_ctl.rs | 2 +- compute_tools/src/logger.rs | 4 +- libs/tracing-utils/Cargo.toml | 6 +- libs/tracing-utils/src/lib.rs | 85 ++++----- proxy/Cargo.toml | 1 - proxy/src/logging.rs | 5 +- workspace_hack/Cargo.toml | 11 +- 10 files changed, 199 insertions(+), 199 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2bd828367c..2fec370b17 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -90,9 +90,9 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.0" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41ed9a86bf92ae6580e0a31281f65a1b1d867c0cc68d5346e2ae128dddfa6a7d" +checksum = "1bec1de6f59aedf83baf9ff929c98f2ad654b97c9510f4e70cf6f661d49fd5b1" [[package]] name = "anstyle-parse" @@ -1223,6 +1223,7 @@ dependencies = [ "notify", "num_cpus", "opentelemetry", + "opentelemetry_sdk", "postgres", "regex", "remote_storage", @@ -1876,9 +1877,9 @@ dependencies = [ [[package]] name = "env_logger" -version = "0.10.0" +version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85cdab6a89accf66733ad5a1693a4dcced6aeff64602b634530dd73c1f3ee9f0" +checksum = "4cd405aab171cb85d6735e5c8d9db038c17d3ca007a4d2c25f337935c3d90580" dependencies = [ "humantime", "is-terminal", @@ -3368,102 +3369,82 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "opentelemetry" -version = "0.20.0" +version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9591d937bc0e6d2feb6f71a559540ab300ea49955229c347a517a28d27784c54" +checksum = "4c365a63eec4f55b7efeceb724f1336f26a9cf3427b70e59e2cd2a5b947fba96" dependencies = [ - "opentelemetry_api", - "opentelemetry_sdk", -] - -[[package]] -name = "opentelemetry-http" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7594ec0e11d8e33faf03530a4c49af7064ebba81c1480e01be67d90b356508b" -dependencies = [ - "async-trait", - "bytes", - "http 0.2.9", - "opentelemetry_api", - "reqwest 0.11.19", -] - -[[package]] -name = "opentelemetry-otlp" -version = "0.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e5e5a5c4135864099f3faafbe939eb4d7f9b80ebf68a8448da961b32a7c1275" -dependencies = [ - "async-trait", "futures-core", - "http 0.2.9", - "opentelemetry-http", - "opentelemetry-proto", - "opentelemetry-semantic-conventions", - "opentelemetry_api", - "opentelemetry_sdk", - "prost", - "reqwest 0.11.19", - "thiserror", - "tokio", - "tonic", -] - -[[package]] -name = "opentelemetry-proto" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1e3f814aa9f8c905d0ee4bde026afd3b2577a97c10e1699912e3e44f0c4cbeb" -dependencies = [ - "opentelemetry_api", - "opentelemetry_sdk", - "prost", - "tonic", -] - -[[package]] -name = "opentelemetry-semantic-conventions" -version = "0.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73c9f9340ad135068800e7f1b24e9e09ed9e7143f5bf8518ded3d3ec69789269" -dependencies = [ - "opentelemetry", -] - -[[package]] -name = "opentelemetry_api" -version = "0.20.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a81f725323db1b1206ca3da8bb19874bbd3f57c3bcd59471bfb04525b265b9b" -dependencies = [ - "futures-channel", - "futures-util", - "indexmap 1.9.3", + "futures-sink", "js-sys", "once_cell", "pin-project-lite", "thiserror", - "urlencoding", ] [[package]] -name = "opentelemetry_sdk" -version = "0.20.0" +name = "opentelemetry-http" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa8e705a0612d48139799fcbaba0d4a90f06277153e43dd2bdc16c6f0edd8026" +checksum = "ad31e9de44ee3538fb9d64fe3376c1362f406162434609e79aea2a41a0af78ab" +dependencies = [ + "async-trait", + "bytes", + "http 1.1.0", + "opentelemetry", + "reqwest 0.12.4", +] + +[[package]] +name = "opentelemetry-otlp" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b925a602ffb916fb7421276b86756027b37ee708f9dce2dbdcc51739f07e727" +dependencies = [ + "async-trait", + "futures-core", + "http 1.1.0", + "opentelemetry", + "opentelemetry-http", + "opentelemetry-proto", + "opentelemetry_sdk", + "prost 0.13.3", + "reqwest 0.12.4", + "thiserror", +] + +[[package]] +name = "opentelemetry-proto" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30ee9f20bff9c984511a02f082dc8ede839e4a9bf15cc2487c8d6fea5ad850d9" +dependencies = [ + "opentelemetry", + "opentelemetry_sdk", + "prost 0.13.3", + "tonic 0.12.2", +] + +[[package]] +name = "opentelemetry-semantic-conventions" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cefe0543875379e47eb5f1e68ff83f45cc41366a92dfd0d073d513bf68e9a05" + +[[package]] +name = "opentelemetry_sdk" +version = "0.24.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "692eac490ec80f24a17828d49b40b60f5aeaccdfe6a503f939713afd22bc28df" dependencies = [ "async-trait", - "crossbeam-channel", "futures-channel", "futures-executor", "futures-util", + "glob", "once_cell", - "opentelemetry_api", - "ordered-float 3.9.2", + "opentelemetry", "percent-encoding", "rand 0.8.5", - "regex", "serde_json", "thiserror", "tokio", @@ -3479,15 +3460,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "ordered-float" -version = "3.9.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1e1c390732d15f1d48471625cd92d154e66db2c56645e29a9cd26f4699f72dc" -dependencies = [ - "num-traits", -] - [[package]] name = "ordered-multimap" version = "0.7.3" @@ -4230,7 +4202,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b82eaa1d779e9a4bc1c3217db8ffbeabaae1dca241bf70183242128d48681cd" dependencies = [ "bytes", - "prost-derive", + "prost-derive 0.11.9", +] + +[[package]] +name = "prost" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b0487d90e047de87f984913713b85c601c05609aad5b0df4b4573fbf69aa13f" +dependencies = [ + "bytes", + "prost-derive 0.13.3", ] [[package]] @@ -4247,7 +4229,7 @@ dependencies = [ "multimap", "petgraph", "prettyplease 0.1.25", - "prost", + "prost 0.11.9", "prost-types", "regex", "syn 1.0.109", @@ -4268,13 +4250,26 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "prost-derive" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9552f850d5f0964a4e4d0bf306459ac29323ddfbae05e35a7c0d35cb0803cc5" +dependencies = [ + "anyhow", + "itertools 0.12.1", + "proc-macro2", + "quote", + "syn 2.0.52", +] + [[package]] name = "prost-types" version = "0.11.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "213622a1460818959ac1181aaeb2dc9c7f63df720db7d788b3e24eacd1983e13" dependencies = [ - "prost", + "prost 0.11.9", ] [[package]] @@ -4371,7 +4366,6 @@ dependencies = [ "tokio-tungstenite", "tokio-util", "tracing", - "tracing-opentelemetry", "tracing-subscriber", "tracing-utils", "try-lock", @@ -4816,9 +4810,9 @@ dependencies = [ [[package]] name = "reqwest-tracing" -version = "0.5.0" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b253954a1979e02eabccd7e9c3d61d8f86576108baa160775e7f160bb4e800a3" +checksum = "bfdd9bfa64c72233d8dd99ab7883efcdefe9e16d46488ecb9228b71a2e2ceb45" dependencies = [ "anyhow", "async-trait", @@ -5703,9 +5697,9 @@ dependencies = [ "metrics", "once_cell", "parking_lot 0.12.1", - "prost", + "prost 0.11.9", "tokio", - "tonic", + "tonic 0.9.2", "tonic-build", "tracing", "utils", @@ -6029,7 +6023,7 @@ checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" dependencies = [ "byteorder", "integer-encoding", - "ordered-float 2.10.1", + "ordered-float", ] [[package]] @@ -6131,9 +6125,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.37.0" +version = "1.38.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1adbebffeca75fcfd058afa480fb6c0b81e165a0323f9c9d39c9697e37c46787" +checksum = "eb2caba9f80616f438e09748d5acda951967e1ea58508ef53d9c6402485a46df" dependencies = [ "backtrace", "bytes", @@ -6175,9 +6169,9 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "2.2.0" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" +checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" dependencies = [ "proc-macro2", "quote", @@ -6352,7 +6346,7 @@ dependencies = [ "hyper-timeout", "percent-encoding", "pin-project", - "prost", + "prost 0.11.9", "rustls-native-certs 0.6.2", "rustls-pemfile 1.0.2", "tokio", @@ -6364,6 +6358,27 @@ dependencies = [ "tracing", ] +[[package]] +name = "tonic" +version = "0.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6f6ba989e4b2c58ae83d862d3a3e27690b6e3ae630d0deb59f3697f32aa88ad" +dependencies = [ + "async-trait", + "base64 0.22.1", + "bytes", + "http 1.1.0", + "http-body 1.0.0", + "http-body-util", + "percent-encoding", + "pin-project", + "prost 0.13.3", + "tokio-stream", + "tower-layer", + "tower-service", + "tracing", +] + [[package]] name = "tonic-build" version = "0.9.2" @@ -6411,11 +6426,10 @@ checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" [[package]] name = "tracing" -version = "0.1.37" +version = "0.1.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ce8c33a8d48bd45d624a6e523445fd21ec13d3653cd51f681abf67418f54eb8" +checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" dependencies = [ - "cfg-if", "log", "pin-project-lite", "tracing-attributes", @@ -6435,9 +6449,9 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.24" +version = "0.1.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74" +checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", @@ -6446,9 +6460,9 @@ dependencies = [ [[package]] name = "tracing-core" -version = "0.1.31" +version = "0.1.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0955b8137a1df6f1a2e9a37d8a6656291ff0297c1a97c24e0d8425fe2312f79a" +checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" dependencies = [ "once_cell", "valuable", @@ -6466,21 +6480,22 @@ dependencies = [ [[package]] name = "tracing-log" -version = "0.1.3" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78ddad33d2d10b1ed7eb9d1f518a5674713876e97e5bb9b7345a7984fbb4f922" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" dependencies = [ - "lazy_static", "log", + "once_cell", "tracing-core", ] [[package]] name = "tracing-opentelemetry" -version = "0.21.0" +version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75327c6b667828ddc28f5e3f169036cb793c3f588d83bf0f262a7f062ffed3c8" +checksum = "a9784ed4da7d921bc8df6963f8c80a0e4ce34ba6ba76668acadd3edbd985ff3b" dependencies = [ + "js-sys", "once_cell", "opentelemetry", "opentelemetry_sdk", @@ -6489,6 +6504,7 @@ dependencies = [ "tracing-core", "tracing-log", "tracing-subscriber", + "web-time", ] [[package]] @@ -6503,9 +6519,9 @@ dependencies = [ [[package]] name = "tracing-subscriber" -version = "0.3.17" +version = "0.3.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77" +checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b" dependencies = [ "matchers", "once_cell", @@ -6529,6 +6545,7 @@ dependencies = [ "opentelemetry", "opentelemetry-otlp", "opentelemetry-semantic-conventions", + "opentelemetry_sdk", "tokio", "tracing", "tracing-opentelemetry", @@ -6984,6 +7001,16 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + [[package]] name = "webpki-roots" version = "0.25.2" @@ -7253,7 +7280,6 @@ dependencies = [ "chrono", "clap", "clap_builder", - "crossbeam-utils", "crypto-bigint 0.5.5", "der 0.7.8", "deranged", @@ -7286,13 +7312,12 @@ dependencies = [ "once_cell", "parquet", "proc-macro2", - "prost", + "prost 0.11.9", "quote", "rand 0.8.5", "regex", "regex-automata 0.4.3", "regex-syntax 0.8.2", - "reqwest 0.11.19", "reqwest 0.12.4", "rustls 0.21.11", "scopeguard", @@ -7313,12 +7338,9 @@ dependencies = [ "tokio-rustls 0.24.0", "tokio-util", "toml_edit", - "tonic", "tower", "tracing", "tracing-core", - "tracing-log", - "tracing-subscriber", "url", "uuid", "zeroize", diff --git a/Cargo.toml b/Cargo.toml index a788dcf3cb..ebaabab55f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -116,9 +116,10 @@ notify = "6.0.0" num_cpus = "1.15" num-traits = "0.2.15" once_cell = "1.13" -opentelemetry = "0.20.0" -opentelemetry-otlp = { version = "0.13.0", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } -opentelemetry-semantic-conventions = "0.12.0" +opentelemetry = "0.24" +opentelemetry_sdk = "0.24" +opentelemetry-otlp = { version = "0.17", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } +opentelemetry-semantic-conventions = "0.16" parking_lot = "0.12" parquet = { version = "53", default-features = false, features = ["zstd"] } parquet_derive = "53" @@ -131,7 +132,7 @@ rand = "0.8" redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] } regex = "1.10.2" reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] } -reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_20"] } +reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_24"] } reqwest-middleware = "0.3.0" reqwest-retry = "0.5" routerify = "3" @@ -177,8 +178,8 @@ toml_edit = "0.22" tonic = {version = "0.9", features = ["tls", "tls-roots"]} tower-service = "0.3.2" tracing = "0.1" -tracing-error = "0.2.0" -tracing-opentelemetry = "0.21.0" +tracing-error = "0.2" +tracing-opentelemetry = "0.25" tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } try-lock = "0.2.5" twox-hash = { version = "1.6.3", default-features = false } diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 00a82e4be6..b6d84d7eff 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -21,6 +21,7 @@ nix.workspace = true notify.workspace = true num_cpus.workspace = true opentelemetry.workspace = true +opentelemetry_sdk.workspace = true postgres.workspace = true regex.workspace = true serde_json.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index b10638c454..109d315d67 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -218,7 +218,7 @@ fn startup_context_from_env() -> Option { } if !startup_tracing_carrier.is_empty() { use opentelemetry::propagation::TextMapPropagator; - use opentelemetry::sdk::propagation::TraceContextPropagator; + use opentelemetry_sdk::propagation::TraceContextPropagator; let guard = TraceContextPropagator::new() .extract(&startup_tracing_carrier) .attach(); diff --git a/compute_tools/src/logger.rs b/compute_tools/src/logger.rs index 84be5b0809..00be5c13f9 100644 --- a/compute_tools/src/logger.rs +++ b/compute_tools/src/logger.rs @@ -1,4 +1,3 @@ -use tracing_opentelemetry::OpenTelemetryLayer; use tracing_subscriber::layer::SubscriberExt; use tracing_subscriber::prelude::*; @@ -23,8 +22,7 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> { .with_writer(std::io::stderr); // Initialize OpenTelemetry - let otlp_layer = - tracing_utils::init_tracing_without_runtime("compute_ctl").map(OpenTelemetryLayer::new); + let otlp_layer = tracing_utils::init_tracing_without_runtime("compute_ctl"); // Put it all together tracing_subscriber::registry() diff --git a/libs/tracing-utils/Cargo.toml b/libs/tracing-utils/Cargo.toml index 05eb538d42..66f21cd1ef 100644 --- a/libs/tracing-utils/Cargo.toml +++ b/libs/tracing-utils/Cargo.toml @@ -6,12 +6,14 @@ license.workspace = true [dependencies] hyper.workspace = true -opentelemetry = { workspace = true, features=["rt-tokio"] } -opentelemetry-otlp = { workspace = true, default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } +opentelemetry = { workspace = true, features = ["trace"] } +opentelemetry_sdk = { workspace = true, features = ["rt-tokio"] } +opentelemetry-otlp = { workspace = true, default-features = false, features = ["http-proto", "trace", "http", "reqwest-client"] } opentelemetry-semantic-conventions.workspace = true tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tracing.workspace = true tracing-opentelemetry.workspace = true +tracing-subscriber.workspace = true [dev-dependencies] tracing-subscriber.workspace = true # For examples in docs diff --git a/libs/tracing-utils/src/lib.rs b/libs/tracing-utils/src/lib.rs index 9cf2495771..c4aad53cdb 100644 --- a/libs/tracing-utils/src/lib.rs +++ b/libs/tracing-utils/src/lib.rs @@ -10,7 +10,6 @@ //! //! ```rust,no_run //! use tracing_subscriber::prelude::*; -//! use tracing_opentelemetry::OpenTelemetryLayer; //! //! #[tokio::main] //! async fn main() { @@ -22,7 +21,7 @@ //! .with_writer(std::io::stderr); //! //! // Initialize OpenTelemetry. Exports tracing spans as OpenTelemetry traces -//! let otlp_layer = tracing_utils::init_tracing("my_application").await.map(OpenTelemetryLayer::new); +//! let otlp_layer = tracing_utils::init_tracing("my_application").await; //! //! // Put it all together //! tracing_subscriber::registry() @@ -35,15 +34,15 @@ #![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] -use opentelemetry::sdk::Resource; -use opentelemetry::KeyValue; -use opentelemetry_otlp::WithExportConfig; -use opentelemetry_otlp::{OTEL_EXPORTER_OTLP_ENDPOINT, OTEL_EXPORTER_OTLP_TRACES_ENDPOINT}; - -pub use tracing_opentelemetry::OpenTelemetryLayer; - pub mod http; +use opentelemetry::trace::TracerProvider; +use opentelemetry::KeyValue; +use opentelemetry_sdk::Resource; +use tracing::Subscriber; +use tracing_subscriber::registry::LookupSpan; +use tracing_subscriber::Layer; + /// Set up OpenTelemetry exporter, using configuration from environment variables. /// /// `service_name` is set as the OpenTelemetry 'service.name' resource (see @@ -71,7 +70,10 @@ pub mod http; /// /// This doesn't block, but is marked as 'async' to hint that this must be called in /// asynchronous execution context. -pub async fn init_tracing(service_name: &str) -> Option { +pub async fn init_tracing(service_name: &str) -> Option> +where + S: Subscriber + for<'span> LookupSpan<'span>, +{ if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) { return None; }; @@ -80,9 +82,10 @@ pub async fn init_tracing(service_name: &str) -> Option Option { +pub fn init_tracing_without_runtime(service_name: &str) -> Option> +where + S: Subscriber + for<'span> LookupSpan<'span>, +{ if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) { return None; }; @@ -113,54 +116,36 @@ pub fn init_tracing_without_runtime( Some(init_tracing_internal(service_name.to_string())) } -fn init_tracing_internal(service_name: String) -> opentelemetry::sdk::trace::Tracer { - // Set up exporter from the OTEL_EXPORTER_* environment variables - let mut exporter = opentelemetry_otlp::new_exporter().http().with_env(); +fn init_tracing_internal(service_name: String) -> impl Layer +where + S: Subscriber + for<'span> LookupSpan<'span>, +{ + // Sets up exporter from the OTEL_EXPORTER_* environment variables. + let exporter = opentelemetry_otlp::new_exporter().http(); - // XXX opentelemetry-otlp v0.18.0 has a bug in how it uses the - // OTEL_EXPORTER_OTLP_ENDPOINT env variable. According to the - // OpenTelemetry spec at - // , - // the full exporter URL is formed by appending "/v1/traces" to the value - // of OTEL_EXPORTER_OTLP_ENDPOINT. However, opentelemetry-otlp only does - // that with the grpc-tonic exporter. Other exporters, like the HTTP - // exporter, use the URL from OTEL_EXPORTER_OTLP_ENDPOINT as is, without - // appending "/v1/traces". - // - // See https://github.com/open-telemetry/opentelemetry-rust/pull/950 - // - // Work around that by checking OTEL_EXPORTER_OTLP_ENDPOINT, and setting - // the endpoint url with the "/v1/traces" path ourselves. If the bug is - // fixed in a later version, we can remove this code. But if we don't - // remember to remove this, it won't do any harm either, as the crate will - // just ignore the OTEL_EXPORTER_OTLP_ENDPOINT setting when the endpoint - // is set directly with `with_endpoint`. - if std::env::var(OTEL_EXPORTER_OTLP_TRACES_ENDPOINT).is_err() { - if let Ok(mut endpoint) = std::env::var(OTEL_EXPORTER_OTLP_ENDPOINT) { - if !endpoint.ends_with('/') { - endpoint.push('/'); - } - endpoint.push_str("v1/traces"); - exporter = exporter.with_endpoint(endpoint); - } - } + // TODO: opentelemetry::global::set_error_handler() with custom handler that + // bypasses default tracing layers, but logs regular looking log + // messages. // Propagate trace information in the standard W3C TraceContext format. opentelemetry::global::set_text_map_propagator( - opentelemetry::sdk::propagation::TraceContextPropagator::new(), + opentelemetry_sdk::propagation::TraceContextPropagator::new(), ); - opentelemetry_otlp::new_pipeline() + let tracer = opentelemetry_otlp::new_pipeline() .tracing() .with_exporter(exporter) - .with_trace_config( - opentelemetry::sdk::trace::config().with_resource(Resource::new(vec![KeyValue::new( + .with_trace_config(opentelemetry_sdk::trace::Config::default().with_resource( + Resource::new(vec![KeyValue::new( opentelemetry_semantic_conventions::resource::SERVICE_NAME, service_name, - )])), - ) - .install_batch(opentelemetry::runtime::Tokio) + )]), + )) + .install_batch(opentelemetry_sdk::runtime::Tokio) .expect("could not initialize opentelemetry exporter") + .tracer("global"); + + tracing_opentelemetry::layer().with_tracer(tracer) } // Shutdown trace pipeline gracefully, so that it has a chance to send any diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 04e0f9d4f5..bfeb845583 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -82,7 +82,6 @@ tokio-postgres-rustls.workspace = true tokio-rustls.workspace = true tokio-util.workspace = true tokio = { workspace = true, features = ["signal"] } -tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true tracing-utils.workspace = true tracing.workspace = true diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index 3b30ad8b46..2e773fabb3 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -1,4 +1,3 @@ -use tracing_opentelemetry::OpenTelemetryLayer; use tracing_subscriber::{ filter::{EnvFilter, LevelFilter}, prelude::*, @@ -23,9 +22,7 @@ pub async fn init() -> anyhow::Result { .with_writer(std::io::stderr) .with_target(false); - let otlp_layer = tracing_utils::init_tracing("proxy") - .await - .map(OpenTelemetryLayer::new); + let otlp_layer = tracing_utils::init_tracing("proxy").await; tracing_subscriber::registry() .with(env_filter) diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index e6d21e9434..16b0fe82ae 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -31,7 +31,6 @@ camino = { version = "1", default-features = false, features = ["serde1"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } clap = { version = "4", features = ["derive", "string"] } clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] } -crossbeam-utils = { version = "0.8" } crypto-bigint = { version = "0.5", features = ["generic-array", "zeroize"] } der = { version = "0.7", default-features = false, features = ["oid", "pem", "std"] } deranged = { version = "0.3", default-features = false, features = ["powerfmt", "serde", "std"] } @@ -51,7 +50,7 @@ hex = { version = "0.4", features = ["serde"] } hmac = { version = "0.12", default-features = false, features = ["reset"] } hyper = { version = "0.14", features = ["full"] } indexmap = { version = "1", default-features = false, features = ["std"] } -itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12", default-features = false, features = ["use_std"] } +itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12" } itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10" } lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] } libc = { version = "0.2", features = ["extra_traits", "use_std"] } @@ -68,8 +67,7 @@ rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } -reqwest-5ef9efb8ec2df382 = { package = "reqwest", version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls", "stream"] } -reqwest-a6292c17cd707f01 = { package = "reqwest", version = "0.11", default-features = false, features = ["blocking", "rustls-tls", "stream"] } +reqwest = { version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls", "stream"] } rustls = { version = "0.21", features = ["dangerous_configuration"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } @@ -86,12 +84,9 @@ tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", tokio-rustls = { version = "0.24" } tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] } toml_edit = { version = "0.22", features = ["serde"] } -tonic = { version = "0.9", features = ["tls-roots"] } tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } -tracing-log = { version = "0.1", default-features = false, features = ["log-tracer", "std"] } -tracing-subscriber = { version = "0.3", default-features = false, features = ["env-filter", "fmt", "json", "smallvec", "tracing-log"] } url = { version = "2", features = ["serde"] } uuid = { version = "1", features = ["serde", "v4", "v7"] } zeroize = { version = "1", features = ["derive", "serde"] } @@ -110,7 +105,7 @@ getrandom = { version = "0.2", default-features = false, features = ["std"] } half = { version = "2", default-features = false, features = ["num-traits"] } hashbrown = { version = "0.14", features = ["raw"] } indexmap = { version = "1", default-features = false, features = ["std"] } -itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12", default-features = false, features = ["use_std"] } +itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12" } itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10" } lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] } libc = { version = "0.2", features = ["extra_traits", "use_std"] } From d515727e942f97f1e8ffa27bf86ef47e4506a272 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 1 Oct 2024 10:15:18 +0100 Subject: [PATCH 14/27] tests: make test_multi_attach more stable (#9202) ## Problem `test_multi_attach` is sometimes failing with `invalid compute status for configuration request: Configuration`. This is likely a result of the test attempting to reconfigure the compute at the same time as the storage controller is doing so. This test was originally written before the storage controller existed, and is not expecting anything else to be reconfiguring computes at the same time. ## Summary of changes - Configure the tenant into scheduling policy `Stop` in the storage controller at the start of the test, so that it won't try to do anything to the tenant while the test is running. --- test_runner/regress/test_pageserver_generations.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 519994f774..96521b5684 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -549,6 +549,14 @@ def test_multi_attach( tenant_id = env.initial_tenant timeline_id = env.initial_timeline + # Instruct the storage controller to not interfere with our low level configuration + # of the pageserver's attachment states. Otherwise when it sees nodes go offline+return, + # it would send its own requests that would conflict with the test's. + env.storage_controller.tenant_policy_update(tenant_id, {"scheduling": "Stop"}) + env.storage_controller.allowed_errors.extend( + [".*Scheduling is disabled by policy Stop.*", ".*Skipping reconcile for policy Stop.*"] + ) + # Initially, the tenant will be attached to the first pageserver (first is default in our test harness) wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[0], tenant_id, "Active")) _detail = http_clients[0].timeline_detail(tenant_id, timeline_id) From d6c6b0a509fb8bd3d62b87c6398bacdec4785069 Mon Sep 17 00:00:00 2001 From: David Gomes Date: Tue, 1 Oct 2024 11:29:56 +0200 Subject: [PATCH 15/27] feat(compute): adds pg_session_jwt extension to compute image (#8888) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem We need the [pg_session_jwt](https://github.com/neondatabase/pg_session_jwt/) extension in the compute image. This PR adds it. ## Summary of changes I added the `pg_session_jwt` extension in a very similar way to how the pggraphql and pgtiktoken extensions were added (since they're all written with pgrx). Then I tested this. ``` $ cd docker-compose/ $ PG_VERSION=16 TAG=10667533475 docker-compose up --build -d $ psql postgresql://cloud_admin:cloud_admin@localhost:55433/postgres cloud_admin@postgres=# create extension pg_session_jwt; CREATE EXTENSION Time: 43.048 ms cloud_admin@postgres=# \df auth.*; List of functions ┌────────┬──────────────────┬──────────────────┬─────────────────────┬──────┐ │ Schema │ Name │ Result data type │ Argument data types │ Type │ ├────────┼──────────────────┼──────────────────┼─────────────────────┼──────┤ │ auth │ get │ jsonb │ s text │ func │ │ auth │ init │ void │ kid bigint, s jsonb │ func │ │ auth │ jwt_session_init │ void │ s text │ func │ │ auth │ user_id │ text │ │ func │ └────────┴──────────────────┴──────────────────┴─────────────────────┴──────┘ (4 rows) cloud_admin@postgres=# select auth.init(cast('1' as bigint), to_jsonb(TEXT '{ "kty": "EC", "kid": "571683be-33cf-4e67-bccc-8905c0ebb862", "crv": "P-521", "alg": "ES512", "x": "AM_GsnQvKML2yXdn_OsN8PdgO1Sf9XMXih5vQMKLmJkp-Iz_FFWJUt6uyR_qp4brr8Ji2kjGJgN4cQJpg2kskH7V", "y": "AZg-salw24lCmsBP-BCBa5jT6INkTwLtCOC7o0BIxDVvmIEH1-PQAJVYVJPTFvPMi_PLa0QlOm-ufJYkynwa2Mau" }')); ERROR: called `Result::unwrap()` on an `Err` value: Error("invalid type: string \"{ \\\"kty\\\": \\\"EC\\\", \\\"kid\\\": \\\"571683be-33cf-4e67-bccc-8905c0ebb862\\\", \\\"crv\\\": \\\"P-521\\\", \\\"alg\\\": \\\"ES512\\\", \\\"x\\\": \\\"AM_GsnQvKML2yXdn_OsN8PdgO1Sf9XMXih5vQMKLmJkp-Iz_FFWJUt6uyR_qp4brr8Ji2kjGJgN4cQJpg2kskH7V\\\", \\\"y\\\": \\\"AZg-salw24lCmsBP-BCBa5jT6INkTwLtCOC7o0BIxDVvmIEH1-PQAJVYVJPTFvPMi_PLa0QlOm-ufJYkynwa2Mau\\\" }\", expected struct JwkEcKey", line: 0, column: 0) Time: 6.991 ms ``` ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Move the download location to a proper URL --- compute/Dockerfile.compute-node | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node index 8febf0aad6..b70f0abd27 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/Dockerfile.compute-node @@ -871,6 +871,28 @@ RUN case "${PG_VERSION}" in "v17") \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control +######################################################################################### +# +# Layer "pg-session-jwt-build" +# Compile "pg_session_jwt" extension +# +######################################################################################### + +FROM rust-extensions-build AS pg-session-jwt-build +ARG PG_VERSION + +RUN case "${PG_VERSION}" in "v17") \ + echo "pg_session_jwt does not yet have a release that supports pg17" && exit 0;; \ + esac && \ + wget https://github.com/neondatabase/pg_session_jwt/archive/ff0a72440e8ff584dab24b3f9b7c00c56c660b8e.tar.gz -O pg_session_jwt.tar.gz && \ + echo "1fbb2b5a339263bcf6daa847fad8bccbc0b451cea6a62e6d3bf232b0087f05cb pg_session_jwt.tar.gz" | sha256sum --check && \ + mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \ + sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + cargo pgrx install --release + # it's needed to enable extension because it uses untrusted C language + # sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_session_jwt.control && \ + # echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_session_jwt.control + ######################################################################################### # # Layer "wal2json-build" @@ -967,6 +989,7 @@ COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-session-jwt-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/ From 40b10b878a678c900d4151ae86a73bd17c0b3dfa Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 1 Oct 2024 10:34:39 +0100 Subject: [PATCH 16/27] storage_scrubber: retry on index deletion failures (#9204) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem In automated tests running on AWS S3, we frequently see scrubber failures when it can't delete an index. `location_conf_churn`: https://neon-github-public-dev.s3.amazonaws.com/reports/main/11076221056/index.html#/testresult/f89b1916b6a693e2 `scrubber_physical_gc`: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-9178/11074269153/index.html#/testresult/9885ed5aa0fe38b6 ## Summary of changes Wrap index deletion in a backoff::retry --------- Co-authored-by: Arpad Müller --- .../src/pageserver_physical_gc.rs | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index c96d9cad3b..1e69ddbf15 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -4,7 +4,7 @@ use std::time::Duration; use crate::checks::{list_timeline_blobs, BlobDataParseResult}; use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; -use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId}; +use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, MAX_RETRIES}; use futures_util::{StreamExt, TryStreamExt}; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path}; @@ -18,6 +18,7 @@ use serde::Serialize; use storage_controller_client::control_api; use tokio_util::sync::CancellationToken; use tracing::{info_span, Instrument}; +use utils::backoff; use utils::generation::Generation; use utils::id::{TenantId, TenantTimelineId}; @@ -326,15 +327,25 @@ async fn maybe_delete_index( } // All validations passed: erase the object - match remote_client - .delete(&obj.key, &CancellationToken::new()) - .await + let cancel = CancellationToken::new(); + match backoff::retry( + || remote_client.delete(&obj.key, &cancel), + |_| false, + 3, + MAX_RETRIES as u32, + "maybe_delete_index", + &cancel, + ) + .await { - Ok(_) => { + None => { + unreachable!("Using a dummy cancellation token"); + } + Some(Ok(_)) => { tracing::info!("Successfully deleted index"); summary.indices_deleted += 1; } - Err(e) => { + Some(Err(e)) => { tracing::warn!("Failed to delete index: {e}"); summary.remote_storage_errors += 1; } From 4391b25d01135b1c2f9f97f0754510c59f6e19c4 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 1 Oct 2024 10:36:49 +0100 Subject: [PATCH 17/27] proxy: ignore typ and use jwt.alg rather than jwk.alg (#9215) Microsoft exposes JWKs without the alg header. It's only included on the tokens. Not a problem. Also noticed that wrt the `typ` header: > It will typically not be used by applications when it is already known that the object is a JWT. This parameter is ignored by JWT implementations; any processing of this parameter is performed by the JWT application. Since we know we are expecting JWTs only, I've followed the guidance and removed the validation. --- proxy/src/auth/backend/jwt.rs | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 38dd30ce92..b62a11ccb2 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -261,10 +261,6 @@ impl JwkCacheEntryLock { let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD) .context("Provided authentication token is not a valid JWT encoding")?; - ensure!( - header.typ == "JWT", - "Provided authentication token is not a valid JWT encoding" - ); let kid = header.key_id.context("missing key id")?; let mut guard = self @@ -299,7 +295,7 @@ impl JwkCacheEntryLock { verify_ec_signature(header_payload.as_bytes(), &sig, key)?; } jose_jwk::Key::Rsa(key) => { - verify_rsa_signature(header_payload.as_bytes(), &sig, key, &jwk.prm.alg)?; + verify_rsa_signature(header_payload.as_bytes(), &sig, key, &header.algorithm)?; } key => bail!("unsupported key type {key:?}"), }; @@ -381,7 +377,7 @@ fn verify_rsa_signature( data: &[u8], sig: &[u8], key: &jose_jwk::Rsa, - alg: &Option, + alg: &jose_jwa::Algorithm, ) -> anyhow::Result<()> { use jose_jwa::{Algorithm, Signing}; use rsa::{ @@ -392,7 +388,7 @@ fn verify_rsa_signature( let key = RsaPublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid RSA key"))?; match alg { - Some(Algorithm::Signing(Signing::Rs256)) => { + Algorithm::Signing(Signing::Rs256) => { let key = VerifyingKey::::new(key); let sig = Signature::try_from(sig)?; key.verify(data, &sig)?; @@ -406,9 +402,6 @@ fn verify_rsa_signature( /// #[derive(serde::Deserialize, serde::Serialize)] struct JwtHeader<'a> { - /// must be "JWT" - #[serde(rename = "typ")] - typ: &'a str, /// must be a supported alg #[serde(rename = "alg")] algorithm: jose_jwa::Algorithm, @@ -592,7 +585,6 @@ mod tests { key: jose_jwk::Key::Ec(pk), prm: jose_jwk::Parameters { kid: Some(kid), - alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Es256)), ..Default::default() }, }; @@ -606,7 +598,6 @@ mod tests { key: jose_jwk::Key::Rsa(pk), prm: jose_jwk::Parameters { kid: Some(kid), - alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Rs256)), ..Default::default() }, }; @@ -615,7 +606,6 @@ mod tests { fn build_jwt_payload(kid: String, sig: jose_jwa::Signing) -> String { let header = JwtHeader { - typ: "JWT", algorithm: jose_jwa::Algorithm::Signing(sig), key_id: Some(&kid), }; From 1b8b50755c3e51a57bda7e4bf8202c3fd7ae8fe9 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 1 Oct 2024 15:09:09 +0300 Subject: [PATCH 18/27] Use debian packages for cmake again (#9212) On bookworm, 'cmake' is new enough that we can just use it. On bullseye, we can get a new-enough package from backports. By including 'cmake' in the build-deps stage, we don't need to install it separately in all the later build stages that need it. See https://github.com/neondatabase/neon/pull/2699, where we switched to downloading and building a specific version. --- compute/Dockerfile.compute-node | 53 ++++++++++++--------------------- 1 file changed, 19 insertions(+), 34 deletions(-) diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node index b70f0abd27..eb4682445c 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/Dockerfile.compute-node @@ -12,10 +12,25 @@ ARG DEBIAN_FLAVOR=bullseye-slim ######################################################################################### FROM debian:$DEBIAN_FLAVOR AS build-deps ARG DEBIAN_FLAVOR -RUN apt update && \ + +RUN case $DEBIAN_FLAVOR in \ + # Version-specific installs for Bullseye (PG14-PG16): + # The h3_pg extension needs a cmake 3.20+, but Debian bullseye has 3.18. + # Install newer version (3.25) from backports. + bullseye*) \ + echo "deb http://deb.debian.org/debian bullseye-backports main" > /etc/apt/sources.list.d/bullseye-backports.list; \ + VERSION_INSTALLS="cmake/bullseye-backports cmake-data/bullseye-backports"; \ + ;; \ + # Version-specific installs for Bookworm (PG17): + bookworm*) \ + VERSION_INSTALLS="cmake"; \ + ;; \ + esac && \ + apt update && \ apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \ zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \ - libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd + libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd \ + $VERSION_INSTALLS ######################################################################################### # @@ -89,7 +104,7 @@ FROM build-deps AS postgis-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ - apt install -y cmake gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \ + apt install -y gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \ libboost-system-dev libboost-iostreams-dev libboost-program-options-dev libboost-timer-dev \ libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \ protobuf-c-compiler xsltproc @@ -200,27 +215,6 @@ FROM build-deps AS h3-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - case "$(uname -m)" in \ - "x86_64") \ - export CMAKE_CHECKSUM=739d372726cb23129d57a539ce1432453448816e345e1545f6127296926b6754 \ - ;; \ - "aarch64") \ - export CMAKE_CHECKSUM=281b42627c9a1beed03e29706574d04c6c53fae4994472e90985ef018dd29c02 \ - ;; \ - *) \ - echo "Unsupported architecture '$(uname -m)'. Supported are x86_64 and aarch64" && exit 1 \ - ;; \ - esac && \ - wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-$(uname -m).sh \ - -q -O /tmp/cmake-install.sh \ - && echo "${CMAKE_CHECKSUM} /tmp/cmake-install.sh" | sha256sum --check \ - && chmod u+x /tmp/cmake-install.sh \ - && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \ - && rm /tmp/cmake-install.sh - RUN case "${PG_VERSION}" in "v17") \ mkdir -p /h3/usr/ && \ echo "v17 extensions are not supported yet. Quit" && exit 0;; \ @@ -506,8 +500,6 @@ RUN case "${PG_VERSION}" in "v17") \ export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \ ;; \ esac && \ - apt-get update && \ - apt-get install -y cmake && \ wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \ echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \ mkdir timescaledb-src && cd timescaledb-src && tar xzf ../timescaledb.tar.gz --strip-components=1 -C . && \ @@ -596,7 +588,6 @@ RUN case "${PG_VERSION}" in "v17") \ esac && \ apt-get update && \ apt-get install -y \ - cmake \ libboost-iostreams1.74-dev \ libboost-regex1.74-dev \ libboost-serialization1.74-dev \ @@ -761,7 +752,7 @@ ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt-get update && \ - apt-get install -y curl libclang-dev cmake && \ + apt-get install -y curl libclang-dev && \ useradd -ms /bin/bash nonroot -b /home ENV HOME=/home/nonroot @@ -1177,11 +1168,6 @@ RUN case "${PG_VERSION}" in "v17") \ echo "v17 extensions are not supported yet. Quit" && exit 0;; \ esac && \ cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch -# cmake is required for the h3 test -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - apt-get update && apt-get install -y cmake RUN case "${PG_VERSION}" in "v17") \ echo "v17 extensions are not supported yet. Quit" && exit 0;; \ esac && \ @@ -1208,7 +1194,6 @@ ENV PGDATABASE=postgres ######################################################################################### FROM debian:$DEBIAN_FLAVOR ARG DEBIAN_FLAVOR -ENV DEBIAN_FLAVOR=$DEBIAN_FLAVOR # Add user postgres RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ echo "postgres:test_console_pass" | chpasswd && \ From 0d500bbd5bbd2ebc78f23888339fffcee4ecf4bd Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 1 Oct 2024 17:38:19 +0300 Subject: [PATCH 19/27] Add new compute metrics to sql exporter (#9190) These are the perf counters added in commit 263dfba6ee. Note: This relies on 'neon' extension version 1.5. The default was bumped to 1.5 in commit d696c41807306333dab3568da523937963f7a116. --------- Co-authored-by: Matthias van de Meent --- compute/etc/neon_collector.yml | 85 ++++++++++++++++++++++++++++++++++ pgxn/neon/neon_perf_counters.c | 2 +- 2 files changed, 86 insertions(+), 1 deletion(-) diff --git a/compute/etc/neon_collector.yml b/compute/etc/neon_collector.yml index acb17d3cc0..f088f4c398 100644 --- a/compute/etc/neon_collector.yml +++ b/compute/etc/neon_collector.yml @@ -94,6 +94,68 @@ metrics: query: | select sum(pg_database_size(datname)) as total from pg_database; +- metric_name: getpage_wait_seconds_count + type: counter + help: 'Number of getpage requests' + values: [getpage_wait_seconds_count] + query_ref: neon_perf_counters + +- metric_name: getpage_wait_seconds_sum + type: counter + help: 'Time spent in getpage requests' + values: [getpage_wait_seconds_sum] + query_ref: neon_perf_counters + +- metric_name: getpage_prefetch_requests_total + type: counter + help: 'Number of getpage issued for prefetching' + values: [getpage_prefetch_requests_total] + query_ref: neon_perf_counters + +- metric_name: getpage_sync_requests_total + type: counter + help: 'Number of synchronous getpage issued' + values: [getpage_sync_requests_total] + query_ref: neon_perf_counters + +- metric_name: getpage_prefetch_misses_total + type: counter + help: 'Total number of readahead misses; consisting of either prefetches that don't satisfy the LSN bounds once the prefetch got read by the backend, or cases where somehow no readahead was issued for the read' + values: [getpage_prefetch_misses_total] + query_ref: neon_perf_counters + +- metric_name: getpage_prefetch_discards_total + type: counter + help: 'Number of prefetch responses issued but not used' + values: [getpage_prefetch_discards_total] + query_ref: neon_perf_counters + +- metric_name: pageserver_requests_sent_total + type: counter + help: 'Number of all requests sent to the pageserver (not just GetPage requests)' + values: [pageserver_requests_sent_total] + query_ref: neon_perf_counters + +- metric_name: pageserver_disconnects_total + type: counter + help: 'Number of times that the connection to the pageserver was lost' + values: [pageserver_disconnects_total] + query_ref: neon_perf_counters + +- metric_name: pageserver_send_flushes_total + type: counter + help: 'Number of flushes to the pageserver connection' + values: [pageserver_send_flushes_total] + query_ref: neon_perf_counters + +- metric_name: getpage_wait_seconds_buckets + type: counter + help: 'Histogram buckets of getpage request latency' + key_labels: + - bucket_le + values: [value] + query_ref: getpage_wait_seconds_buckets + # DEPRECATED - metric_name: lfc_approximate_working_set_size type: gauge @@ -244,3 +306,26 @@ metrics: SELECT slot_name, CASE WHEN wal_status = 'lost' THEN 1 ELSE 0 END AS wal_is_lost FROM pg_replication_slots; + +queries: + - query_name: neon_perf_counters + query: | + WITH c AS ( + SELECT pg_catalog.jsonb_object_agg(metric, value) jb FROM neon.neon_perf_counters + ) + SELECT d.* + FROM pg_catalog.jsonb_to_record((select jb from c)) as d( + getpage_wait_seconds_count numeric, + getpage_wait_seconds_sum numeric, + getpage_prefetch_requests_total numeric, + getpage_sync_requests_total numeric, + getpage_prefetch_misses_total numeric, + getpage_prefetch_discards_total numeric, + pageserver_requests_sent_total numeric, + pageserver_disconnects_total numeric, + pageserver_send_flushes_total numeric + ); + + - query_name: getpage_wait_seconds_buckets + query: | + SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'getpage_wait_seconds_bucket'; diff --git a/pgxn/neon/neon_perf_counters.c b/pgxn/neon/neon_perf_counters.c index de653826c0..eecbfe98c6 100644 --- a/pgxn/neon/neon_perf_counters.c +++ b/pgxn/neon/neon_perf_counters.c @@ -137,7 +137,7 @@ neon_perf_counters_to_metrics(neon_per_backend_counters *counters) metrics[i].is_bucket = false; metrics[i].value = (double) counters->pageserver_requests_sent_total; i++; - metrics[i].name = "pageserver_requests_disconnects_total"; + metrics[i].name = "pageserver_disconnects_total"; metrics[i].is_bucket = false; metrics[i].value = (double) counters->pageserver_disconnects_total; i++; From 49f99eb7295ed8c704ea9f1554b4baa014d727e8 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 1 Oct 2024 10:56:54 -0400 Subject: [PATCH 20/27] docs: add aux file v2 RFC (#9115) aux v2 migration is near the end and I rewrote the RFC based on what I proposed (several months before...) and what I actually implemented. --------- Signed-off-by: Alex Chi Z --- docs/rfcs/038-aux-file-v2.md | 112 +++++++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 docs/rfcs/038-aux-file-v2.md diff --git a/docs/rfcs/038-aux-file-v2.md b/docs/rfcs/038-aux-file-v2.md new file mode 100644 index 0000000000..9c3c336008 --- /dev/null +++ b/docs/rfcs/038-aux-file-v2.md @@ -0,0 +1,112 @@ +# AUX file v2 + +## Summary + +This is a retrospective RFC describing a new storage strategy for AUX files. + +## Motivation + +The original aux file storage strategy stores everything in a single `AUX_FILES_KEY`. +Every time the compute node streams a `neon-file` record to the pageserver, it will +update the aux file hash map, and then write the serialized hash map into the key. +This creates serious space bloat. There was a fix to log delta records (i.e., update +a key in the hash map) to the aux file key. In this way, the pageserver only stores +the deltas at each of the LSNs. However, this improved v1 storage strategy still +requires us to store everything in an aux file cache in memory, because we cannot +fetch a single key (or file) from the compound `AUX_FILES_KEY`. + +### Prior art + +For storing large amount of small files, we can use a key-value store where the key +is the filename and the value is the file content. + +## Requirements + +- No space bloat, fixed space amplification. +- No write bloat, fixed write amplification. + +## Impacted Components + +pageserver + +## Sparse Keyspace + +In pageserver, we had assumed the keyspaces are always contiguous. For example, if the keyspace 0x0000-0xFFFF +exists in the pageserver, every single key in the key range would exist in the storage. Based on the prior +assumption, there are code that traverses the keyspace by iterating every single key. + +```rust +loop { + // do something + key = key.next(); +} +``` + +If a keyspace is very large, for example, containing `2^64` keys, this loop will take infinite time to run. +Therefore, we introduce the concept of sparse keyspace in this RFC. For a sparse keyspace, not every key would +exist in the key range. Developers should not attempt to iterate every single key in the keyspace. Instead, +they should fetch all the layer files in the key range, and then do a merge of them. + +In aux file v2, we store aux files within the sparse keyspace of the prefix `AUX_KEY_PREFIX`. + +## AUX v2 Keyspace and Key Mapping + +Pageserver uses fixed-size keys. The key is 128b. In order to store files of arbitrary filenames into the +keyspace, we assign a predetermined prefix based on the directory storing the aux file, and use the FNV hash +of the filename for the rest bits of the key. The encoding scheme is defined in `encode_aux_file_key`. + +For example, `pg_logical/mappings/test1` will be encoded as: + +``` +62 0000 01 01 7F8B83D94F7081693471ABF91C +^ aux prefix + ^ assigned prefix of pg_logical/ + ^ assigned prefix of mappings/ + ^ 13B FNV hash of test1 + ^ not used due to key representation +``` + +The prefixes of the directories should be assigned every time we add a new type of aux file into the storage within `aux_file.rs`. For all directories without an assigned prefix, it will be put into the `0xFFFF` keyspace. + +Note that inside pageserver, there are two representations of the keys: the 18B full key representation +and the 16B compact key representation. For the 18B representation, some fields have restricted ranges +of values. Therefore, the aux keys only use the 16B compact portion of the full key. + +It is possible that two files get mapped to the same key due to hash collision. Therefore, the value of +each of the aux key is an array that contains all filenames and file content that should be stored in +this key. + +We use `Value::Image` to store the aux keys. Therefore, page reconstruction works in the same way as before, +and we do not need addition code to support reconstructing the value. We simply get the latest image from +the storage. + +## Inbound Logical Replication Key Mapping + +For inbound logical replication, Postgres needs the `replorigin_checkpoint` file to store the data. +This file not directly stored in the pageserver using the aux v2 mechanism. It is constructed during +generating the basebackup by scanning the `REPL_ORIGIN_KEY_PREFIX` keyspace. + +## Sparse Keyspace Read Path + +There are two places we need to read the aux files from the pageserver: + +* On the write path, when the compute node adds an aux file to the pageserver, we will retrieve the key from the storage, append the file to the hashed key, and write it back. The current `get` API already supports that. +* We use the vectored get API to retrieve all aux files during generating the basebackup. Because we need to scan a sparse keyspace, we slightly modified the vectored get path. The vectorized API will attempt to retrieve every single key within the requested key range, and therefore, we modified it in a way that keys within `NON_INHERITED_SPARSE_RANGE` will not trigger missing key error. + +## Compaction and Image Layer Generation + +With the add of sparse keyspaces, we also modified the compaction code to accommodate the fact that sparse keyspaces do not have every single key stored in the storage. + +* L0 compaction: we modified the hole computation code so that it can handle sparse keyspaces when computing holes. +* Image layer creation: instead of calling `key.next()` and getting/reconstructing images for every single key, we use the vectored get API to scan all keys in the keyspace at a given LSN. Image layers are only created if there are too many delta layers between the latest LSN and the last image layer we generated for sparse keyspaces. The created image layer always cover the full aux key range for now, and could be optimized later. + +## Migration + +We decided not to make the new aux storage strategy (v1) compatible with the original one (v1). One feasible way of doing a seamless migration is to store new data in aux v2 while old data in aux v1, but this complicates file deletions. We want all users to start with a clean state with no aux files in the storage, and therefore, we need to do manual migrations for users using aux v1 by using the [migration script](https://github.com/neondatabase/aux_v2_migration). + +During the period of migration, we store the aux policy in the `index_part.json` file. When a tenant is attached +with no policy set, the pageserver will scan the aux file keyspaces to identify the current aux policy being used (v1 or v2). + +If a timeline has aux v1 files stored, it will use aux file policy v1 unless we do a manual migration for them. Otherwise, the default aux file policy for new timelines is aux v2. Users enrolled in logical replication before we set aux v2 as default use aux v1 policy. Users who tried setting up inbound replication (which was not supported at that time) may also create some file entries in aux v1 store, even if they did not enroll in the logical replication testing program. + +The code for aux v2 migration is in https://github.com/neondatabase/aux_v2_migration. The toolkit scans all projects with logical replication enabled. For all these projects, it put the computes into maintenance mode (suspend all of then), call the migration API to switch the aux file policy on the pageserver (which drops all replication states), and restart all the computes. From b675997f48ff9782c3928ee435fb318f6a7b31ff Mon Sep 17 00:00:00 2001 From: Shinya Kato <37682778+shinyaaa@users.noreply.github.com> Date: Wed, 2 Oct 2024 00:16:53 +0900 Subject: [PATCH 21/27] safekeeper: Fix a log message of HTTP worker (#9213) ## Problem There is a wrong log message. ## Summary of changes Fixed the log message. --- safekeeper/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 2e11a279ca..3116d88dff 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -161,7 +161,7 @@ pub static HTTP_RUNTIME: Lazy = Lazy::new(|| { .thread_name("HTTP worker") .enable_all() .build() - .expect("Failed to create WAL service runtime") + .expect("Failed to create HTTP runtime") }); pub static BROKER_RUNTIME: Lazy = Lazy::new(|| { From ce73db93162f7e2975f05d3fe90658042c39fb7c Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Tue, 1 Oct 2024 16:28:58 +0100 Subject: [PATCH 22/27] Fix post_apply_config() (#9220) Bring back post_apply_config() step that was accidentally removed in 78938d1 --- compute_tools/src/compute.rs | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 147eb2a161..2f6e2bdb2c 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1060,19 +1060,26 @@ impl ComputeNode { let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?; let config_time = Utc::now(); - if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates { - let pgdata_path = Path::new(&self.pgdata); - // temporarily reset max_cluster_size in config - // to avoid the possibility of hitting the limit, while we are applying config: - // creating new extensions, roles, etc... - config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || { + if pspec.spec.mode == ComputeMode::Primary { + if !pspec.spec.skip_pg_catalog_updates { + let pgdata_path = Path::new(&self.pgdata); + // temporarily reset max_cluster_size in config + // to avoid the possibility of hitting the limit, while we are applying config: + // creating new extensions, roles, etc... + config::with_compute_ctl_tmp_override( + pgdata_path, + "neon.max_cluster_size=-1", + || { + self.pg_reload_conf()?; + + self.apply_config(&compute_state)?; + + Ok(()) + }, + )?; self.pg_reload_conf()?; - - self.apply_config(&compute_state)?; - - Ok(()) - })?; - self.pg_reload_conf()?; + } + self.post_apply_config()?; } let startup_end_time = Utc::now(); From 325de52e73460f4187ccee941d07008374860530 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 1 Oct 2024 17:35:18 +0200 Subject: [PATCH 23/27] pageserver: remove `TenantConfOpt::TryFrom` (#9219) Following #7656, `TenantConfOpt::TryFrom` appears to be dead code. This patch removes `TenantConfOpt::TryFrom`. The code does appear to be dead, since the TOML config is deserialized into `TenantConfig` (via `LocationConfig`) and then converted into `TenantConfOpt`. This was verified by adding a panic to `try_from()` and running the pageserver unit tests as well as a local end-to-end cluster (including creating a new tenant and restarting the pageserver). This did not fail, so this is not used on the common happy path at least. No explicit `try_from` or `try_into` calls were found either. Resolves #8918. --- pageserver/src/tenant/config.rs | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 547b43a399..502cb62fe8 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -8,7 +8,6 @@ //! We cannot use global or default config instead, because wrong settings //! may lead to a data loss. //! -use anyhow::bail; pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf; use pageserver_api::models::AuxFilePolicy; use pageserver_api::models::CompactionAlgorithmSettings; @@ -441,29 +440,6 @@ impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt { } } -impl TryFrom for TenantConfOpt { - type Error = anyhow::Error; - - fn try_from(item: toml_edit::Item) -> Result { - match item { - toml_edit::Item::Value(value) => { - let d = value.into_deserializer(); - return serde_path_to_error::deserialize(d) - .map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message())); - } - toml_edit::Item::Table(table) => { - let deserializer = - toml_edit::de::Deserializer::from(toml_edit::DocumentMut::from(table)); - return serde_path_to_error::deserialize(deserializer) - .map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message())); - } - _ => { - bail!("expected non-inline table but found {item}") - } - } - } -} - /// This is a conversion from our internal tenant config object to the one used /// in external APIs. impl From for models::TenantConfig { From 6efdb1d0f3ab0e95c7958078251efeb1956fe2dc Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Tue, 1 Oct 2024 18:37:59 +0200 Subject: [PATCH 24/27] Fix small memory accounting bug in libpagestore (#9223) Found while searching for other issues in shared memory. The bug should be benign, in that it over-allocates memory for this struct, but doesn't allow for out-of-bounds writes. --- pgxn/neon/libpagestore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 07a19a7114..6c6489277d 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -937,7 +937,7 @@ PagestoreShmemInit(void) LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); pagestore_shared = ShmemInitStruct("libpagestore shared state", - PagestoreShmemSize(), + sizeof(PagestoreShmemState), &found); if (!found) { From 17672c88ff1074722df36d543f34696c9c291732 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 1 Oct 2024 20:54:00 +0300 Subject: [PATCH 25/27] tests: wait walreceiver on sks to be gone on 'immediate' ep restart. (#9099) When endpoint is stopped in immediate mode and started again there is a chance of old connection delivering some WAL to safekeepers after second start checked need for sync-safekeepers and thus grabbed basebackup LSN. It makes basebackup unusable, so compute panics. Avoid flakiness by waiting for walreceivers on safekeepers to be gone in such cases. A better way would be to bump term on safekeepers if sync-safekeepers is skipped, but it needs more infrastructure. ref https://github.com/neondatabase/neon/issues/9079 --- test_runner/fixtures/neon_fixtures.py | 31 +++++++++++++++---- test_runner/fixtures/safekeeper/utils.py | 17 +++++++--- test_runner/regress/test_next_xid.py | 4 ++- test_runner/regress/test_replica_start.py | 6 ++-- .../regress/test_subscriber_restart.py | 4 +-- test_runner/regress/test_vm_bits.py | 2 +- test_runner/regress/test_wal_acceptor.py | 10 +++--- 7 files changed, 52 insertions(+), 22 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index f5019e39dc..6a53a34bc9 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -20,7 +20,7 @@ from dataclasses import dataclass from datetime import datetime from enum import Enum from fcntl import LOCK_EX, LOCK_UN, flock -from functools import cached_property, partial +from functools import cached_property from itertools import chain, product from pathlib import Path from types import TracebackType @@ -86,7 +86,7 @@ from fixtures.remote_storage import ( remote_storage_to_toml_dict, ) from fixtures.safekeeper.http import SafekeeperHttpClient -from fixtures.safekeeper.utils import are_walreceivers_absent +from fixtures.safekeeper.utils import wait_walreceivers_absent from fixtures.utils import ( ATTACHMENT_NAME_REGEX, allure_add_grafana_links, @@ -4100,12 +4100,26 @@ class Endpoint(PgProtocol, LogUtils): with open(remote_extensions_spec_path, "w") as file: json.dump(spec, file, indent=4) - def stop(self, mode: str = "fast") -> "Endpoint": + def stop( + self, + mode: str = "fast", + sks_wait_walreceiver_gone: Optional[tuple[List[Safekeeper], TimelineId]] = None, + ) -> "Endpoint": """ Stop the Postgres instance if it's running. - Because test teardown might try and stop an endpoint concurrently with test code - stopping the endpoint, this method is thread safe + Because test teardown might try and stop an endpoint concurrently with + test code stopping the endpoint, this method is thread safe + + If sks_wait_walreceiever_gone is not None, wait for the safekeepers in + this list to have no walreceivers, i.e. compute endpoint connection be + gone. When endpoint is stopped in immediate mode and started again this + avoids race of old connection delivering some data after + sync-safekeepers check, which makes basebackup unusable. TimelineId is + needed because endpoint doesn't know it. + + A better solution would be bump term when sync-safekeepers is skipped on + start, see #9079. Returns self. """ @@ -4117,6 +4131,11 @@ class Endpoint(PgProtocol, LogUtils): self.endpoint_id, check_return_code=self.check_stop_result, mode=mode ) + if sks_wait_walreceiver_gone is not None: + for sk in sks_wait_walreceiver_gone[0]: + cli = sk.http_client() + wait_walreceivers_absent(cli, self.tenant_id, sks_wait_walreceiver_gone[1]) + return self def stop_and_destroy(self, mode: str = "immediate") -> "Endpoint": @@ -5209,7 +5228,7 @@ def flush_ep_to_pageserver( for sk in env.safekeepers: cli = sk.http_client() # wait until compute connections are gone - wait_until(30, 0.5, partial(are_walreceivers_absent, cli, tenant, timeline)) + wait_walreceivers_absent(cli, tenant, timeline) commit_lsn = max(cli.get_commit_lsn(tenant, timeline), commit_lsn) # Note: depending on WAL filtering implementation, probably most shards diff --git a/test_runner/fixtures/safekeeper/utils.py b/test_runner/fixtures/safekeeper/utils.py index 0e4b5d7883..2a081c6ccb 100644 --- a/test_runner/fixtures/safekeeper/utils.py +++ b/test_runner/fixtures/safekeeper/utils.py @@ -1,11 +1,20 @@ from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log from fixtures.safekeeper.http import SafekeeperHttpClient +from fixtures.utils import wait_until -def are_walreceivers_absent( +def wait_walreceivers_absent( sk_http_cli: SafekeeperHttpClient, tenant_id: TenantId, timeline_id: TimelineId ): - status = sk_http_cli.timeline_status(tenant_id, timeline_id) - log.info(f"waiting for walreceivers to be gone, currently {status.walreceivers}") - return len(status.walreceivers) == 0 + """ + Wait until there is no walreceiver connections from the compute(s) on the + safekeeper. + """ + + def walreceivers_absent(): + status = sk_http_cli.timeline_status(tenant_id, timeline_id) + log.info(f"waiting for walreceivers to be gone, currently {status.walreceivers}") + assert len(status.walreceivers) == 0 + + wait_until(30, 0.5, walreceivers_absent) diff --git a/test_runner/regress/test_next_xid.py b/test_runner/regress/test_next_xid.py index 51e847135e..cac74492d7 100644 --- a/test_runner/regress/test_next_xid.py +++ b/test_runner/regress/test_next_xid.py @@ -435,7 +435,9 @@ $$; # Wait until pageserver has received all the data, and restart the endpoint wait_for_wal_insert_lsn(env, endpoint, tenant_id, timeline_id) - endpoint.stop(mode="immediate") # 'immediate' to avoid writing shutdown checkpoint + endpoint.stop( + mode="immediate", sks_wait_walreceiver_gone=(env.safekeepers, timeline_id) + ) # 'immediate' to avoid writing shutdown checkpoint endpoint.start() # Check that the next-multixid value wrapped around correctly diff --git a/test_runner/regress/test_replica_start.py b/test_runner/regress/test_replica_start.py index 0d95109d6b..d5e92b92d1 100644 --- a/test_runner/regress/test_replica_start.py +++ b/test_runner/regress/test_replica_start.py @@ -103,6 +103,7 @@ def test_replica_start_scan_clog_crashed_xids(neon_simple_env: NeonEnv): # Initialize the primary, a test table, and a helper function to create lots # of subtransactions. env = neon_simple_env + timeline_id = env.initial_timeline primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") primary_conn = primary.connect() primary_cur = primary_conn.cursor() @@ -114,7 +115,7 @@ def test_replica_start_scan_clog_crashed_xids(neon_simple_env: NeonEnv): # chance to write abort records for them. primary_cur.execute("begin") primary_cur.execute("select create_subxacts(100000)") - primary.stop(mode="immediate") + primary.stop(mode="immediate", sks_wait_walreceiver_gone=(env.safekeepers, timeline_id)) # Restart the primary. Do some light work, and shut it down cleanly primary.start() @@ -659,6 +660,7 @@ def test_replica_start_with_too_many_unused_xids(neon_simple_env: NeonEnv): # Initialize the primary and a test table env = neon_simple_env + timeline_id = env.initial_timeline primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") with primary.cursor() as primary_cur: primary_cur.execute("create table t(pk serial primary key, payload integer)") @@ -667,7 +669,7 @@ def test_replica_start_with_too_many_unused_xids(neon_simple_env: NeonEnv): with primary.cursor() as primary_cur: primary_cur.execute("insert into t (payload) values (0)") # restart primary - primary.stop("immediate") + primary.stop("immediate", sks_wait_walreceiver_gone=(env.safekeepers, timeline_id)) primary.start() # Wait for the WAL to be flushed diff --git a/test_runner/regress/test_subscriber_restart.py b/test_runner/regress/test_subscriber_restart.py index 91caad7220..647a2e6b14 100644 --- a/test_runner/regress/test_subscriber_restart.py +++ b/test_runner/regress/test_subscriber_restart.py @@ -13,7 +13,7 @@ def test_subscriber_restart(neon_simple_env: NeonEnv): pub = env.endpoints.create("publisher") pub.start() - env.neon_cli.create_branch("subscriber") + sub_timeline_id = env.neon_cli.create_branch("subscriber") sub = env.endpoints.create("subscriber") sub.start() @@ -47,7 +47,7 @@ def test_subscriber_restart(neon_simple_env: NeonEnv): for _ in range(n_restarts): # restart subscriber # time.sleep(2) - sub.stop("immediate") + sub.stop("immediate", sks_wait_walreceiver_gone=(env.safekeepers, sub_timeline_id)) sub.start() thread.join() diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index 3075211ada..ae1b6fdab3 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -247,7 +247,7 @@ def test_vm_bit_clear_on_heap_lock_blackbox(neon_env_builder: NeonEnvBuilder): # in a "clean" way. Our neon extension will write a full-page image of the VM # page, and we want to avoid that. A clean shutdown will also not do, for the # same reason. - endpoint.stop(mode="immediate") + endpoint.stop(mode="immediate", sks_wait_walreceiver_gone=(env.safekeepers, timeline_id)) endpoint.start() pg_conn = endpoint.connect() diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index c75235a04b..25c66c3cae 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -47,7 +47,7 @@ from fixtures.remote_storage import ( s3_storage, ) from fixtures.safekeeper.http import SafekeeperHttpClient -from fixtures.safekeeper.utils import are_walreceivers_absent +from fixtures.safekeeper.utils import wait_walreceivers_absent from fixtures.utils import ( PropagatingThread, get_dir_size, @@ -1061,6 +1061,7 @@ def test_restart_endpoint(neon_env_builder: NeonEnvBuilder): # https://github.com/neondatabase/neon/issues/8911 def test_restart_endpoint_after_switch_wal(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() + timeline_id = env.initial_timeline endpoint = env.endpoints.create_start("main") @@ -1070,7 +1071,7 @@ def test_restart_endpoint_after_switch_wal(neon_env_builder: NeonEnvBuilder): # we want immediate shutdown to have endpoint restart on xlog switch record, # so prevent shutdown checkpoint. - endpoint.stop(mode="immediate") + endpoint.stop(mode="immediate", sks_wait_walreceiver_gone=(env.safekeepers, timeline_id)) endpoint = env.endpoints.create_start("main") endpoint.safe_psql("SELECT 'works'") @@ -1222,10 +1223,7 @@ def wait_flush_lsn_align_by_ep(env, branch, tenant_id, timeline_id, ep, sks): # Even if there is no compute, there might be some in flight data; ensure # all walreceivers die before rechecking. for sk_http_cli in sk_http_clis: - wait( - partial(are_walreceivers_absent, sk_http_cli, tenant_id, timeline_id), - "walreceivers to be gone", - ) + wait_walreceivers_absent(sk_http_cli, tenant_id, timeline_id) # Now recheck again flush_lsn and exit if it is good if is_flush_lsn_aligned(sk_http_clis, tenant_id, timeline_id): return From 62e22dfd8530ccfed827949499716d76b5aca9a8 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 1 Oct 2024 20:55:05 +0300 Subject: [PATCH 26/27] Backpressure: reset ps display after it is done. (#8980) Previously we set the 'backpressure throttling' status, but overwrote current one and never reset it back. --- pgxn/neon/walproposer_pg.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index bb65a11c7d..89d4cb061f 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -422,6 +422,9 @@ backpressure_throttling_impl(void) TimestampTz start, stop; bool retry = false; + char *new_status = NULL; + const char *old_status; + int len; if (PointerIsValid(PrevProcessInterruptsCallback)) retry = PrevProcessInterruptsCallback(); @@ -442,14 +445,24 @@ backpressure_throttling_impl(void) if (lag == 0) return retry; - /* Suspend writers until replicas catch up */ - set_ps_display("backpressure throttling"); + + old_status = get_ps_display(&len); + new_status = (char *) palloc(len + 64 + 1); + memcpy(new_status, old_status, len); + snprintf(new_status + len, 64, "backpressure throttling: lag %lu", lag); + set_ps_display(new_status); + new_status[len] = '\0'; /* truncate off " backpressure ..." to later reset the ps */ elog(DEBUG2, "backpressure throttling: lag %lu", lag); start = GetCurrentTimestamp(); pg_usleep(BACK_PRESSURE_DELAY); stop = GetCurrentTimestamp(); pg_atomic_add_fetch_u64(&walprop_shared->backpressureThrottlingTime, stop - start); + + /* Reset ps display */ + set_ps_display(new_status); + pfree(new_status); + return true; } From 8861e8a323843610d47646db9a4e4a9217924f61 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 1 Oct 2024 22:07:51 +0300 Subject: [PATCH 27/27] Fix the size of the perf counters shared memory array (#9226) MaxBackends doesn't include auxiliary processes. Whenever an aux process made IO operations that updated the counters, they would scribble over shared memory beoynd the end of the array. The relsize cache hash table comes after the array, so the symptom was an error about hash table corruption in the relsize cache hash. --- pgxn/neon/neon_perf_counters.c | 9 +++++---- pgxn/neon/neon_perf_counters.h | 8 ++++++++ pgxn/neon/pagestore_smgr.c | 14 ++++++++++++++ 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/pgxn/neon/neon_perf_counters.c b/pgxn/neon/neon_perf_counters.c index eecbfe98c6..9bce81bf2e 100644 --- a/pgxn/neon/neon_perf_counters.c +++ b/pgxn/neon/neon_perf_counters.c @@ -27,7 +27,8 @@ NeonPerfCountersShmemSize(void) { Size size = 0; - size = add_size(size, mul_size(MaxBackends, sizeof(neon_per_backend_counters))); + size = add_size(size, mul_size(NUM_NEON_PERF_COUNTER_SLOTS, + sizeof(neon_per_backend_counters))); return size; } @@ -39,7 +40,7 @@ NeonPerfCountersShmemInit(void) neon_per_backend_counters_shared = ShmemInitStruct("Neon perf counters", - mul_size(MaxBackends, + mul_size(NUM_NEON_PERF_COUNTER_SLOTS, sizeof(neon_per_backend_counters)), &found); Assert(found == IsUnderPostmaster); @@ -192,7 +193,7 @@ neon_get_backend_perf_counters(PG_FUNCTION_ARGS) /* We put all the tuples into a tuplestore in one go. */ InitMaterializedSRF(fcinfo, 0); - for (int procno = 0; procno < MaxBackends; procno++) + for (int procno = 0; procno < NUM_NEON_PERF_COUNTER_SLOTS; procno++) { PGPROC *proc = GetPGProcByNumber(procno); int pid = proc->pid; @@ -231,7 +232,7 @@ neon_get_perf_counters(PG_FUNCTION_ARGS) InitMaterializedSRF(fcinfo, 0); /* Aggregate the counters across all backends */ - for (int procno = 0; procno < MaxBackends; procno++) + for (int procno = 0; procno < NUM_NEON_PERF_COUNTER_SLOTS; procno++) { neon_per_backend_counters *counters = &neon_per_backend_counters_shared[procno]; diff --git a/pgxn/neon/neon_perf_counters.h b/pgxn/neon/neon_perf_counters.h index 02163ada55..49d477c4f8 100644 --- a/pgxn/neon/neon_perf_counters.h +++ b/pgxn/neon/neon_perf_counters.h @@ -96,6 +96,14 @@ typedef struct /* Pointer to the shared memory array of neon_per_backend_counters structs */ extern neon_per_backend_counters *neon_per_backend_counters_shared; +/* + * Size of the perf counters array in shared memory. One slot for each backend + * and aux process. IOW one for each PGPROC slot, except for slots reserved + * for prepared transactions, because they're not real processes and cannot do + * I/O. + */ +#define NUM_NEON_PERF_COUNTER_SLOTS (MaxBackends + NUM_AUXILIARY_PROCS) + #if PG_VERSION_NUM >= 170000 #define MyNeonCounters (&neon_per_backend_counters_shared[MyProcNumber]) #else diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 1c87f4405c..155756f8b3 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -1773,6 +1773,20 @@ neon_init(void) if (MyPState != NULL) return; + /* + * Sanity check that theperf counters array is sized correctly. We got + * this wrong once, and the formula for max number of backends and aux + * processes might well change in the future, so better safe than sorry. + * This is a very cheap check so we do it even without assertions. On + * v14, this gets called before initializing MyProc, so we cannot perform + * the check here. That's OK, we don't expect the logic to change in old + * releases. + */ +#if PG_VERSION_NUM>=150000 + if (MyNeonCounters >= &neon_per_backend_counters_shared[NUM_NEON_PERF_COUNTER_SLOTS]) + elog(ERROR, "MyNeonCounters points past end of array"); +#endif + prfs_size = offsetof(PrefetchState, prf_buffer) + sizeof(PrefetchRequest) * readahead_buffer_size;