From ae15acdee7d435d8fc61036227dde02ca7fa7462 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 4 Apr 2024 13:28:22 +0300 Subject: [PATCH 1/4] Fix bug in prefetch cleanup (#7277) ## Problem Running test_pageserver_restarts_under_workload in POR #7275 I get the following assertion failure in prefetch: ``` #5 0x00005587220d4bf0 in ExceptionalCondition ( conditionName=0x7fbf24d003c8 "(ring_index) < MyPState->ring_unused && (ring_index) >= MyPState->ring_last", fileName=0x7fbf24d00240 "/home/knizhnik/neon.main//pgxn/neon/pagestore_smgr.c", lineNumber=644) at /home/knizhnik/neon.main//vendor/postgres-v16/src/backend/utils/error/assert.c:66 #6 0x00007fbf24cebc9b in prefetch_set_unused (ring_index=1509) at /home/knizhnik/neon.main//pgxn/neon/pagestore_smgr.c:644 #7 0x00007fbf24cec613 in prefetch_register_buffer (tag=..., force_latest=0x0, force_lsn=0x0) at /home/knizhnik/neon.main//pgxn/neon/pagestore_smgr.c:891 #8 0x00007fbf24cef21e in neon_prefetch (reln=0x5587233b7388, forknum=MAIN_FORKNUM, blocknum=14110) at /home/knizhnik/neon.main//pgxn/neon/pagestore_smgr.c:2055 (gdb) p ring_index $1 = 1509 (gdb) p MyPState->ring_unused $2 = 1636 (gdb) p MyPState->ring_last $3 = 1636 ``` ## Summary of changes Check status of `prefetch_wait_for` ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/libpagestore.c | 21 +++++++++++---------- pgxn/neon/pagestore_smgr.c | 18 +++++++++++------- 2 files changed, 22 insertions(+), 17 deletions(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 1bc8a2e87c..2276b4e807 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -495,16 +495,17 @@ retry: static void pageserver_disconnect(shardno_t shard_no) { - if (page_servers[shard_no].conn) - { - /* - * If the connection to any pageserver is lost, we throw away the - * whole prefetch queue, even for other pageservers. It should not - * cause big problems, because connection loss is supposed to be a - * rare event. - */ - prefetch_on_ps_disconnect(); - } + /* + * If the connection to any pageserver is lost, we throw away the + * whole prefetch queue, even for other pageservers. It should not + * cause big problems, because connection loss is supposed to be a + * rare event. + * + * Prefetch state should be reset even if page_servers[shard_no].conn == NULL, + * because prefetch request may be registered before connection is established. + */ + prefetch_on_ps_disconnect(); + pageserver_disconnect_shard(shard_no); } diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index b33cfab2bb..57a16e00ca 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -641,13 +641,12 @@ prefetch_on_ps_disconnect(void) static inline void prefetch_set_unused(uint64 ring_index) { - PrefetchRequest *slot = GetPrfSlot(ring_index); + PrefetchRequest *slot; if (ring_index < MyPState->ring_last) return; /* Should already be unused */ - Assert(MyPState->ring_unused > ring_index); - + slot = GetPrfSlot(ring_index); if (slot->status == PRFS_UNUSED) return; @@ -806,7 +805,8 @@ Retry: { if (*force_lsn > slot->effective_request_lsn) { - prefetch_wait_for(ring_index); + if (!prefetch_wait_for(ring_index)) + goto Retry; prefetch_set_unused(ring_index); entry = NULL; } @@ -821,7 +821,8 @@ Retry: { if (*force_lsn != slot->effective_request_lsn) { - prefetch_wait_for(ring_index); + if (!prefetch_wait_for(ring_index)) + goto Retry; prefetch_set_unused(ring_index); entry = NULL; } @@ -887,7 +888,8 @@ Retry: { case PRFS_REQUESTED: Assert(MyPState->ring_receive == cleanup_index); - prefetch_wait_for(cleanup_index); + if (!prefetch_wait_for(cleanup_index)) + goto Retry; prefetch_set_unused(cleanup_index); break; case PRFS_RECEIVED: @@ -2140,6 +2142,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, /* * Try to find prefetched page in the list of received pages. */ + Retry: entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag); if (entry != NULL) @@ -2161,7 +2164,8 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, */ if (slot->status == PRFS_REQUESTED) { - prefetch_wait_for(slot->my_ring_index); + if (!prefetch_wait_for(slot->my_ring_index)) + goto Retry; } /* drop caches */ prefetch_set_unused(slot->my_ring_index); From 7ce613354e5230ab51a81ddb092c52d9e13810f3 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Thu, 4 Apr 2024 12:29:10 +0200 Subject: [PATCH 2/4] Fix length (#7308) ## Problem Bug ## Summary of changes Use `compressed_data.len()` instead of `data.len()`. --- proxy/src/usage_metrics.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index 2ad0883fb0..b21056735d 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -461,7 +461,7 @@ async fn upload_events_chunk( || async { let stream = futures::stream::once(futures::future::ready(Ok(compressed_data.clone()))); storage - .upload(stream, data.len(), remote_path, None, cancel) + .upload(stream, compressed_data.len(), remote_path, None, cancel) .await }, TimeoutOrCancel::caused_by_cancel, From 375e15815c2d4adc6b435dafeb1218ad47c28a6a Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 4 Apr 2024 12:22:08 +0100 Subject: [PATCH 3/4] storage controller: grant 'admin' access to all APIs (#7307) ## Problem Currently, using `storcon-cli` requires user to select a token with either `pageserverapi` or `admin` scope depending on which endpoint they're using. ## Summary of changes - In check_permissions, permit access with the admin scope even if the required scope is missing. The effect is that an endpoint that required `pageserverapi` now accepts either `pageserverapi` or `admin`, and for the CLI one can simply use an `admin` scope token for everything. --- control_plane/attachment_service/src/http.rs | 10 +++++++++- test_runner/regress/test_sharding_service.py | 7 ++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/control_plane/attachment_service/src/http.rs b/control_plane/attachment_service/src/http.rs index 03883f0ca2..c59bcaa174 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/control_plane/attachment_service/src/http.rs @@ -602,9 +602,17 @@ where .await } +/// Check if the required scope is held in the request's token, or if the request has +/// a token with 'admin' scope then always permit it. fn check_permissions(request: &Request, required_scope: Scope) -> Result<(), ApiError> { check_permission_with(request, |claims| { - crate::auth::check_permission(claims, required_scope) + match crate::auth::check_permission(claims, required_scope) { + Err(e) => match crate::auth::check_permission(claims, Scope::Admin) { + Ok(()) => Ok(()), + Err(_) => Err(e), + }, + Ok(()) => Ok(()), + } }) } diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_sharding_service.py index 7df0b58596..233d3b9603 100644 --- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_sharding_service.py @@ -724,13 +724,18 @@ def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder): StorageControllerApiException, match="Forbidden: JWT authentication error", ): - svc.request("POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.ADMIN)) + svc.request( + "POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.SAFEKEEPER_DATA) + ) # Token with correct scope svc.request( "POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.PAGE_SERVER_API) ) + # Token with admin scope should also be permitted + svc.request("POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.ADMIN)) + # No token with pytest.raises( StorageControllerApiException, From 9d754e984f81dbaaf996f2f19e5756847dc8f508 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 4 Apr 2024 13:41:04 +0100 Subject: [PATCH 4/4] storage_controller: setup sentry reporting (#7311) ## Problem No alerting for storage controller is in place. ## Summary of changes Set up sentry for the storage controller. --- control_plane/attachment_service/src/main.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/control_plane/attachment_service/src/main.rs b/control_plane/attachment_service/src/main.rs index bd8d7f5c59..5150468537 100644 --- a/control_plane/attachment_service/src/main.rs +++ b/control_plane/attachment_service/src/main.rs @@ -13,6 +13,7 @@ use tokio_util::sync::CancellationToken; use utils::auth::{JwtAuth, SwappableJwtAuth}; use utils::logging::{self, LogFormat}; +use utils::sentry_init::init_sentry; use utils::{project_build_tag, project_git_version, tcp_listener}; project_git_version!(GIT_VERSION); @@ -158,6 +159,8 @@ fn main() -> anyhow::Result<()> { std::process::exit(1); })); + let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); + tokio::runtime::Builder::new_current_thread() // We use spawn_blocking for database operations, so require approximately // as many blocking threads as we will open database connections.