From 38384c37ac50f04fe992795c27f0d5cf4291c2a5 Mon Sep 17 00:00:00 2001 From: Aleksandr Sarantsev <99037063+ephemeralsad@users.noreply.github.com> Date: Tue, 8 Jul 2025 17:15:14 +0400 Subject: [PATCH 01/28] Make node deletion context-aware (#12494) ## Problem Deletion process does not calculate preferred nodes correctly - it doesn't consider current tenant-shard layout among all pageservers. ## Summary of changes - Added a schedule context calculation for node deletion Co-authored-by: Aleksandr Sarantsev --- storage_controller/src/service.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 9360225396..403ae15b59 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -7208,6 +7208,12 @@ impl Service { let mut locked = self.inner.write().unwrap(); let (nodes, tenants, scheduler) = locked.parts_mut(); + // Calculate a schedule context here to avoid borrow checker issues. + let mut schedule_context = ScheduleContext::default(); + for (_, shard) in tenants.range(TenantShardId::tenant_range(tid.tenant_id)) { + schedule_context.avoid(&shard.intent.all_pageservers()); + } + let tenant_shard = match tenants.get_mut(&tid) { Some(tenant_shard) => tenant_shard, None => { @@ -7233,9 +7239,6 @@ impl Service { } if tenant_shard.deref_node(node_id) { - // TODO(ephemeralsad): we should process all shards in a tenant at once, so - // we can avoid settling the tenant unevenly. - let mut schedule_context = ScheduleContext::new(ScheduleMode::Normal); if let Err(e) = tenant_shard.schedule(scheduler, &mut schedule_context) { tracing::error!( "Refusing to delete node, shard {} can't be rescheduled: {e}", From 7458d031b1f7adfc6cbc652cc0903d58141f6746 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 8 Jul 2025 15:59:45 +0200 Subject: [PATCH 02/28] clippy: fix unfounded warning on macOS (#12501) Before this PR, macOS builds would get clippy warning ``` warning: `tokio_epoll_uring::thread_local_system` does not refer to an existing function ``` The reason is that the `thread_local_system` function is only defined on Linux. Add `allow-invalid = true` to make macOS clippy pass, and manually test that on Linux builds, clippy still fails when we use it. refs - https://databricks.slack.com/archives/C09254R641L/p1751917655527099 Co-authored-by: Christian Schwarz --- clippy.toml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/clippy.toml b/clippy.toml index 408232488c..c03059053a 100644 --- a/clippy.toml +++ b/clippy.toml @@ -1,9 +1,12 @@ disallowed-methods = [ "tokio::task::block_in_place", + # Allow this for now, to deny it later once we stop using Handle::block_on completely # "tokio::runtime::Handle::block_on", - # use tokio_epoll_uring_ext instead - "tokio_epoll_uring::thread_local_system", + + # tokio-epoll-uring: + # - allow-invalid because the method doesn't exist on macOS + { path = "tokio_epoll_uring::thread_local_system", replacement = "tokio_epoll_uring_ext module inside pageserver crate", allow-invalid = true } ] disallowed-macros = [ From f72115d0a99c148953a15ec5fbf5a49f86ca741f Mon Sep 17 00:00:00 2001 From: Mikhail Date: Tue, 8 Jul 2025 15:37:24 +0100 Subject: [PATCH 03/28] Endpoint storage openapi spec (#12361) https://github.com/neondatabase/cloud/issues/19011 --- endpoint_storage/src/app.rs | 2 + endpoint_storage/src/openapi_spec.yml | 146 ++++++++++++++++++++++++++ 2 files changed, 148 insertions(+) create mode 100644 endpoint_storage/src/openapi_spec.yml diff --git a/endpoint_storage/src/app.rs b/endpoint_storage/src/app.rs index 42431c0066..a7a18743ef 100644 --- a/endpoint_storage/src/app.rs +++ b/endpoint_storage/src/app.rs @@ -13,6 +13,8 @@ use utils::backoff::retry; pub fn app(state: Arc) -> Router<()> { use axum::routing::{delete as _delete, get as _get}; let delete_prefix = _delete(delete_prefix); + // NB: On any changes do not forget to update the OpenAPI spec + // in /endpoint_storage/src/openapi_spec.yml. Router::new() .route( "/{tenant_id}/{timeline_id}/{endpoint_id}/{*path}", diff --git a/endpoint_storage/src/openapi_spec.yml b/endpoint_storage/src/openapi_spec.yml new file mode 100644 index 0000000000..8d9abf902c --- /dev/null +++ b/endpoint_storage/src/openapi_spec.yml @@ -0,0 +1,146 @@ +openapi: "3.0.2" +info: + title: Endpoint Storage API + description: Endpoint Storage API + version: "1.0" + license: + name: "Apache" + url: https://github.com/neondatabase/neon/blob/main/LICENSE +servers: + - url: "" +paths: + /status: + description: Healthcheck endpoint + get: + description: Healthcheck + security: [] + responses: + "200": + description: OK + + /{tenant_id}/{timeline_id}/{endpoint_id}/{key}: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + - name: endpoint_id + in: path + required: true + schema: + type: string + - name: key + in: path + required: true + schema: + type: string + get: + description: Get file from blob storage + responses: + "200": + description: "File stream from blob storage" + content: + application/octet-stream: + schema: + type: string + format: binary + "400": + description: File was not found + "403": + description: JWT does not authorize request to this route + put: + description: Insert file into blob storage. If file exists, override it + requestBody: + content: + application/octet-stream: + schema: + type: string + format: binary + responses: + "200": + description: File was inserted successfully + "403": + description: JWT does not authorize request to this route + delete: + description: Delete file from blob storage + responses: + "200": + description: File was successfully deleted or not found + "403": + description: JWT does not authorize request to this route + + /{tenant_id}/{timeline_id}/{endpoint_id}: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + - name: endpoint_id + in: path + required: true + schema: + type: string + delete: + description: Delete endpoint data from blob storage + responses: + "200": + description: Endpoint data was deleted + "403": + description: JWT does not authorize request to this route + + /{tenant_id}/{timeline_id}: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + - name: timeline_id + in: path + required: true + schema: + type: string + delete: + description: Delete timeline data from blob storage + responses: + "200": + description: Timeline data was deleted + "403": + description: JWT does not authorize request to this route + + /{tenant_id}: + parameters: + - name: tenant_id + in: path + required: true + schema: + type: string + delete: + description: Delete tenant data from blob storage + responses: + "200": + description: Tenant data was deleted + "403": + description: JWT does not authorize request to this route + +components: + securitySchemes: + JWT: + type: http + scheme: bearer + bearerFormat: JWT + +security: + - JWT: [] From 8a042fb8ed62d56641836577fb01ccee4103dfb8 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 8 Jul 2025 17:03:32 +0200 Subject: [PATCH 04/28] refactor(compaction): eliminate `CompactionError::Offload` variant, map to `::Other` (#12505) Looks can be deceiving: the match blocks in `maybe_trip_compaction_breaker` and at the end of `compact_with_options` seem like differentiated error handling, but in reality, these branches are unreachable at runtime because the only source of `CompactionError::Offload` within the compaction code is at the end of `Tenant::compaction_iteration`. We can simply map offload cancellation to CompactionError::Cancelled and all other offload errors to ::Other, since there's no differentiated handling for them in the compaction code. Also, the OffloadError::RemoteStorage variant has no differentiated handling, but was wrapping the remote storage anyhow::Error in a `anyhow(thiserror(anyhow))` sandwich. This PR removes that variant, mapping all RemoteStorage errors to `OffloadError::Other`. Thereby, the sandwich is gone and we will get a proper anyhow backtrace to the remote storage error location if when we debug-print the OffloadError (or the CompactionError if we map it to that). refs - https://databricks.atlassian.net/browse/LKB-182 - the observation that there's no need for differentiated handling of CompactionError::Offload was made in https://databricks.slack.com/archives/C09254R641L/p1751286453930269?thread_ts=1751284317.955159&cid=C09254R641L --- pageserver/src/http/routes.rs | 1 - pageserver/src/tenant.rs | 7 +++---- pageserver/src/tenant/tasks.rs | 1 - pageserver/src/tenant/timeline.rs | 17 ----------------- pageserver/src/tenant/timeline/offload.rs | 4 +--- 5 files changed, 4 insertions(+), 26 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 02094e6aa9..23a090045a 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2501,7 +2501,6 @@ async fn timeline_checkpoint_handler( .map_err(|e| match e { CompactionError::ShuttingDown => ApiError::ShuttingDown, - CompactionError::Offload(e) => ApiError::InternalServerError(anyhow::anyhow!(e)), CompactionError::CollectKeySpaceError(e) => ApiError::InternalServerError(anyhow::anyhow!(e)), CompactionError::Other(e) => ApiError::InternalServerError(e), CompactionError::AlreadyRunning(_) => ApiError::InternalServerError(anyhow::anyhow!(e)), diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 9aabd6341f..ad767a1672 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3288,7 +3288,9 @@ impl TenantShard { // Ignore this, we likely raced with unarchival. OffloadError::NotArchived => Ok(()), OffloadError::AlreadyInProgress => Ok(()), - err => Err(err), + OffloadError::Cancelled => Err(CompactionError::ShuttingDown), + // don't break the anyhow chain + OffloadError::Other(err) => Err(CompactionError::Other(err)), })?; } @@ -3319,9 +3321,6 @@ impl TenantShard { match err { err if err.is_cancel() => {} CompactionError::ShuttingDown => (), - // Offload failures don't trip the circuit breaker, since they're cheap to retry and - // shouldn't block compaction. - CompactionError::Offload(_) => {} CompactionError::CollectKeySpaceError(err) => { // CollectKeySpaceError::Cancelled and PageRead::Cancelled are handled in `err.is_cancel` branch. self.compaction_circuit_breaker diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 954dd38bb4..2ba1ad2674 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -303,7 +303,6 @@ pub(crate) fn log_compaction_error( let level = match err { e if e.is_cancel() => return, ShuttingDown => return, - Offload(_) => Level::ERROR, AlreadyRunning(_) => Level::ERROR, CollectKeySpaceError(_) => Level::ERROR, _ if task_cancelled => Level::INFO, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index aeced98859..c2b49c0296 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -40,7 +40,6 @@ use layer_manager::{ Shutdown, }; -use offload::OffloadError; use once_cell::sync::Lazy; use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL; use pageserver_api::key::{ @@ -2078,9 +2077,6 @@ impl Timeline { // Cancelled errors are covered by the `Err(e) if e.is_cancel()` branch. self.compaction_failed.store(true, AtomicOrdering::Relaxed) } - // Don't change the current value on offload failure or shutdown. We don't want to - // abruptly stall nor resume L0 flushes in these cases. - Err(CompactionError::Offload(_)) => {} }; result @@ -6017,9 +6013,6 @@ impl Drop for Timeline { pub(crate) enum CompactionError { #[error("The timeline or pageserver is shutting down")] ShuttingDown, - /// Compaction tried to offload a timeline and failed - #[error("Failed to offload timeline: {0}")] - Offload(OffloadError), /// Compaction cannot be done right now; page reconstruction and so on. #[error("Failed to collect keyspace: {0}")] CollectKeySpaceError(#[from] CollectKeySpaceError), @@ -6040,7 +6033,6 @@ impl CompactionError { | Self::CollectKeySpaceError(CollectKeySpaceError::PageRead( PageReconstructError::Cancelled )) - | Self::Offload(OffloadError::Cancelled) ) } @@ -6058,15 +6050,6 @@ impl CompactionError { } } -impl From for CompactionError { - fn from(e: OffloadError) -> Self { - match e { - OffloadError::Cancelled => Self::ShuttingDown, - _ => Self::Offload(e), - } - } -} - impl From for CompactionError { fn from(value: super::upload_queue::NotInitialized) -> Self { match value { diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 9464f034c7..e9cf2e9aa7 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -17,8 +17,6 @@ pub(crate) enum OffloadError { Cancelled, #[error("Timeline is not archived")] NotArchived, - #[error(transparent)] - RemoteStorage(anyhow::Error), #[error("Offload or deletion already in progress")] AlreadyInProgress, #[error("Unexpected offload error: {0}")] @@ -29,7 +27,7 @@ impl From for OffloadError { fn from(e: TenantManifestError) -> Self { match e { TenantManifestError::Cancelled => Self::Cancelled, - TenantManifestError::RemoteStorage(e) => Self::RemoteStorage(e), + TenantManifestError::RemoteStorage(e) => Self::Other(e), } } } From 29d73e140453cc18acf96475c8a1dbf57f354468 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Tue, 8 Jul 2025 17:49:42 +0200 Subject: [PATCH 05/28] http-utils: Temporarily accept duplicate params (#12504) ## Problem Grafana Alloy in cluster mode seems to send duplicate "seconds" scrape URL parameters when one of its instances is disrupted. ## Summary of changes Temporarily accept duplicate parameters as long as their value is identical. --- libs/http-utils/src/request.rs | 68 ++++++++++++++++++++++++++++++---- 1 file changed, 61 insertions(+), 7 deletions(-) diff --git a/libs/http-utils/src/request.rs b/libs/http-utils/src/request.rs index 9024a90a82..afb2ae8f47 100644 --- a/libs/http-utils/src/request.rs +++ b/libs/http-utils/src/request.rs @@ -41,17 +41,35 @@ pub fn get_query_param<'a>( Some(q) => q, None => return Ok(None), }; - let mut values = url::form_urlencoded::parse(query.as_bytes()) + let values = url::form_urlencoded::parse(query.as_bytes()) .filter_map(|(k, v)| if k == param_name { Some(v) } else { None }) // we call .next() twice below. If it's None the first time, .fuse() ensures it's None afterwards .fuse(); - let value1 = values.next(); - if values.next().is_some() { - return Err(ApiError::BadRequest(anyhow!( - "param {param_name} specified more than once" - ))); - } + // Work around an issue with Alloy's pyroscope scrape where the "seconds" + // parameter is added several times. https://github.com/grafana/alloy/issues/3026 + // TODO: revert after Alloy is fixed. + let value1 = values + .map(Ok) + .reduce(|acc, i| { + match acc { + Err(_) => acc, + + // It's okay to have duplicates as along as they have the same value. + Ok(ref a) if a == &i.unwrap() => acc, + + _ => Err(ApiError::BadRequest(anyhow!( + "param {param_name} specified more than once" + ))), + } + }) + .transpose()?; + // if values.next().is_some() { + // return Err(ApiError::BadRequest(anyhow!( + // "param {param_name} specified more than once" + // ))); + // } + Ok(value1) } @@ -92,3 +110,39 @@ pub async fn ensure_no_body(request: &mut Request) -> Result<(), ApiError> None => Ok(()), } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_get_query_param_duplicate() { + let req = Request::builder() + .uri("http://localhost:12345/testuri?testparam=1") + .body(hyper::Body::empty()) + .unwrap(); + let value = get_query_param(&req, "testparam").unwrap(); + assert_eq!(value.unwrap(), "1"); + + let req = Request::builder() + .uri("http://localhost:12345/testuri?testparam=1&testparam=1") + .body(hyper::Body::empty()) + .unwrap(); + let value = get_query_param(&req, "testparam").unwrap(); + assert_eq!(value.unwrap(), "1"); + + let req = Request::builder() + .uri("http://localhost:12345/testuri") + .body(hyper::Body::empty()) + .unwrap(); + let value = get_query_param(&req, "testparam").unwrap(); + assert!(value.is_none()); + + let req = Request::builder() + .uri("http://localhost:12345/testuri?testparam=1&testparam=2&testparam=3") + .body(hyper::Body::empty()) + .unwrap(); + let value = get_query_param(&req, "testparam"); + assert!(value.is_err()); + } +} From f9b05a42d7a408bc98f485463c13f58496101160 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 8 Jul 2025 18:45:34 +0200 Subject: [PATCH 06/28] refactor(compaction): remove `CompactionError::AlreadyRunning` variant, use `::Other` instead (#12512) The only call stack that can emit the `::AlreadyRunning` variant is ``` -> iteration_inner -> iteration -> compaction_iteration -> compaction_loop -> start_background_loops ``` And on that call stack, the only differentiated handling of it is its invocations of `log_compaction_error -> CompactionError::is_cancel()`, which returns `true` for `::AlreadyRunning`. I think the condition of `AlreadyRunning` is severe; it really shouldn't happen. So, this PR starts treating it as something that is to be logged at `ERROR` / `WARN` level, depending on the `degrate_to_warning` argument to `log_compaction_error`. refs - https://databricks.atlassian.net/browse/LKB-182 --- pageserver/src/http/routes.rs | 3 +-- pageserver/src/tenant.rs | 1 - pageserver/src/tenant/tasks.rs | 1 - pageserver/src/tenant/timeline.rs | 6 ------ pageserver/src/tenant/timeline/compaction.rs | 8 ++++---- 5 files changed, 5 insertions(+), 14 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 23a090045a..55582659df 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2503,7 +2503,6 @@ async fn timeline_checkpoint_handler( CompactionError::ShuttingDown => ApiError::ShuttingDown, CompactionError::CollectKeySpaceError(e) => ApiError::InternalServerError(anyhow::anyhow!(e)), CompactionError::Other(e) => ApiError::InternalServerError(e), - CompactionError::AlreadyRunning(_) => ApiError::InternalServerError(anyhow::anyhow!(e)), } )?; } @@ -3697,7 +3696,7 @@ async fn tenant_evaluate_feature_flag( let tenant = state .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; - // TODO: the properties we get here might be stale right after it is collected. But such races are rare (updated every 10s) + // TODO: the properties we get here might be stale right after it is collected. But such races are rare (updated every 10s) // and we don't need to worry about it for now. let properties = tenant.feature_resolver.collect_properties(); if as_type.as_deref() == Some("boolean") { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index ad767a1672..49b92915da 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3334,7 +3334,6 @@ impl TenantShard { .unwrap() .fail(&CIRCUIT_BREAKERS_BROKEN, err); } - CompactionError::AlreadyRunning(_) => {} } } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 2ba1ad2674..356f495972 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -303,7 +303,6 @@ pub(crate) fn log_compaction_error( let level = match err { e if e.is_cancel() => return, ShuttingDown => return, - AlreadyRunning(_) => Level::ERROR, CollectKeySpaceError(_) => Level::ERROR, _ if task_cancelled => Level::INFO, Other(err) => { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c2b49c0296..296b922599 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2067,9 +2067,6 @@ impl Timeline { Err(CompactionError::ShuttingDown) => { // Covered by the `Err(e) if e.is_cancel()` branch. } - Err(CompactionError::AlreadyRunning(_)) => { - // Covered by the `Err(e) if e.is_cancel()` branch. - } Err(CompactionError::Other(_)) => { self.compaction_failed.store(true, AtomicOrdering::Relaxed) } @@ -6018,8 +6015,6 @@ pub(crate) enum CompactionError { CollectKeySpaceError(#[from] CollectKeySpaceError), #[error(transparent)] Other(anyhow::Error), - #[error("Compaction already running: {0}")] - AlreadyRunning(&'static str), } impl CompactionError { @@ -6028,7 +6023,6 @@ impl CompactionError { matches!( self, Self::ShuttingDown - | Self::AlreadyRunning(_) | Self::CollectKeySpaceError(CollectKeySpaceError::Cancelled) | Self::CollectKeySpaceError(CollectKeySpaceError::PageRead( PageReconstructError::Cancelled diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index ac3930fb71..2c0b98c1e2 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -572,7 +572,7 @@ impl GcCompactionQueue { match res { Ok(res) => Ok(res), Err(CompactionError::ShuttingDown) => Err(CompactionError::ShuttingDown), - Err(_) => { + Err(CompactionError::CollectKeySpaceError(_) | CompactionError::Other(_)) => { // There are some cases where traditional gc might collect some layer // files causing gc-compaction cannot read the full history of the key. // This needs to be resolved in the long-term by improving the compaction @@ -591,9 +591,9 @@ impl GcCompactionQueue { timeline: &Arc, ) -> Result { let Ok(_one_op_at_a_time_guard) = self.consumer_lock.try_lock() else { - return Err(CompactionError::AlreadyRunning( - "cannot run gc-compaction because another gc-compaction is running. This should not happen because we only call this function from the gc-compaction queue.", - )); + return Err(CompactionError::Other(anyhow::anyhow!( + "cannot run gc-compaction because another gc-compaction is running. This should not happen because we only call this function from the gc-compaction queue." + ))); }; let has_pending_tasks; let mut yield_for_l0 = false; From 477ab12b691be1e44f557d45dbe64294009259d4 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 8 Jul 2025 17:46:55 +0100 Subject: [PATCH 07/28] pageserver: touch up broker subscription reset (#12503) ## Problem The goal of this code was to test out if resetting the broker subscription helps alleviate the issues we've been seeing in staging. Looks like it did the trick. However, the original version was too eager. ## Summary of Changes Only reset the stream when: * we are waiting for WAL * there's no connection candidates lined up * we're not already connected to a safekeeper --- .../timeline/walreceiver/connection_manager.rs | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 9b151d2449..aba94244a3 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -182,12 +182,19 @@ pub(super) async fn connection_manager_loop_step( } }, + // If we've not received any updates from the broker from a while, are waiting for WAL + // and have no safekeeper connection or connection candidates, then it might be that + // the broker subscription is wedged. Drop the currrent subscription and re-subscribe + // with the goal of unblocking it. _ = broker_reset_interval.tick() => { - if wait_lsn_status.borrow().is_some() { - tracing::warn!("No broker updates received for a while, but waiting for WAL. Re-setting stream ...") - } + let awaiting_lsn = wait_lsn_status.borrow().is_some(); + let no_candidates = connection_manager_state.wal_stream_candidates.is_empty(); + let no_connection = connection_manager_state.wal_connection.is_none(); - broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?; + if awaiting_lsn && no_candidates && no_connection { + tracing::warn!("No broker updates received for a while, but waiting for WAL. Re-setting stream ..."); + broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?; + } }, new_event = async { From a06c560ad05ecec0c13901f97916807259665bfa Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 8 Jul 2025 12:55:00 -0400 Subject: [PATCH 08/28] feat(pageserver): critical path feature flags (#12449) ## Problem Some feature flags are used heavily on the critical path and we want the "get feature flag" operation as cheap as possible. ## Summary of changes Add a `test_remote_size_flag` as an example of such flags. In the future, we can use macro to generate all those fields. The flag is updated in the housekeeping loop. The retrieval of the flag is simply reading an atomic flag. --------- Signed-off-by: Alex Chi Z --- pageserver/src/feature_resolver.rs | 29 ++++++++++++++++++++++++----- pageserver/src/tenant.rs | 8 ++++---- pageserver/src/tenant/timeline.rs | 6 +++--- 3 files changed, 31 insertions(+), 12 deletions(-) diff --git a/pageserver/src/feature_resolver.rs b/pageserver/src/feature_resolver.rs index 6ce4522080..65cac8eea1 100644 --- a/pageserver/src/feature_resolver.rs +++ b/pageserver/src/feature_resolver.rs @@ -1,4 +1,8 @@ -use std::{collections::HashMap, sync::Arc, time::Duration}; +use std::{ + collections::HashMap, + sync::{Arc, atomic::AtomicBool}, + time::Duration, +}; use arc_swap::ArcSwap; use pageserver_api::config::NodeMetadata; @@ -355,11 +359,17 @@ impl PerTenantProperties { } } -#[derive(Clone)] pub struct TenantFeatureResolver { inner: FeatureResolver, tenant_id: TenantId, - cached_tenant_properties: Arc>>, + cached_tenant_properties: ArcSwap>, + + // Add feature flag on the critical path below. + // + // If a feature flag will be used on the critical path, we will update it in the tenant housekeeping loop insetad of + // resolving directly by calling `evaluate_multivariate` or `evaluate_boolean`. Remember to update the flag in the + // housekeeping loop. The user should directly read this atomic flag instead of using the set of evaluate functions. + pub feature_test_remote_size_flag: AtomicBool, } impl TenantFeatureResolver { @@ -367,7 +377,8 @@ impl TenantFeatureResolver { Self { inner, tenant_id, - cached_tenant_properties: Arc::new(ArcSwap::new(Arc::new(HashMap::new()))), + cached_tenant_properties: ArcSwap::new(Arc::new(HashMap::new())), + feature_test_remote_size_flag: AtomicBool::new(false), } } @@ -396,7 +407,8 @@ impl TenantFeatureResolver { self.inner.is_feature_flag_boolean(flag_key) } - pub fn update_cached_tenant_properties(&self, tenant_shard: &TenantShard) { + /// Refresh the cached properties and flags on the critical path. + pub fn refresh_properties_and_flags(&self, tenant_shard: &TenantShard) { let mut remote_size_mb = None; for timeline in tenant_shard.list_timelines() { let size = timeline.metrics.resident_physical_size_get(); @@ -410,5 +422,12 @@ impl TenantFeatureResolver { self.cached_tenant_properties.store(Arc::new( PerTenantProperties { remote_size_mb }.into_posthog_properties(), )); + + // BEGIN: Update the feature flag on the critical path. + self.feature_test_remote_size_flag.store( + self.evaluate_boolean("test-remote-size-flag").is_ok(), + std::sync::atomic::Ordering::Relaxed, + ); + // END: Update the feature flag on the critical path. } } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 49b92915da..96ed4672a6 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -388,7 +388,7 @@ pub struct TenantShard { l0_flush_global_state: L0FlushGlobalState, - pub(crate) feature_resolver: TenantFeatureResolver, + pub(crate) feature_resolver: Arc, } impl std::fmt::Debug for TenantShard { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { @@ -3411,7 +3411,7 @@ impl TenantShard { } // Update the feature resolver with the latest tenant-spcific data. - self.feature_resolver.update_cached_tenant_properties(self); + self.feature_resolver.refresh_properties_and_flags(self); } pub fn timeline_has_no_attached_children(&self, timeline_id: TimelineId) -> bool { @@ -4500,10 +4500,10 @@ impl TenantShard { gc_block: Default::default(), l0_flush_global_state, basebackup_cache, - feature_resolver: TenantFeatureResolver::new( + feature_resolver: Arc::new(TenantFeatureResolver::new( feature_resolver, tenant_shard_id.tenant_id, - ), + )), } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 296b922599..44a4f1e911 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -201,7 +201,7 @@ pub struct TimelineResources { pub l0_compaction_trigger: Arc, pub l0_flush_global_state: l0_flush::L0FlushGlobalState, pub basebackup_cache: Arc, - pub feature_resolver: TenantFeatureResolver, + pub feature_resolver: Arc, } pub struct Timeline { @@ -449,7 +449,7 @@ pub struct Timeline { /// A channel to send async requests to prepare a basebackup for the basebackup cache. basebackup_cache: Arc, - feature_resolver: TenantFeatureResolver, + feature_resolver: Arc, } pub(crate) enum PreviousHeatmap { @@ -3122,7 +3122,7 @@ impl Timeline { basebackup_cache: resources.basebackup_cache, - feature_resolver: resources.feature_resolver, + feature_resolver: resources.feature_resolver.clone(), }; result.repartition_threshold = From 81e7218c278288cd8e95ed3f50f4e4a423391dc8 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 8 Jul 2025 20:15:13 +0200 Subject: [PATCH 09/28] pageserver: tighten up gRPC `page_api::Client` (#12396) This patch tightens up `page_api::Client`. It's mostly superficial changes, but also adds a new constructor that takes an existing gRPC channel, for use with the communicator connection pool. --- compute_tools/src/compute.rs | 2 +- compute_tools/src/lsn_lease.rs | 2 +- pageserver/page_api/src/client.rs | 327 +++++++++--------- pageserver/pagebench/src/cmd/basebackup.rs | 2 +- .../pagebench/src/cmd/getpage_latest_lsn.rs | 2 +- 5 files changed, 159 insertions(+), 176 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index e870cecc58..f25aff1110 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1057,7 +1057,7 @@ impl ComputeNode { }; let (reader, connected) = tokio::runtime::Handle::current().block_on(async move { - let mut client = page_api::Client::new( + let mut client = page_api::Client::connect( shard0_connstr, spec.tenant_id, spec.timeline_id, diff --git a/compute_tools/src/lsn_lease.rs b/compute_tools/src/lsn_lease.rs index 3346c18c0d..bb0828429d 100644 --- a/compute_tools/src/lsn_lease.rs +++ b/compute_tools/src/lsn_lease.rs @@ -192,7 +192,7 @@ fn acquire_lsn_lease_grpc( lsn: Lsn, ) -> Result> { tokio::runtime::Handle::current().block_on(async move { - let mut client = page_api::Client::new( + let mut client = page_api::Client::connect( connstring.to_string(), tenant_shard_id.tenant_id, timeline_id, diff --git a/pageserver/page_api/src/client.rs b/pageserver/page_api/src/client.rs index 65e41540b8..6523d00d3d 100644 --- a/pageserver/page_api/src/client.rs +++ b/pageserver/page_api/src/client.rs @@ -1,23 +1,151 @@ -use anyhow::Result; +use anyhow::Context as _; use futures::{Stream, StreamExt as _, TryStreamExt as _}; use tokio::io::AsyncRead; use tokio_util::io::StreamReader; +use tonic::codec::CompressionEncoding; use tonic::metadata::AsciiMetadataValue; -use tonic::metadata::errors::InvalidMetadataValue; -use tonic::transport::Channel; -use tonic::{Request, Streaming}; +use tonic::service::Interceptor; +use tonic::service::interceptor::InterceptedService; +use tonic::transport::{Channel, Endpoint}; -use utils::id::TenantId; -use utils::id::TimelineId; +use utils::id::{TenantId, TimelineId}; use utils::shard::ShardIndex; -use crate::model; +use crate::model::*; use crate::proto; -/// -/// AuthInterceptor adds tenant, timeline, and auth header to the channel. These -/// headers are required at the pageserver. -/// +/// A basic Pageserver gRPC client, for a single tenant shard. This API uses native Rust domain +/// types from `model` rather than generated Protobuf types. +pub struct Client { + inner: proto::PageServiceClient>, +} + +impl Client { + /// Connects to the given gRPC endpoint. + pub async fn connect( + endpoint: E, + tenant_id: TenantId, + timeline_id: TimelineId, + shard_id: ShardIndex, + auth_token: Option, + compression: Option, + ) -> anyhow::Result + where + E: TryInto + Send + Sync + 'static, + >::Error: std::error::Error + Send + Sync, + { + let endpoint: Endpoint = endpoint.try_into().context("invalid endpoint")?; + let channel = endpoint.connect().await?; + Self::new( + channel, + tenant_id, + timeline_id, + shard_id, + auth_token, + compression, + ) + } + + /// Creates a new client using the given gRPC channel. + pub fn new( + channel: Channel, + tenant_id: TenantId, + timeline_id: TimelineId, + shard_id: ShardIndex, + auth_token: Option, + compression: Option, + ) -> anyhow::Result { + let auth = AuthInterceptor::new(tenant_id, timeline_id, shard_id, auth_token)?; + let mut inner = proto::PageServiceClient::with_interceptor(channel, auth); + + if let Some(compression) = compression { + // TODO: benchmark this (including network latency). + inner = inner + .accept_compressed(compression) + .send_compressed(compression); + } + + Ok(Self { inner }) + } + + /// Returns whether a relation exists. + pub async fn check_rel_exists( + &mut self, + req: CheckRelExistsRequest, + ) -> tonic::Result { + let req = proto::CheckRelExistsRequest::from(req); + let resp = self.inner.check_rel_exists(req).await?.into_inner(); + Ok(resp.into()) + } + + /// Fetches a base backup. + pub async fn get_base_backup( + &mut self, + req: GetBaseBackupRequest, + ) -> tonic::Result> { + let req = proto::GetBaseBackupRequest::from(req); + let chunks = self.inner.get_base_backup(req).await?.into_inner(); + Ok(StreamReader::new( + chunks + .map_ok(|resp| resp.chunk) + .map_err(std::io::Error::other), + )) + } + + /// Returns the total size of a database, as # of bytes. + pub async fn get_db_size(&mut self, req: GetDbSizeRequest) -> tonic::Result { + let req = proto::GetDbSizeRequest::from(req); + let resp = self.inner.get_db_size(req).await?.into_inner(); + Ok(resp.into()) + } + + /// Fetches pages. + /// + /// This is implemented as a bidirectional streaming RPC for performance. Per-request errors are + /// typically returned as status_code instead of errors, to avoid tearing down the entire stream + /// via a tonic::Status error. + pub async fn get_pages( + &mut self, + reqs: impl Stream + Send + 'static, + ) -> tonic::Result> + Send + 'static> { + let reqs = reqs.map(proto::GetPageRequest::from); + let resps = self.inner.get_pages(reqs).await?.into_inner(); + Ok(resps.map_ok(GetPageResponse::from)) + } + + /// Returns the size of a relation, as # of blocks. + pub async fn get_rel_size( + &mut self, + req: GetRelSizeRequest, + ) -> tonic::Result { + let req = proto::GetRelSizeRequest::from(req); + let resp = self.inner.get_rel_size(req).await?.into_inner(); + Ok(resp.into()) + } + + /// Fetches an SLRU segment. + pub async fn get_slru_segment( + &mut self, + req: GetSlruSegmentRequest, + ) -> tonic::Result { + let req = proto::GetSlruSegmentRequest::from(req); + let resp = self.inner.get_slru_segment(req).await?.into_inner(); + Ok(resp.try_into()?) + } + + /// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't + /// garbage collect the LSN until the lease expires. Must be acquired on all relevant shards. + /// + /// Returns the lease expiration time, or a FailedPrecondition status if the lease could not be + /// acquired because the LSN has already been garbage collected. + pub async fn lease_lsn(&mut self, req: LeaseLsnRequest) -> tonic::Result { + let req = proto::LeaseLsnRequest::from(req); + let resp = self.inner.lease_lsn(req).await?.into_inner(); + Ok(resp.try_into()?) + } +} + +/// Adds authentication metadata to gRPC requests. #[derive(Clone)] struct AuthInterceptor { tenant_id: AsciiMetadataValue, @@ -30,174 +158,29 @@ impl AuthInterceptor { fn new( tenant_id: TenantId, timeline_id: TimelineId, - auth_token: Option, shard_id: ShardIndex, - ) -> Result { - let tenant_ascii: AsciiMetadataValue = tenant_id.to_string().try_into()?; - let timeline_ascii: AsciiMetadataValue = timeline_id.to_string().try_into()?; - let shard_ascii: AsciiMetadataValue = shard_id.to_string().try_into()?; - - let auth_header: Option = match auth_token { - Some(token) => Some(format!("Bearer {token}").try_into()?), - None => None, - }; - + auth_token: Option, + ) -> anyhow::Result { Ok(Self { - tenant_id: tenant_ascii, - shard_id: shard_ascii, - timeline_id: timeline_ascii, - auth_header, + tenant_id: tenant_id.to_string().try_into()?, + timeline_id: timeline_id.to_string().try_into()?, + shard_id: shard_id.to_string().try_into()?, + auth_header: auth_token + .map(|token| format!("Bearer {token}").try_into()) + .transpose()?, }) } } -impl tonic::service::Interceptor for AuthInterceptor { - fn call(&mut self, mut req: tonic::Request<()>) -> Result, tonic::Status> { - req.metadata_mut() - .insert("neon-tenant-id", self.tenant_id.clone()); - req.metadata_mut() - .insert("neon-shard-id", self.shard_id.clone()); - req.metadata_mut() - .insert("neon-timeline-id", self.timeline_id.clone()); - if let Some(auth_header) = &self.auth_header { - req.metadata_mut() - .insert("authorization", auth_header.clone()); +impl Interceptor for AuthInterceptor { + fn call(&mut self, mut req: tonic::Request<()>) -> tonic::Result> { + let metadata = req.metadata_mut(); + metadata.insert("neon-tenant-id", self.tenant_id.clone()); + metadata.insert("neon-timeline-id", self.timeline_id.clone()); + metadata.insert("neon-shard-id", self.shard_id.clone()); + if let Some(ref auth_header) = self.auth_header { + metadata.insert("authorization", auth_header.clone()); } Ok(req) } } - -#[derive(Clone)] -pub struct Client { - client: proto::PageServiceClient< - tonic::service::interceptor::InterceptedService, - >, -} - -impl Client { - pub async fn new + Send + Sync + 'static>( - into_endpoint: T, - tenant_id: TenantId, - timeline_id: TimelineId, - shard_id: ShardIndex, - auth_header: Option, - compression: Option, - ) -> anyhow::Result { - let endpoint: tonic::transport::Endpoint = into_endpoint - .try_into() - .map_err(|_e| anyhow::anyhow!("failed to convert endpoint"))?; - let channel = endpoint.connect().await?; - let auth = AuthInterceptor::new(tenant_id, timeline_id, auth_header, shard_id) - .map_err(|e| anyhow::anyhow!(e.to_string()))?; - let mut client = proto::PageServiceClient::with_interceptor(channel, auth); - - if let Some(compression) = compression { - // TODO: benchmark this (including network latency). - client = client - .accept_compressed(compression) - .send_compressed(compression); - } - - Ok(Self { client }) - } - - /// Returns whether a relation exists. - pub async fn check_rel_exists( - &mut self, - req: model::CheckRelExistsRequest, - ) -> Result { - let proto_req = proto::CheckRelExistsRequest::from(req); - - let response = self.client.check_rel_exists(proto_req).await?; - - let proto_resp = response.into_inner(); - Ok(proto_resp.into()) - } - - /// Fetches a base backup. - pub async fn get_base_backup( - &mut self, - req: model::GetBaseBackupRequest, - ) -> Result, tonic::Status> { - let req = proto::GetBaseBackupRequest::from(req); - let chunks = self.client.get_base_backup(req).await?.into_inner(); - let reader = StreamReader::new( - chunks - .map_ok(|resp| resp.chunk) - .map_err(std::io::Error::other), - ); - Ok(reader) - } - - /// Returns the total size of a database, as # of bytes. - pub async fn get_db_size( - &mut self, - req: model::GetDbSizeRequest, - ) -> Result { - let proto_req = proto::GetDbSizeRequest::from(req); - - let response = self.client.get_db_size(proto_req).await?; - Ok(response.into_inner().into()) - } - - /// Fetches pages. - /// - /// This is implemented as a bidirectional streaming RPC for performance. - /// Per-request errors are often returned as status_code instead of errors, - /// to avoid tearing down the entire stream via tonic::Status. - pub async fn get_pages( - &mut self, - inbound: ReqSt, - ) -> Result< - impl Stream> + Send + 'static, - tonic::Status, - > - where - ReqSt: Stream + Send + 'static, - { - let outbound_proto = inbound.map(|domain_req| domain_req.into()); - - let req_new = Request::new(outbound_proto); - - let response_stream: Streaming = - self.client.get_pages(req_new).await?.into_inner(); - - let domain_stream = response_stream.map_ok(model::GetPageResponse::from); - - Ok(domain_stream) - } - - /// Returns the size of a relation, as # of blocks. - pub async fn get_rel_size( - &mut self, - req: model::GetRelSizeRequest, - ) -> Result { - let proto_req = proto::GetRelSizeRequest::from(req); - let response = self.client.get_rel_size(proto_req).await?; - let proto_resp = response.into_inner(); - Ok(proto_resp.into()) - } - - /// Fetches an SLRU segment. - pub async fn get_slru_segment( - &mut self, - req: model::GetSlruSegmentRequest, - ) -> Result { - let proto_req = proto::GetSlruSegmentRequest::from(req); - let response = self.client.get_slru_segment(proto_req).await?; - Ok(response.into_inner().try_into()?) - } - - /// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't - /// garbage collect the LSN until the lease expires. Must be acquired on all relevant shards. - /// - /// Returns the lease expiration time, or a FailedPrecondition status if the lease could not be - /// acquired because the LSN has already been garbage collected. - pub async fn lease_lsn( - &mut self, - req: model::LeaseLsnRequest, - ) -> Result { - let req = proto::LeaseLsnRequest::from(req); - Ok(self.client.lease_lsn(req).await?.into_inner().try_into()?) - } -} diff --git a/pageserver/pagebench/src/cmd/basebackup.rs b/pageserver/pagebench/src/cmd/basebackup.rs index 4b7a70504a..c14bb73136 100644 --- a/pageserver/pagebench/src/cmd/basebackup.rs +++ b/pageserver/pagebench/src/cmd/basebackup.rs @@ -326,7 +326,7 @@ impl GrpcClient { ttid: TenantTimelineId, compression: bool, ) -> anyhow::Result { - let inner = page_api::Client::new( + let inner = page_api::Client::connect( connstring.to_string(), ttid.tenant_id, ttid.timeline_id, diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index a297819e9b..f14caf548c 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -625,7 +625,7 @@ impl GrpcClient { ttid: TenantTimelineId, compression: bool, ) -> anyhow::Result { - let mut client = page_api::Client::new( + let mut client = page_api::Client::connect( connstring.to_string(), ttid.tenant_id, ttid.timeline_id, From 3dad4698ec134e5e5491fa7c9c5195041216e248 Mon Sep 17 00:00:00 2001 From: HaoyuHuang Date: Tue, 8 Jul 2025 12:43:01 -0700 Subject: [PATCH 10/28] PS changes #1 (#12467) # TLDR All changes are no-op except 1. publishing additional metrics. 2. problem VI ## Problem I It has come to my attention that the Neon Storage Controller doesn't correctly update its "observed" state of tenants previously associated with PSs that has come back up after a local data loss. It would still think that the old tenants are still attached to page servers and won't ask more questions. The pageserver has enough information from the reattach request/response to tell that something is wrong, but it doesn't do anything about it either. We need to detect this situation in production while I work on a fix. (I think there is just some misunderstanding about how Neon manages their pageserver deployments which got me confused about all the invariants.) ## Summary of changes I Added a `pageserver_local_data_loss_suspected` gauge metric that will be set to 1 if we detect a problematic situation from the reattch response. The problematic situation is when the PS doesn't have any local tenants but received a reattach response containing tenants. We can set up an alert using this metric. The alert should be raised whenever this metric reports non-zero number. Also added a HTTP PUT `http://pageserver/hadron-internal/reset_alert_gauges` API on the pageserver that can be used to reset the gauge and the alert once we manually rectify the situation (by restarting the HCC). ## Problem II Azure upload is 3x slower than AWS. -> 3x slower ingestion. The reason for the slower upload is that Azure upload in page server is much slower => higher flush latency => higher disk consistent LSN => higher back pressure. ## Summary of changes II Use Azure put_block API to uploads a 1 GB layer file in 8 blocks in parallel. I set the put_block block size to be 128 MB by default in azure config. To minimize neon changes, upload function passes the layer file path to the azure upload code through the storage metadata. This allows the azure put block to use FileChunkStreamRead to stream read from one partition in the file instead of loading all file data in memory and split it into 8 128 MB chunks. ## How is this tested? II 1. rust test_real_azure tests the put_block change. 3. I deployed the change in azure dev and saw flush latency reduces from ~30 seconds to 10 seconds. 4. I also did a bunch of stress test using sqlsmith and 100 GB TPCDS runs. ## Problem III Currently Neon limits the compaction tasks as 3/4 * CPU cores. This limits the overall compaction throughput and it can easily cause head-of-the-line blocking problems when a few large tenants are compacting. ## Summary of changes III This PR increases the limit of compaction tasks as `BG_TASKS_PER_THREAD` (default 4) * CPU cores. Note that `CONCURRENT_BACKGROUND_TASKS` also limits some other tasks `logical_size_calculation` and `layer eviction` . But compaction should be the most frequent and time-consuming task. ## Summary of changes IV This PR adds the following PageServer metrics: 1. `pageserver_disk_usage_based_eviction_evicted_bytes_total`: captures the total amount of bytes evicted. It's more straightforward to see the bytes directly instead of layers. 2. `pageserver_active_storage_operations_count`: captures the active storage operation, e.g., flush, L0 compaction, image creation etc. It's useful to visualize these active operations to get a better idea of what PageServers are spending cycles on in the background. ## Summary of changes V When investigating data corruptions, it's useful to search the base image and all WAL records of a page up to an LSN, i.e., a breakdown of GetPage@LSN request. This PR implements this functionality with two tools: 1. Extended `pagectl` with a new command to search the layer files for a given key up to a given LSN from the `index_part.json` file. The output can be used to download the files from S3 and then search the file contents using the second tool. Example usage: ``` cargo run --bin pagectl index-part search --tenant-id 09b99ea3239bbb3b2d883a59f087659d --timeline-id 7bedf4a6995baff7c0421ff9aebbcdab --path ~/Downloads/corruption/index_part.json-0000000c-formatted --key 000000067F000080140000802100000D61BD --lsn 70C/BF3D61D8 ``` Example output: ``` tenants/09b99ea3239bbb3b2d883a59f087659d-0304/timelines/7bedf4a6995baff7c0421ff9aebbcdab/000000067F0000801400000B180000000002-000000067F0000801400008028000002FEFF__000007089F0B5381-0000070C7679EEB9-0000000c tenants/09b99ea3239bbb3b2d883a59f087659d-0304/timelines/7bedf4a6995baff7c0421ff9aebbcdab/000000000000000000000000000000000000-000000067F0000801400008028000002F3F1__000006DD95B6F609-000006E2BA14C369-0000000c tenants/09b99ea3239bbb3b2d883a59f087659d-0304/timelines/7bedf4a6995baff7c0421ff9aebbcdab/000000067F0000801400000B180000000002-000000067F000080140000802100001B0973__000006D33429F539-000006DD95B6F609-0000000c tenants/09b99ea3239bbb3b2d883a59f087659d-0304/timelines/7bedf4a6995baff7c0421ff9aebbcdab/000000067F0000801400000B180000000002-000000067F00008014000080210000164D81__000006C6343B2D31-000006D33429F539-0000000b tenants/09b99ea3239bbb3b2d883a59f087659d-0304/timelines/7bedf4a6995baff7c0421ff9aebbcdab/000000067F0000801400000B180000000002-000000067F0000801400008021000017687B__000006BA344FA7F1-000006C6343B2D31-0000000b tenants/09b99ea3239bbb3b2d883a59f087659d-0304/timelines/7bedf4a6995baff7c0421ff9aebbcdab/000000067F0000801400000B180000000002-000000067F00008014000080210000165BAB__000006AD34613D19-000006BA344FA7F1-0000000b tenants/09b99ea3239bbb3b2d883a59f087659d-0304/timelines/7bedf4a6995baff7c0421ff9aebbcdab/000000067F0000801400000B180000000002-000000067F00008014000080210000137A39__0000069F34773461-000006AD34613D19-0000000b tenants/09b99ea3239bbb3b2d883a59f087659d-0304/timelines/7bedf4a6995baff7c0421ff9aebbcdab/000000067F000080140000802100000D4000-000000067F000080140000802100000F0000__0000069F34773460-0000000b ``` 2. Added a unit test to search the layer file contents. It's not implemented part of `pagectl` because it depends on some test harness code, which can only be used by unit tests. Example usage: ``` cargo test --package pageserver --lib -- tenant::debug::test_search_key --exact --nocapture -- --tenant-id 09b99ea3239bbb3b2d883a59f087659d --timeline-id 7bedf4a6995baff7c0421ff9aebbcdab --data-dir /Users/chen.luo/Downloads/corruption --key 000000067F000080140000802100000D61BD --lsn 70C/BF3D61D8 ``` Example output: ``` # omitted image for brievity delta: 69F/769D8180: will_init: false, "OgAAALGkuwXwYp12nwYAAECGAAASIqLHAAAAAH8GAAAUgAAAIYAAAL1hDQD/DLGkuwUDAAAAEAAWAA==" delta: 69F/769CB6D8: will_init: false, "PQAAALGkuwXotZx2nwYAABAJAAAFk7tpACAGAH8GAAAUgAAAIYAAAL1hDQD/CQUAEAASALExuwUBAAAAAA==" ``` ## Problem VI Currently when page service resolves shards from page numbers, it doesn't fully support the case that the shard could be split in the middle. This will lead to query failures during the tenant split for either commit or abort cases (it's mostly for abort). ## Summary of changes VI This PR adds retry logic in `Cache::get()` to deal with shard resolution errors more gracefully. Specifically, it'll clear the cache and retry, instead of failing the query immediately. It also reduces the internal timeout to make retries faster. The PR also fixes a very obvious bug in `TenantManager::resolve_attached_shard` where the code tries to cache the computed the shard number, but forgot to recompute when the shard count is different. --------- Co-authored-by: William Huang Co-authored-by: Haoyu Huang Co-authored-by: Chen Luo Co-authored-by: Vlad Lazar Co-authored-by: Vlad Lazar --- Cargo.lock | 3 + libs/remote_storage/Cargo.toml | 3 + libs/remote_storage/src/azure_blob.rs | 143 ++++++- libs/remote_storage/src/config.rs | 18 + libs/remote_storage/tests/common/mod.rs | 34 +- libs/remote_storage/tests/test_real_azure.rs | 3 + pageserver/Cargo.toml | 1 + pageserver/ctl/src/index_part.rs | 102 ++++- pageserver/src/disk_usage_eviction_task.rs | 3 + pageserver/src/http/routes.rs | 15 + pageserver/src/metrics.rs | 91 ++++- pageserver/src/page_service.rs | 6 +- pageserver/src/tenant.rs | 16 +- pageserver/src/tenant/debug.rs | 366 ++++++++++++++++++ pageserver/src/tenant/mgr.rs | 17 +- .../tenant/remote_timeline_client/upload.rs | 22 +- pageserver/src/tenant/tasks.rs | 15 + pageserver/src/tenant/timeline.rs | 2 +- pageserver/src/tenant/timeline/handle.rs | 49 ++- pageserver/src/walredo.rs | 70 ++-- test_runner/fixtures/metrics.py | 3 + .../fixtures/pageserver/allowed_errors.py | 8 + test_runner/regress/test_sharding.py | 168 ++++++++ 23 files changed, 1097 insertions(+), 61 deletions(-) create mode 100644 pageserver/src/tenant/debug.rs diff --git a/Cargo.lock b/Cargo.lock index 237defaec3..39c43d94a3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4339,6 +4339,7 @@ dependencies = [ "arc-swap", "async-compression", "async-stream", + "base64 0.22.1", "bincode", "bit_field", "byteorder", @@ -5684,6 +5685,8 @@ dependencies = [ "azure_identity", "azure_storage", "azure_storage_blobs", + "base64 0.22.1", + "byteorder", "bytes", "camino", "camino-tempfile", diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index bd18d80915..69316fd493 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -13,6 +13,7 @@ aws-smithy-async.workspace = true aws-smithy-types.workspace = true aws-config.workspace = true aws-sdk-s3.workspace = true +base64.workspace = true bytes.workspace = true camino = { workspace = true, features = ["serde1"] } humantime-serde.workspace = true @@ -41,6 +42,8 @@ http-body-util.workspace = true itertools.workspace = true sync_wrapper = { workspace = true, features = ["futures"] } +byteorder = "1.4" + [dev-dependencies] camino-tempfile.workspace = true test-context.workspace = true diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index e9c24ac723..db30829216 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -14,17 +14,25 @@ use anyhow::{Context, Result, anyhow}; use azure_core::request_options::{IfMatchCondition, MaxResults, Metadata, Range}; use azure_core::{Continuable, HttpClient, RetryOptions, TransportOptions}; use azure_storage::StorageCredentials; -use azure_storage_blobs::blob::operations::GetBlobBuilder; +use azure_storage_blobs::blob::BlobBlockType; +use azure_storage_blobs::blob::BlockList; use azure_storage_blobs::blob::{Blob, CopyStatus}; use azure_storage_blobs::container::operations::ListBlobsBuilder; -use azure_storage_blobs::prelude::{ClientBuilder, ContainerClient}; +use azure_storage_blobs::prelude::ClientBuilder; +use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient}; +use base64::{Engine as _, engine::general_purpose::URL_SAFE}; +use byteorder::{BigEndian, ByteOrder}; use bytes::Bytes; +use camino::Utf8Path; use futures::FutureExt; use futures::future::Either; use futures::stream::Stream; use futures_util::{StreamExt, TryStreamExt}; use http_types::{StatusCode, Url}; use scopeguard::ScopeGuard; +use tokio::fs::File; +use tokio::io::AsyncReadExt; +use tokio::io::AsyncSeekExt; use tokio_util::sync::CancellationToken; use tracing::debug; use utils::backoff; @@ -51,6 +59,9 @@ pub struct AzureBlobStorage { // Alternative timeout used for metadata objects which are expected to be small pub small_timeout: Duration, + /* BEGIN_HADRON */ + pub put_block_size_mb: Option, + /* END_HADRON */ } impl AzureBlobStorage { @@ -107,6 +118,9 @@ impl AzureBlobStorage { concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()), timeout, small_timeout, + /* BEGIN_HADRON */ + put_block_size_mb: azure_config.put_block_size_mb, + /* END_HADRON */ }) } @@ -583,31 +597,137 @@ impl RemoteStorage for AzureBlobStorage { let started_at = start_measuring_requests(kind); - let op = async { + let mut metadata_map = metadata.unwrap_or([].into()); + let timeline_file_path = metadata_map.0.remove("databricks_azure_put_block"); + + /* BEGIN_HADRON */ + let op = async move { let blob_client = self.client.blob_client(self.relative_path_to_name(to)); + let put_block_size = self.put_block_size_mb.unwrap_or(0) * 1024 * 1024; + if timeline_file_path.is_none() || put_block_size == 0 { + // Use put_block_blob directly. + let from: Pin< + Box> + Send + Sync + 'static>, + > = Box::pin(from); + let from = NonSeekableStream::new(from, data_size_bytes); + let body = azure_core::Body::SeekableStream(Box::new(from)); - let from: Pin> + Send + Sync + 'static>> = - Box::pin(from); + let mut builder = blob_client.put_block_blob(body); + if !metadata_map.0.is_empty() { + builder = builder.metadata(to_azure_metadata(metadata_map)); + } + let fut = builder.into_future(); + let fut = tokio::time::timeout(self.timeout, fut); + let result = fut.await; + match result { + Ok(Ok(_response)) => return Ok(()), + Ok(Err(azure)) => return Err(azure.into()), + Err(_timeout) => return Err(TimeoutOrCancel::Timeout.into()), + }; + } + // Upload chunks concurrently using Put Block. + // Each PutBlock uploads put_block_size bytes of the file. + let mut upload_futures: Vec>> = + vec![]; + let mut block_list = BlockList::default(); + let mut start_bytes = 0u64; + let mut remaining_bytes = data_size_bytes; + let mut block_list_count = 0; - let from = NonSeekableStream::new(from, data_size_bytes); + while remaining_bytes > 0 { + let block_size = std::cmp::min(remaining_bytes, put_block_size); + let end_bytes = start_bytes + block_size as u64; + let block_id = block_list_count; + let timeout = self.timeout; + let blob_client = blob_client.clone(); + let timeline_file = timeline_file_path.clone().unwrap().clone(); - let body = azure_core::Body::SeekableStream(Box::new(from)); + let mut encoded_block_id = [0u8; 8]; + BigEndian::write_u64(&mut encoded_block_id, block_id); + URL_SAFE.encode(encoded_block_id); - let mut builder = blob_client.put_block_blob(body); + // Put one block. + let part_fut = async move { + let mut file = File::open(Utf8Path::new(&timeline_file.clone())).await?; + file.seek(io::SeekFrom::Start(start_bytes)).await?; + let limited_reader = file.take(block_size as u64); + let file_chunk_stream = + tokio_util::io::ReaderStream::with_capacity(limited_reader, 1024 * 1024); + let file_chunk_stream_pin: Pin< + Box> + Send + Sync + 'static>, + > = Box::pin(file_chunk_stream); + let stream_wrapper = NonSeekableStream::new(file_chunk_stream_pin, block_size); + let body = azure_core::Body::SeekableStream(Box::new(stream_wrapper)); + // Azure put block takes URL-encoded block ids and all blocks must have the same byte length. + // https://learn.microsoft.com/en-us/rest/api/storageservices/put-block?tabs=microsoft-entra-id#uri-parameters + let builder = blob_client.put_block(encoded_block_id.to_vec(), body); + let fut = builder.into_future(); + let fut = tokio::time::timeout(timeout, fut); + let result = fut.await; + tracing::debug!( + "azure put block id-{} size {} start {} end {} file {} response {:#?}", + block_id, + block_size, + start_bytes, + end_bytes, + timeline_file, + result + ); + match result { + Ok(Ok(_response)) => Ok(()), + Ok(Err(azure)) => Err(azure), + Err(_timeout) => Err(azure_core::Error::new( + azure_core::error::ErrorKind::Io, + std::io::Error::new( + std::io::ErrorKind::TimedOut, + "Operation timed out", + ), + )), + } + }; + upload_futures.push(tokio::spawn(part_fut)); - if let Some(metadata) = metadata { - builder = builder.metadata(to_azure_metadata(metadata)); + block_list_count += 1; + remaining_bytes -= block_size; + start_bytes += block_size as u64; + + block_list + .blocks + .push(BlobBlockType::Uncommitted(encoded_block_id.to_vec().into())); } + tracing::debug!( + "azure put blocks {} total MB: {} chunk size MB: {}", + block_list_count, + data_size_bytes / 1024 / 1024, + put_block_size / 1024 / 1024 + ); + // Wait for all blocks to be uploaded. + let upload_results = futures::future::try_join_all(upload_futures).await; + if upload_results.is_err() { + return Err(anyhow::anyhow!(format!( + "Failed to upload all blocks {:#?}", + upload_results.unwrap_err() + ))); + } + + // Commit the blocks. + let mut builder = blob_client.put_block_list(block_list); + if !metadata_map.0.is_empty() { + builder = builder.metadata(to_azure_metadata(metadata_map)); + } let fut = builder.into_future(); let fut = tokio::time::timeout(self.timeout, fut); + let result = fut.await; + tracing::debug!("azure put block list response {:#?}", result); - match fut.await { + match result { Ok(Ok(_response)) => Ok(()), Ok(Err(azure)) => Err(azure.into()), Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()), } }; + /* END_HADRON */ let res = tokio::select! { res = op => res, @@ -622,7 +742,6 @@ impl RemoteStorage for AzureBlobStorage { crate::metrics::BUCKET_METRICS .req_seconds .observe_elapsed(kind, outcome, started_at); - res } diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index 5bc1f678ae..e13e17d544 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -195,8 +195,19 @@ pub struct AzureConfig { pub max_keys_per_list_response: Option, #[serde(default = "default_azure_conn_pool_size")] pub conn_pool_size: usize, + /* BEGIN_HADRON */ + #[serde(default = "default_azure_put_block_size_mb")] + pub put_block_size_mb: Option, + /* END_HADRON */ } +/* BEGIN_HADRON */ +fn default_azure_put_block_size_mb() -> Option { + // Disable parallel upload by default. + Some(0) +} +/* END_HADRON */ + fn default_remote_storage_azure_concurrency_limit() -> NonZeroUsize { NonZeroUsize::new(DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT).unwrap() } @@ -213,6 +224,9 @@ impl Debug for AzureConfig { "max_keys_per_list_response", &self.max_keys_per_list_response, ) + /* BEGIN_HADRON */ + .field("put_block_size_mb", &self.put_block_size_mb) + /* END_HADRON */ .finish() } } @@ -352,6 +366,7 @@ timeout = '5s'"; upload_storage_class = 'INTELLIGENT_TIERING' timeout = '7s' conn_pool_size = 8 + put_block_size_mb = 1024 "; let config = parse(toml).unwrap(); @@ -367,6 +382,9 @@ timeout = '5s'"; concurrency_limit: default_remote_storage_azure_concurrency_limit(), max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, conn_pool_size: 8, + /* BEGIN_HADRON */ + put_block_size_mb: Some(1024), + /* END_HADRON */ }), timeout: Duration::from_secs(7), small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT diff --git a/libs/remote_storage/tests/common/mod.rs b/libs/remote_storage/tests/common/mod.rs index daab05d91a..fb7d6fd482 100644 --- a/libs/remote_storage/tests/common/mod.rs +++ b/libs/remote_storage/tests/common/mod.rs @@ -165,10 +165,42 @@ pub(crate) async fn upload_remote_data( let (data, data_len) = upload_stream(format!("remote blob data {i}").into_bytes().into()); + + /* BEGIN_HADRON */ + let mut metadata = None; + if matches!(&*task_client, GenericRemoteStorage::AzureBlob(_)) { + let file_path = "/tmp/dbx_upload_tmp_file.txt"; + { + // Open the file in append mode + let mut file = std::fs::OpenOptions::new() + .append(true) + .create(true) // Create the file if it doesn't exist + .open(file_path)?; + // Append some bytes to the file + std::io::Write::write_all( + &mut file, + &format!("remote blob data {i}").into_bytes(), + )?; + file.sync_all()?; + } + metadata = Some(remote_storage::StorageMetadata::from([( + "databricks_azure_put_block", + file_path, + )])); + } + /* END_HADRON */ + task_client - .upload(data, data_len, &blob_path, None, &cancel) + .upload(data, data_len, &blob_path, metadata, &cancel) .await?; + // TODO: Check upload is using the put_block upload. + // We cannot consume data here since data is moved inside the upload. + // let total_bytes = data.fold(0, |acc, chunk| async move { + // acc + chunk.map(|bytes| bytes.len()).unwrap_or(0) + // }).await; + // assert_eq!(total_bytes, data_len); + Ok::<_, anyhow::Error>((blob_prefix, blob_path)) }); } diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 31c9ca3200..4d7caabd39 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -219,6 +219,9 @@ async fn create_azure_client( concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response, conn_pool_size: 8, + /* BEGIN_HADRON */ + put_block_size_mb: Some(1), + /* END_HADRON */ }), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT, diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 8a2e2ed3be..1fd0dccff0 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -112,6 +112,7 @@ twox-hash.workspace = true procfs.workspace = true [dev-dependencies] +base64.workspace = true criterion.workspace = true hex-literal.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] } diff --git a/pageserver/ctl/src/index_part.rs b/pageserver/ctl/src/index_part.rs index 6cce2844c7..838d00e490 100644 --- a/pageserver/ctl/src/index_part.rs +++ b/pageserver/ctl/src/index_part.rs @@ -1,10 +1,101 @@ +use std::str::FromStr; + use anyhow::Context; use camino::Utf8PathBuf; -use pageserver::tenant::IndexPart; +use pageserver::tenant::{ + IndexPart, + layer_map::{LayerMap, SearchResult}, + remote_timeline_client::remote_layer_path, + storage_layer::{PersistentLayerDesc, ReadableLayerWeak}, +}; +use pageserver_api::key::Key; +use utils::{ + id::{TenantId, TimelineId}, + lsn::Lsn, + shard::TenantShardId, +}; #[derive(clap::Subcommand)] pub(crate) enum IndexPartCmd { - Dump { path: Utf8PathBuf }, + Dump { + path: Utf8PathBuf, + }, + /// Find all layers that need to be searched to construct the given page at the given LSN. + Search { + #[arg(long)] + tenant_id: String, + #[arg(long)] + timeline_id: String, + #[arg(long)] + path: Utf8PathBuf, + #[arg(long)] + key: String, + #[arg(long)] + lsn: String, + }, +} + +async fn search_layers( + tenant_id: &str, + timeline_id: &str, + path: &Utf8PathBuf, + key: &str, + lsn: &str, +) -> anyhow::Result<()> { + let tenant_id = TenantId::from_str(tenant_id).unwrap(); + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + let timeline_id = TimelineId::from_str(timeline_id).unwrap(); + let index_json = { + let bytes = tokio::fs::read(path).await?; + IndexPart::from_json_bytes(&bytes).unwrap() + }; + let mut layer_map = LayerMap::default(); + { + let mut updates = layer_map.batch_update(); + for (key, value) in index_json.layer_metadata.iter() { + updates.insert_historic(PersistentLayerDesc::from_filename( + tenant_shard_id, + timeline_id, + key.clone(), + value.file_size, + )); + } + } + let key = Key::from_hex(key)?; + + let lsn = Lsn::from_str(lsn).unwrap(); + let mut end_lsn = lsn; + loop { + let result = layer_map.search(key, end_lsn); + match result { + Some(SearchResult { layer, lsn_floor }) => { + let disk_layer = match layer { + ReadableLayerWeak::PersistentLayer(layer) => layer, + ReadableLayerWeak::InMemoryLayer(_) => { + anyhow::bail!("unexpected in-memory layer") + } + }; + + let metadata = index_json + .layer_metadata + .get(&disk_layer.layer_name()) + .unwrap(); + println!( + "{}", + remote_layer_path( + &tenant_id, + &timeline_id, + metadata.shard, + &disk_layer.layer_name(), + metadata.generation + ) + ); + end_lsn = lsn_floor; + } + None => break, + } + } + Ok(()) } pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> { @@ -16,5 +107,12 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> { println!("{output}"); Ok(()) } + IndexPartCmd::Search { + tenant_id, + timeline_id, + path, + key, + lsn, + } => search_layers(tenant_id, timeline_id, path, key, lsn).await, } } diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index f13b3709f5..e6529fb201 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -458,6 +458,9 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl( match next { Ok(Ok(file_size)) => { METRICS.layers_evicted.inc(); + /*BEGIN_HADRON */ + METRICS.bytes_evicted.inc_by(file_size); + /*END_HADRON */ usage_assumed.add_available_bytes(file_size); } Ok(Err(( diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 55582659df..0e40dbcd15 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -61,6 +61,7 @@ use crate::context; use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder}; use crate::deletion_queue::DeletionQueueClient; use crate::feature_resolver::FeatureResolver; +use crate::metrics::LOCAL_DATA_LOSS_SUSPECTED; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::LocationConf; @@ -3628,6 +3629,17 @@ async fn activate_post_import_handler( .await } +// [Hadron] Reset gauge metrics that are used to raised alerts. We need this API as a stop-gap measure to reset alerts +// after we manually rectify situations such as local SSD data loss. We will eventually automate this. +async fn hadron_reset_alert_gauges( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + check_permission(&request, None)?; + LOCAL_DATA_LOSS_SUSPECTED.set(0); + json_response(StatusCode::OK, ()) +} + /// Read the end of a tar archive. /// /// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each. @@ -4154,5 +4166,8 @@ pub fn make_router( .post("/v1/feature_flag_spec", |r| { api_handler(r, update_feature_flag_spec) }) + .post("/hadron-internal/reset_alert_gauges", |r| { + api_handler(r, hadron_reset_alert_gauges) + }) .any(handler_404)) } diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 21faceef49..eb89e166b2 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,3 +1,4 @@ +use std::cell::Cell; use std::collections::HashMap; use std::num::NonZeroUsize; use std::os::fd::RawFd; @@ -102,7 +103,18 @@ pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy = Lazy::n .expect("failed to define a metric") }); -// Buckets for background operation duration in seconds, like compaction, GC, size calculation. +/* BEGIN_HADRON */ +pub(crate) static STORAGE_ACTIVE_COUNT_PER_TIMELINE: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "pageserver_active_storage_operations_count", + "Count of active storage operations with operation, tenant and timeline dimensions", + &["operation", "tenant_id", "shard_id", "timeline_id"], + ) + .expect("failed to define a metric") +}); +/*END_HADRON */ + +// Buckets for background operations like compaction, GC, size calculation const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0]; pub(crate) static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { @@ -2810,6 +2822,31 @@ pub(crate) static WALRECEIVER_CANDIDATES_ADDED: Lazy = pub(crate) static WALRECEIVER_CANDIDATES_REMOVED: Lazy = Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"])); +pub(crate) static LOCAL_DATA_LOSS_SUSPECTED: Lazy = Lazy::new(|| { + register_int_gauge!( + "pageserver_local_data_loss_suspected", + "Non-zero value indicates that pageserver local data loss is suspected (and highly likely)." + ) + .expect("failed to define a metric") +}); + +// Counter keeping track of misrouted PageStream requests. Spelling out PageStream requests here to distinguish +// it from other types of reqeusts (SK wal replication, http requests, etc.). PageStream requests are used by +// Postgres compute to fetch data from pageservers. +// A misrouted PageStream request is registered if the pageserver cannot find the tenant identified in the +// request, or if the pageserver is not the "primary" serving the tenant shard. These error almost always identify +// issues with compute configuration, caused by either the compute node itself being stuck in the wrong +// configuration or Storage Controller reconciliation bugs. Misrouted requests are expected during tenant migration +// and/or during recovery following a pageserver failure, but persistently high rates of misrouted requests +// are indicative of bugs (and unavailability). +pub(crate) static MISROUTED_PAGESTREAM_REQUESTS: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_misrouted_pagestream_requests_total", + "Number of pageserver pagestream requests that were routed to the wrong pageserver" + ) + .expect("failed to define a metric") +}); + // Metrics collected on WAL redo operations // // We collect the time spent in actual WAL redo ('redo'), and time waiting @@ -3048,13 +3085,19 @@ pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy = pub(crate) struct StorageTimeMetricsTimer { metrics: StorageTimeMetrics, start: Instant, + stopped: Cell, } impl StorageTimeMetricsTimer { fn new(metrics: StorageTimeMetrics) -> Self { + /*BEGIN_HADRON */ + // record the active operation as the timer starts + metrics.timeline_active_count.inc(); + /*END_HADRON */ Self { metrics, start: Instant::now(), + stopped: Cell::new(false), } } @@ -3070,6 +3113,10 @@ impl StorageTimeMetricsTimer { self.metrics.timeline_sum.inc_by(seconds); self.metrics.timeline_count.inc(); self.metrics.global_histogram.observe(seconds); + /* BEGIN_HADRON*/ + self.stopped.set(true); + self.metrics.timeline_active_count.dec(); + /*END_HADRON */ duration } @@ -3080,6 +3127,16 @@ impl StorageTimeMetricsTimer { } } +/*BEGIN_HADRON */ +impl Drop for StorageTimeMetricsTimer { + fn drop(&mut self) { + if !self.stopped.get() { + self.metrics.timeline_active_count.dec(); + } + } +} +/*END_HADRON */ + pub(crate) struct AlwaysRecordingStorageTimeMetricsTimer(Option); impl Drop for AlwaysRecordingStorageTimeMetricsTimer { @@ -3105,6 +3162,10 @@ pub(crate) struct StorageTimeMetrics { timeline_sum: Counter, /// Number of oeprations, per operation, tenant_id and timeline_id timeline_count: IntCounter, + /*BEGIN_HADRON */ + /// Number of active operations per operation, tenant_id, and timeline_id + timeline_active_count: IntGauge, + /*END_HADRON */ /// Global histogram having only the "operation" label. global_histogram: Histogram, } @@ -3124,6 +3185,11 @@ impl StorageTimeMetrics { let timeline_count = STORAGE_TIME_COUNT_PER_TIMELINE .get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id]) .unwrap(); + /*BEGIN_HADRON */ + let timeline_active_count = STORAGE_ACTIVE_COUNT_PER_TIMELINE + .get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id]) + .unwrap(); + /*END_HADRON */ let global_histogram = STORAGE_TIME_GLOBAL .get_metric_with_label_values(&[operation]) .unwrap(); @@ -3131,6 +3197,7 @@ impl StorageTimeMetrics { StorageTimeMetrics { timeline_sum, timeline_count, + timeline_active_count, global_histogram, } } @@ -3544,6 +3611,14 @@ impl TimelineMetrics { shard_id, timeline_id, ]); + /* BEGIN_HADRON */ + let _ = STORAGE_ACTIVE_COUNT_PER_TIMELINE.remove_label_values(&[ + op, + tenant_id, + shard_id, + timeline_id, + ]); + /*END_HADRON */ } for op in StorageIoSizeOperation::VARIANTS { @@ -4336,6 +4411,9 @@ pub(crate) mod disk_usage_based_eviction { pub(crate) layers_collected: IntCounter, pub(crate) layers_selected: IntCounter, pub(crate) layers_evicted: IntCounter, + /*BEGIN_HADRON */ + pub(crate) bytes_evicted: IntCounter, + /*END_HADRON */ } impl Default for Metrics { @@ -4372,12 +4450,21 @@ pub(crate) mod disk_usage_based_eviction { ) .unwrap(); + /*BEGIN_HADRON */ + let bytes_evicted = register_int_counter!( + "pageserver_disk_usage_based_eviction_evicted_bytes_total", + "Amount of bytes successfully evicted" + ) + .unwrap(); + /*END_HADRON */ + Self { tenant_collection_time, tenant_layer_count, layers_collected, layers_selected, layers_evicted, + bytes_evicted, } } } @@ -4497,6 +4584,7 @@ pub fn preinitialize_metrics( &CIRCUIT_BREAKERS_UNBROKEN, &PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL, &WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS, + &MISROUTED_PAGESTREAM_REQUESTS, ] .into_iter() .for_each(|c| { @@ -4534,6 +4622,7 @@ pub fn preinitialize_metrics( // gauges WALRECEIVER_ACTIVE_MANAGERS.get(); + LOCAL_DATA_LOSS_SUSPECTED.get(); // histograms [ diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 49928a9036..6b614deac8 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -70,7 +70,7 @@ use crate::context::{ }; use crate::metrics::{ self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS, - SmgrOpTimer, TimelineMetrics, + MISROUTED_PAGESTREAM_REQUESTS, SmgrOpTimer, TimelineMetrics, }; use crate::pgdatadir_mapping::{LsnRange, Version}; use crate::span::{ @@ -91,7 +91,8 @@ use crate::{CancellableTask, PERF_TRACE_TARGET, timed_after_cancellation}; /// is not yet in state [`TenantState::Active`]. /// /// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`]. -const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000); +/// HADRON: reduced timeout and we will retry in Cache::get(). +const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000); /// Threshold at which to log slow GetPage requests. const LOG_SLOW_GETPAGE_THRESHOLD: Duration = Duration::from_secs(30); @@ -1128,6 +1129,7 @@ impl PageServerHandler { // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration // and talk to a different pageserver. + MISROUTED_PAGESTREAM_REQUESTS.inc(); return respond_error!( span, PageStreamError::Reconnect( diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 96ed4672a6..b0969a96c1 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -142,6 +142,9 @@ mod gc_block; mod gc_result; pub(crate) mod throttle; +#[cfg(test)] +pub mod debug; + pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline}; pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; @@ -6015,12 +6018,11 @@ pub(crate) mod harness { } #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] - pub(crate) async fn do_try_load( + pub(crate) async fn do_try_load_with_redo( &self, + walredo_mgr: Arc, ctx: &RequestContext, ) -> anyhow::Result> { - let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager)); - let (basebackup_cache, _) = BasebackupCache::new(Utf8PathBuf::new(), None); let tenant = Arc::new(TenantShard::new( @@ -6058,6 +6060,14 @@ pub(crate) mod harness { Ok(tenant) } + pub(crate) async fn do_try_load( + &self, + ctx: &RequestContext, + ) -> anyhow::Result> { + let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager)); + self.do_try_load_with_redo(walredo_mgr, ctx).await + } + pub fn timeline_path(&self, timeline_id: &TimelineId) -> Utf8PathBuf { self.conf.timeline_path(&self.tenant_shard_id, timeline_id) } diff --git a/pageserver/src/tenant/debug.rs b/pageserver/src/tenant/debug.rs new file mode 100644 index 0000000000..604f7f265e --- /dev/null +++ b/pageserver/src/tenant/debug.rs @@ -0,0 +1,366 @@ +use std::{ops::Range, str::FromStr, sync::Arc}; + +use crate::walredo::RedoAttemptType; +use base64::{Engine as _, engine::general_purpose::STANDARD}; +use bytes::{Bytes, BytesMut}; +use camino::Utf8PathBuf; +use clap::Parser; +use itertools::Itertools; +use pageserver_api::{ + key::Key, + keyspace::KeySpace, + shard::{ShardIdentity, ShardStripeSize}, +}; +use postgres_ffi::PgMajorVersion; +use postgres_ffi::{BLCKSZ, page_is_new, page_set_lsn}; +use tracing::Instrument; +use utils::{ + generation::Generation, + id::{TenantId, TimelineId}, + lsn::Lsn, + shard::{ShardCount, ShardIndex, ShardNumber}, +}; +use wal_decoder::models::record::NeonWalRecord; + +use crate::{ + context::{DownloadBehavior, RequestContext}, + task_mgr::TaskKind, + tenant::storage_layer::ValueReconstructState, + walredo::harness::RedoHarness, +}; + +use super::{ + WalRedoManager, WalredoManagerId, + harness::TenantHarness, + remote_timeline_client::LayerFileMetadata, + storage_layer::{AsLayerDesc, IoConcurrency, Layer, LayerName, ValuesReconstructState}, +}; + +fn process_page_image(next_record_lsn: Lsn, is_fpw: bool, img_bytes: Bytes) -> Bytes { + // To match the logic in libs/wal_decoder/src/serialized_batch.rs + let mut new_image: BytesMut = img_bytes.into(); + if is_fpw && !page_is_new(&new_image) { + page_set_lsn(&mut new_image, next_record_lsn); + } + assert_eq!(new_image.len(), BLCKSZ as usize); + new_image.freeze() +} + +async fn redo_wals(input: &str, key: Key) -> anyhow::Result<()> { + let tenant_id = TenantId::generate(); + let timeline_id = TimelineId::generate(); + let redo_harness = RedoHarness::new()?; + let span = redo_harness.span(); + let tenant_conf = pageserver_api::models::TenantConfig { + ..Default::default() + }; + + let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); + let tenant = TenantHarness::create_custom( + "search_key", + tenant_conf, + tenant_id, + ShardIdentity::unsharded(), + Generation::new(1), + ) + .await? + .do_try_load_with_redo( + Arc::new(WalRedoManager::Prod( + WalredoManagerId::next(), + redo_harness.manager, + )), + &ctx, + ) + .await + .unwrap(); + let timeline = tenant + .create_test_timeline(timeline_id, Lsn(0x10), PgMajorVersion::PG16, &ctx) + .await?; + let contents = tokio::fs::read_to_string(input) + .await + .map_err(|e| anyhow::Error::msg(format!("Failed to read input file {input}: {e}"))) + .unwrap(); + let lines = contents.lines(); + let mut last_wal_lsn: Option = None; + let state = { + let mut state = ValueReconstructState::default(); + let mut is_fpw = false; + let mut is_first_line = true; + for line in lines { + if is_first_line { + is_first_line = false; + if line.trim() == "FPW" { + is_fpw = true; + } + continue; // Skip the first line. + } + // Each input line is in the "," format. + let (lsn_str, payload_b64) = line + .split_once(',') + .expect("Invalid input format: expected ','"); + + // Parse the LSN and decode the payload. + let lsn = Lsn::from_str(lsn_str.trim()).expect("Invalid LSN format"); + let bytes = Bytes::from( + STANDARD + .decode(payload_b64.trim()) + .expect("Invalid base64 payload"), + ); + + // The first line is considered the base image, the rest are WAL records. + if state.img.is_none() { + state.img = Some((lsn, process_page_image(lsn, is_fpw, bytes))); + } else { + let wal_record = NeonWalRecord::Postgres { + will_init: false, + rec: bytes, + }; + state.records.push((lsn, wal_record)); + last_wal_lsn.replace(lsn); + } + } + state + }; + + assert!(state.img.is_some(), "No base image found"); + assert!(!state.records.is_empty(), "No WAL records found"); + let result = timeline + .reconstruct_value(key, last_wal_lsn.unwrap(), state, RedoAttemptType::ReadPage) + .instrument(span.clone()) + .await?; + + eprintln!("final image: {:?}", STANDARD.encode(result)); + + Ok(()) +} + +async fn search_key( + tenant_id: TenantId, + timeline_id: TimelineId, + dir: String, + key: Key, + lsn: Lsn, +) -> anyhow::Result<()> { + let shard_index = ShardIndex { + shard_number: ShardNumber(0), + shard_count: ShardCount(4), + }; + + let redo_harness = RedoHarness::new()?; + let span = redo_harness.span(); + let tenant_conf = pageserver_api::models::TenantConfig { + ..Default::default() + }; + let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); + let tenant = TenantHarness::create_custom( + "search_key", + tenant_conf, + tenant_id, + ShardIdentity::new( + shard_index.shard_number, + shard_index.shard_count, + ShardStripeSize(32768), + ) + .unwrap(), + Generation::new(1), + ) + .await? + .do_try_load_with_redo( + Arc::new(WalRedoManager::Prod( + WalredoManagerId::next(), + redo_harness.manager, + )), + &ctx, + ) + .await + .unwrap(); + + let timeline = tenant + .create_test_timeline(timeline_id, Lsn(0x10), PgMajorVersion::PG16, &ctx) + .await?; + + let mut delta_layers: Vec = Vec::new(); + let mut img_layer: Option = Option::None; + let mut dir = tokio::fs::read_dir(dir).await?; + loop { + let entry = dir.next_entry().await?; + if entry.is_none() || !entry.as_ref().unwrap().file_type().await?.is_file() { + break; + } + let path = Utf8PathBuf::from_path_buf(entry.unwrap().path()).unwrap(); + let layer_name = match LayerName::from_str(path.file_name().unwrap()) { + Ok(name) => name, + Err(_) => { + eprintln!("Skipped invalid layer: {path}"); + continue; + } + }; + let layer = Layer::for_resident( + tenant.conf, + &timeline, + path.clone(), + layer_name, + LayerFileMetadata::new( + tokio::fs::metadata(path.clone()).await?.len(), + Generation::new(1), + shard_index, + ), + ); + if layer.layer_desc().is_delta() { + delta_layers.push(layer.into()); + } else if img_layer.is_none() { + img_layer = Some(layer.into()); + } else { + anyhow::bail!("Found multiple image layers"); + } + } + // sort delta layers based on the descending order of LSN + delta_layers.sort_by(|a, b| { + b.layer_desc() + .get_lsn_range() + .start + .cmp(&a.layer_desc().get_lsn_range().start) + }); + + let mut state = ValuesReconstructState::new(IoConcurrency::Sequential); + + let key_space = KeySpace::single(Range { + start: key, + end: key.next(), + }); + let lsn_range = Range { + start: img_layer + .as_ref() + .map_or(Lsn(0x00), |img| img.layer_desc().image_layer_lsn()), + end: lsn, + }; + for delta_layer in delta_layers.iter() { + delta_layer + .get_values_reconstruct_data(key_space.clone(), lsn_range.clone(), &mut state, &ctx) + .await?; + } + + img_layer + .as_ref() + .unwrap() + .get_values_reconstruct_data(key_space.clone(), lsn_range.clone(), &mut state, &ctx) + .await?; + + for (_key, result) in std::mem::take(&mut state.keys) { + let state = result.collect_pending_ios().await?; + if state.img.is_some() { + eprintln!( + "image: {}: {:x?}", + state.img.as_ref().unwrap().0, + STANDARD.encode(state.img.as_ref().unwrap().1.clone()) + ); + } + for delta in state.records.iter() { + match &delta.1 { + NeonWalRecord::Postgres { will_init, rec } => { + eprintln!( + "delta: {}: will_init: {}, {:x?}", + delta.0, + will_init, + STANDARD.encode(rec) + ); + } + _ => { + eprintln!("delta: {}: {:x?}", delta.0, delta.1); + } + } + } + + let result = timeline + .reconstruct_value(key, lsn_range.end, state, RedoAttemptType::ReadPage) + .instrument(span.clone()) + .await?; + eprintln!("final image: {lsn} : {result:?}"); + } + + Ok(()) +} + +/// Redo all WALs against the base image in the input file. Return the base64 encoded final image. +/// Each line in the input file must be in the form "," where: +/// * `` is a PostgreSQL LSN in hexadecimal notation, e.g. `0/16ABCDE`. +/// * `` is the base64‐encoded page image (first line) or WAL record (subsequent lines). +/// +/// The first line provides the base image of a page. The LSN is the LSN of "next record" following +/// the record containing the FPI. For example, if the FPI was extracted from a WAL record occuping +/// [0/1, 0/200) in the WAL stream, the LSN appearing along side the page image here should be 0/200. +/// +/// The subsequent lines are WAL records, ordered from the oldest to the newest. The LSN is the +/// record LSN of the WAL record, not the "next record" LSN. For example, if the WAL record here +/// occupies [0/1, 0/200) in the WAL stream, the LSN appearing along side the WAL record here should +/// be 0/1. +#[derive(Parser)] +struct RedoWalsCmd { + #[clap(long)] + input: String, + #[clap(long)] + key: String, +} + +#[tokio::test] +async fn test_redo_wals() -> anyhow::Result<()> { + let args = std::env::args().collect_vec(); + let pos = args + .iter() + .position(|arg| arg == "--") + .unwrap_or(args.len()); + let slice = &args[pos..args.len()]; + let cmd = match RedoWalsCmd::try_parse_from(slice) { + Ok(cmd) => cmd, + Err(err) => { + eprintln!("{err}"); + return Ok(()); + } + }; + + let key = Key::from_hex(&cmd.key).unwrap(); + redo_wals(&cmd.input, key).await?; + + Ok(()) +} + +/// Search for a page at the given LSN in all layers of the data_dir. +/// Return the base64-encoded image and all WAL records, as well as the final reconstructed image. +#[derive(Parser)] +struct SearchKeyCmd { + #[clap(long)] + tenant_id: String, + #[clap(long)] + timeline_id: String, + #[clap(long)] + data_dir: String, + #[clap(long)] + key: String, + #[clap(long)] + lsn: String, +} + +#[tokio::test] +async fn test_search_key() -> anyhow::Result<()> { + let args = std::env::args().collect_vec(); + let pos = args + .iter() + .position(|arg| arg == "--") + .unwrap_or(args.len()); + let slice = &args[pos..args.len()]; + let cmd = match SearchKeyCmd::try_parse_from(slice) { + Ok(cmd) => cmd, + Err(err) => { + eprintln!("{err}"); + return Ok(()); + } + }; + + let tenant_id = TenantId::from_str(&cmd.tenant_id).unwrap(); + let timeline_id = TimelineId::from_str(&cmd.timeline_id).unwrap(); + let key = Key::from_hex(&cmd.key).unwrap(); + let lsn = Lsn::from_str(&cmd.lsn).unwrap(); + search_key(tenant_id, timeline_id, cmd.data_dir, key, lsn).await?; + + Ok(()) +} diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index be18b40862..15853d3614 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -43,7 +43,7 @@ use crate::controller_upcall_client::{ }; use crate::deletion_queue::DeletionQueueClient; use crate::http::routes::ACTIVE_TENANT_TIMEOUT; -use crate::metrics::{TENANT, TENANT_MANAGER as METRICS}; +use crate::metrics::{LOCAL_DATA_LOSS_SUSPECTED, TENANT, TENANT_MANAGER as METRICS}; use crate::task_mgr::{BACKGROUND_RUNTIME, TaskKind}; use crate::tenant::config::{ AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig, @@ -538,6 +538,21 @@ pub async fn init_tenant_mgr( // Determine which tenants are to be secondary or attached, and in which generation let tenant_modes = init_load_generations(conf, &tenant_configs, resources, cancel).await?; + // Hadron local SSD check: Raise an alert if our local filesystem does not contain any tenants but the re-attach request returned tenants. + // This can happen if the PS suffered a Kubernetes node failure resulting in loss of all local data, but recovered quickly on another node + // so the Storage Controller has not had the time to move tenants out. + let data_loss_suspected = if let Some(tenant_modes) = &tenant_modes { + tenant_configs.is_empty() && !tenant_modes.is_empty() + } else { + false + }; + if data_loss_suspected { + tracing::error!( + "Local data loss suspected: no tenants found on local filesystem, but re-attach request returned tenants" + ); + } + LOCAL_DATA_LOSS_SUSPECTED.set(if data_loss_suspected { 1 } else { 0 }); + tracing::info!( "Attaching {} tenants at startup, warming up {} at a time", tenant_configs.len(), diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index ffb4717d9f..f2fbf656a6 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -141,11 +141,29 @@ pub(super) async fn upload_timeline_layer<'a>( let fs_size = usize::try_from(fs_size) .with_context(|| format!("convert {local_path:?} size {fs_size} usize"))?; - + /* BEGIN_HADRON */ + let mut metadata = None; + match storage { + // Pass the file path as a storage metadata to minimize changes to neon. + // Otherwise, we need to change the upload interface. + GenericRemoteStorage::AzureBlob(s) => { + let block_size_mb = s.put_block_size_mb.unwrap_or(0); + if block_size_mb > 0 && fs_size > block_size_mb * 1024 * 1024 { + metadata = Some(remote_storage::StorageMetadata::from([( + "databricks_azure_put_block", + local_path.as_str(), + )])); + } + } + GenericRemoteStorage::LocalFs(_) => {} + GenericRemoteStorage::AwsS3(_) => {} + GenericRemoteStorage::Unreliable(_) => {} + }; + /* END_HADRON */ let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE); storage - .upload(reader, fs_size, remote_path, None, cancel) + .upload(reader, fs_size, remote_path, metadata, cancel) .await .with_context(|| format!("upload layer from local path '{local_path}'")) } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 356f495972..2ae6b7ff3d 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -34,6 +34,21 @@ use crate::virtual_file::owned_buffers_io::write::FlushTaskError; /// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work. static CONCURRENT_BACKGROUND_TASKS: Lazy = Lazy::new(|| { let total_threads = TOKIO_WORKER_THREADS.get(); + + /*BEGIN_HADRON*/ + // ideally we should run at least one compaction task per tenant in order to (1) maximize + // compaction throughput (2) avoid head-of-line blocking of large compactions. However doing + // that may create too many compaction tasks with lots of memory overheads. So we limit the + // number of compaction tasks based on the available CPU core count. + // Need to revisit. + // let tasks_per_thread = std::env::var("BG_TASKS_PER_THREAD") + // .ok() + // .and_then(|s| s.parse().ok()) + // .unwrap_or(4); + // let permits = usize::max(1, total_threads * tasks_per_thread); + // // assert!(permits < total_threads, "need threads for other work"); + /*END_HADRON*/ + let permits = max(1, (total_threads * 3).checked_div(4).unwrap_or(0)); assert_ne!(permits, 0, "we will not be adding in permits later"); assert!(permits < total_threads, "need threads for other work"); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 44a4f1e911..293f3c484d 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -6742,7 +6742,7 @@ impl Timeline { } /// Reconstruct a value, using the given base image and WAL records in 'data'. - async fn reconstruct_value( + pub(crate) async fn reconstruct_value( &self, key: Key, request_lsn: Lsn, diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs index 2dbff20ab2..33c97287c0 100644 --- a/pageserver/src/tenant/timeline/handle.rs +++ b/pageserver/src/tenant/timeline/handle.rs @@ -212,8 +212,12 @@ //! to the parent shard during a shard split. Eventually, the shard split task will //! shut down the parent => case (1). -use std::collections::{HashMap, hash_map}; -use std::sync::{Arc, Mutex, Weak}; +use std::collections::HashMap; +use std::collections::hash_map; +use std::sync::Arc; +use std::sync::Mutex; +use std::sync::Weak; +use std::time::Duration; use pageserver_api::shard::ShardIdentity; use tracing::{instrument, trace}; @@ -333,6 +337,44 @@ enum RoutingResult { } impl Cache { + /* BEGIN_HADRON */ + /// A wrapper of do_get to resolve the tenant shard for a get page request. + #[instrument(level = "trace", skip_all)] + pub(crate) async fn get( + &mut self, + timeline_id: TimelineId, + shard_selector: ShardSelector, + tenant_manager: &T::TenantManager, + ) -> Result, GetError> { + const GET_MAX_RETRIES: usize = 10; + const RETRY_BACKOFF: Duration = Duration::from_millis(100); + let mut attempt = 0; + loop { + attempt += 1; + match self + .do_get(timeline_id, shard_selector, tenant_manager) + .await + { + Ok(handle) => return Ok(handle), + Err(e) => { + // Retry on tenant manager error to handle tenant split more gracefully + if attempt < GET_MAX_RETRIES { + tracing::warn!( + "Fail to resolve tenant shard in attempt {}: {:?}. Retrying...", + attempt, + e + ); + tokio::time::sleep(RETRY_BACKOFF).await; + continue; + } else { + return Err(e); + } + } + } + } + } + /* END_HADRON */ + /// See module-level comment for details. /// /// Does NOT check for the shutdown state of [`Types::Timeline`]. @@ -341,7 +383,7 @@ impl Cache { /// and if so, return an error that causes the page service to /// close the connection. #[instrument(level = "trace", skip_all)] - pub(crate) async fn get( + async fn do_get( &mut self, timeline_id: TimelineId, shard_selector: ShardSelector, @@ -879,6 +921,7 @@ mod tests { .await .err() .expect("documented behavior: can't get new handle after shutdown"); + assert_eq!(cache.map.len(), 1, "next access cleans up the cache"); cache diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index b17b5a15f9..c6d3cafe9a 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -566,22 +566,55 @@ impl PostgresRedoManager { } } +#[cfg(test)] +pub(crate) mod harness { + use super::PostgresRedoManager; + use crate::config::PageServerConf; + use utils::{id::TenantId, shard::TenantShardId}; + + pub struct RedoHarness { + // underscored because unused, except for removal at drop + _repo_dir: camino_tempfile::Utf8TempDir, + pub manager: PostgresRedoManager, + tenant_shard_id: TenantShardId, + } + + impl RedoHarness { + pub fn new() -> anyhow::Result { + crate::tenant::harness::setup_logging(); + + let repo_dir = camino_tempfile::tempdir()?; + let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); + let conf = Box::leak(Box::new(conf)); + let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); + + let manager = PostgresRedoManager::new(conf, tenant_shard_id); + + Ok(RedoHarness { + _repo_dir: repo_dir, + manager, + tenant_shard_id, + }) + } + pub fn span(&self) -> tracing::Span { + tracing::info_span!("RedoHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()) + } + } +} + #[cfg(test)] mod tests { use std::str::FromStr; use bytes::Bytes; use pageserver_api::key::Key; - use pageserver_api::shard::TenantShardId; use postgres_ffi::PgMajorVersion; use tracing::Instrument; - use utils::id::TenantId; use utils::lsn::Lsn; use wal_decoder::models::record::NeonWalRecord; - use super::PostgresRedoManager; - use crate::config::PageServerConf; use crate::walredo::RedoAttemptType; + use crate::walredo::harness::RedoHarness; #[tokio::test] async fn test_ping() { @@ -692,33 +725,4 @@ mod tests { ) ] } - - struct RedoHarness { - // underscored because unused, except for removal at drop - _repo_dir: camino_tempfile::Utf8TempDir, - manager: PostgresRedoManager, - tenant_shard_id: TenantShardId, - } - - impl RedoHarness { - fn new() -> anyhow::Result { - crate::tenant::harness::setup_logging(); - - let repo_dir = camino_tempfile::tempdir()?; - let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); - let conf = Box::leak(Box::new(conf)); - let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); - - let manager = PostgresRedoManager::new(conf, tenant_shard_id); - - Ok(RedoHarness { - _repo_dir: repo_dir, - manager, - tenant_shard_id, - }) - } - fn span(&self) -> tracing::Span { - tracing::info_span!("RedoHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()) - } - } } diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 1dd4fe8316..6e600b5a86 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -159,6 +159,9 @@ PAGESERVER_GLOBAL_METRICS: tuple[str, ...] = ( ) PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = ( + # BEGIN_HADRON + "pageserver_active_storage_operations_count", + # END_HADRON "pageserver_current_logical_size", "pageserver_resident_physical_size", "pageserver_io_operations_bytes_total", diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 6a715c4b93..0e4dd571c0 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -111,6 +111,14 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( ".*stalling layer flushes for compaction backpressure.*", ".*layer roll waiting for flush due to compaction backpressure.*", ".*BatchSpanProcessor.*", + # Can happen in tests that purposely wipe pageserver "local disk" data. + ".*Local data loss suspected.*", + # Too many frozen layers error is normal during intensive benchmarks + ".*too many frozen layers.*", + # Transient errors when resolving tenant shards by page service + ".*Fail to resolve tenant shard in attempt.*", + # Expected warnings when pageserver has not refreshed GC info yet + ".*pitr LSN/interval not found, skipping force image creation LSN calculation.*", ".*No broker updates received for a while.*", *( [ diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 93c621f564..8ff767eca4 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1,8 +1,11 @@ from __future__ import annotations import os +import random +import threading import time from collections import defaultdict +from threading import Event from typing import TYPE_CHECKING, Any import pytest @@ -1505,6 +1508,171 @@ def test_sharding_split_failures( env.storage_controller.consistency_check() +@pytest.mark.skip(reason="The backpressure change has not been merged yet.") +def test_back_pressure_during_split(neon_env_builder: NeonEnvBuilder): + """ + Test backpressure can ignore new shards during tenant split so that if we abort the split, + PG can continue without being blocked. + """ + DBNAME = "regression" + + init_shard_count = 4 + neon_env_builder.num_pageservers = init_shard_count + stripe_size = 32 + + env = neon_env_builder.init_start( + initial_tenant_shard_count=init_shard_count, initial_tenant_shard_stripe_size=stripe_size + ) + + env.storage_controller.allowed_errors.extend( + [ + # All split failures log a warning when then enqueue the abort operation + ".*Enqueuing background abort.*", + # Tolerate any error lots that mention a failpoint + ".*failpoint.*", + ] + ) + + endpoint = env.endpoints.create( + "main", + config_lines=[ + "max_replication_write_lag = 1MB", + "databricks.max_wal_mb_per_second = 1", + "neon.max_cluster_size = 10GB", + ], + ) + endpoint.respec(skip_pg_catalog_updates=False) # Needed for databricks_system to get created. + endpoint.start() + + endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") + + endpoint.safe_psql("CREATE TABLE usertable ( YCSB_KEY INT, FIELD0 TEXT);") + write_done = Event() + + def write_data(write_done): + while not write_done.is_set(): + endpoint.safe_psql( + "INSERT INTO usertable SELECT random(), repeat('a', 1000);", log_query=False + ) + log.info("write_data thread exiting") + + writer_thread = threading.Thread(target=write_data, args=(write_done,)) + writer_thread.start() + + env.storage_controller.configure_failpoints(("shard-split-pre-complete", "return(1)")) + # split the tenant + with pytest.raises(StorageControllerApiException): + env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=16) + + write_done.set() + writer_thread.join() + + # writing more data to page servers after split is aborted + for _i in range(5000): + endpoint.safe_psql( + "INSERT INTO usertable SELECT random(), repeat('a', 1000);", log_query=False + ) + + # wait until write lag becomes 0 + def check_write_lag_is_zero(): + res = endpoint.safe_psql( + """ + SELECT + pg_wal_lsn_diff(pg_current_wal_flush_lsn(), received_lsn) as received_lsn_lag + FROM neon.backpressure_lsns(); + """, + dbname="databricks_system", + log_query=False, + ) + log.info(f"received_lsn_lag = {res[0][0]}") + assert res[0][0] == 0 + + wait_until(check_write_lag_is_zero) + endpoint.stop_and_destroy() + + +# BEGIN_HADRON +def test_shard_resolve_during_split_abort(neon_env_builder: NeonEnvBuilder): + """ + Tests that page service is able to resolve the correct shard during tenant split without causing query errors + """ + DBNAME = "regression" + WORKER_THREADS = 16 + ROW_COUNT = 10000 + + init_shard_count = 4 + neon_env_builder.num_pageservers = 1 + stripe_size = 16 + + env = neon_env_builder.init_start( + initial_tenant_shard_count=init_shard_count, initial_tenant_shard_stripe_size=stripe_size + ) + + env.storage_controller.allowed_errors.extend( + [ + # All split failures log a warning when then enqueue the abort operation + ".*Enqueuing background abort.*", + # Tolerate any error lots that mention a failpoint + ".*failpoint.*", + ] + ) + + endpoint = env.endpoints.create("main") + endpoint.respec(skip_pg_catalog_updates=False) # Needed for databricks_system to get created. + endpoint.start() + + endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") + + # generate 10MB of data + endpoint.safe_psql( + f"CREATE TABLE usertable AS SELECT s AS KEY, repeat('a', 1000) as VALUE from generate_series(1, {ROW_COUNT}) s;" + ) + read_done = Event() + + def read_data(read_done): + i = 0 + while not read_done.is_set() or i < 10: + endpoint.safe_psql( + f"SELECT * FROM usertable where KEY = {random.randint(1, ROW_COUNT)}", + log_query=False, + ) + i += 1 + log.info(f"read_data thread exiting. Executed {i} queries.") + + reader_threads = [] + for _i in range(WORKER_THREADS): + reader_thread = threading.Thread(target=read_data, args=(read_done,)) + reader_thread.start() + reader_threads.append(reader_thread) + + env.storage_controller.configure_failpoints(("shard-split-pre-complete", "return(1)")) + # split the tenant + with pytest.raises(StorageControllerApiException): + env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=16) + + # wait until abort is done + def check_tenant_status(): + active_count = 0 + for i in range(init_shard_count): + status = env.pageserver.http_client().tenant_status( + TenantShardId(env.initial_tenant, i, init_shard_count) + ) + if status["state"]["slug"] == "Active": + active_count += 1 + assert active_count == 4 + + wait_until(check_tenant_status) + + read_done.set() + for thread in reader_threads: + thread.join() + + endpoint.stop() + + +# END_HADRON + + def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder): """ Check a scenario when one of the shards is much slower than others. From 8223c1ba9d9806ad4bf267d418d065717d70a71f Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 8 Jul 2025 22:58:18 +0200 Subject: [PATCH 11/28] pageserver/client_grpc: add initial gRPC client pools (#12434) ## Problem The communicator will need gRPC channel/client/stream pools for efficient reuse across many backends. Touches #11735. Requires #12396. ## Summary of changes Adds three nested resource pools: * `ChannelPool` for gRPC channels (i.e. TCP connections). * `ClientPool` for gRPC clients (i.e. `page_api::Client`). Acquires channels from `ChannelPool`. * `StreamPool` for gRPC GetPage streams. Acquires clients from `ClientPool`. These are minimal functional implementations that will need further improvements and performance optimization. However, the overall structure is expected to be roughly final, so reviews should focus on that. The pools are not yet in use, but will form the foundation of a rich gRPC Pageserver client used by the communicator (see #12462). This PR also adds the initial crate scaffolding for that client. See doc comments for details. --- Cargo.lock | 15 + Cargo.toml | 1 + pageserver/client_grpc/Cargo.toml | 16 + pageserver/client_grpc/src/lib.rs | 14 + pageserver/client_grpc/src/pool.rs | 586 +++++++++++++++++++++++++++++ 5 files changed, 632 insertions(+) create mode 100644 pageserver/client_grpc/Cargo.toml create mode 100644 pageserver/client_grpc/src/lib.rs create mode 100644 pageserver/client_grpc/src/pool.rs diff --git a/Cargo.lock b/Cargo.lock index 39c43d94a3..893932fb9d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4493,6 +4493,21 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "pageserver_client_grpc" +version = "0.1.0" +dependencies = [ + "anyhow", + "futures", + "pageserver_page_api", + "tokio", + "tokio-stream", + "tonic 0.13.1", + "tracing", + "utils", + "workspace_hack", +] + [[package]] name = "pageserver_compaction" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 840e3c6036..14f2cfcb56 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,6 +8,7 @@ members = [ "pageserver/compaction", "pageserver/ctl", "pageserver/client", + "pageserver/client_grpc", "pageserver/pagebench", "pageserver/page_api", "proxy", diff --git a/pageserver/client_grpc/Cargo.toml b/pageserver/client_grpc/Cargo.toml new file mode 100644 index 0000000000..5a3a2761c2 --- /dev/null +++ b/pageserver/client_grpc/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "pageserver_client_grpc" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +anyhow.workspace = true +futures.workspace = true +pageserver_page_api.workspace = true +tokio.workspace = true +tokio-stream.workspace = true +tonic.workspace = true +tracing.workspace = true +utils.workspace = true +workspace_hack.workspace = true diff --git a/pageserver/client_grpc/src/lib.rs b/pageserver/client_grpc/src/lib.rs new file mode 100644 index 0000000000..c900e1a939 --- /dev/null +++ b/pageserver/client_grpc/src/lib.rs @@ -0,0 +1,14 @@ +//! A rich Pageserver gRPC client. This client is more capable than the basic `page_api::Client` +//! gRPC client, and supports: +//! +//! * Sharded tenants across multiple Pageservers. +//! * Pooling of connections, clients, and streams for efficient resource use. +//! * Concurrent use by many callers. +//! * Internal handling of GetPage bidirectional streams. +//! * Automatic retries. +//! * Observability. +//! +//! The client is under development, this package is just a shell. + +#[allow(unused)] +mod pool; diff --git a/pageserver/client_grpc/src/pool.rs b/pageserver/client_grpc/src/pool.rs new file mode 100644 index 0000000000..518e4e5b84 --- /dev/null +++ b/pageserver/client_grpc/src/pool.rs @@ -0,0 +1,586 @@ +//! This module provides various Pageserver gRPC client resource pools. +//! +//! These pools are designed to reuse gRPC resources (connections, clients, and streams) across +//! multiple concurrent callers (i.e. Postgres backends). This avoids the resource cost and latency +//! of creating dedicated TCP connections and server tasks for every Postgres backend. +//! +//! Each resource has its own, nested pool. The pools are custom-built for the properties of each +//! resource -- they are different enough that a generic pool isn't suitable. +//! +//! * ChannelPool: manages gRPC channels (TCP connections) to a single Pageserver. Multiple clients +//! can acquire and use the same channel concurrently (via HTTP/2 stream multiplexing), up to a +//! per-channel client limit. Channels may be closed when they are no longer used by any clients. +//! +//! * ClientPool: manages gRPC clients for a single tenant shard. Each client acquires a (shared) +//! channel from the ChannelPool for the client's lifetime. A client can only be acquired by a +//! single caller at a time, and is returned to the pool when dropped. Idle clients may be removed +//! from the pool after some time, to free up the channel. +//! +//! * StreamPool: manages bidirectional gRPC GetPage streams. Each stream acquires a client from the +//! ClientPool for the stream's lifetime. Internal streams are not exposed to callers; instead, it +//! returns a guard that can be used to send a single request, to properly enforce queue depth and +//! route responses. Internally, the pool will reuse or spin up a suitable stream for the request, +//! possibly pipelining multiple requests from multiple callers on the same stream (up to some +//! queue depth). Idle streams may be removed from the pool after a while to free up the client. +//! +//! Each channel corresponds to one TCP connection. Each client unary request and each stream +//! corresponds to one HTTP/2 stream and server task. +//! +//! TODO: error handling (including custom error types). +//! TODO: observability. + +use std::collections::{BTreeMap, HashMap}; +use std::ops::{Deref, DerefMut}; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::{Arc, Mutex, Weak}; + +use futures::StreamExt as _; +use tokio::sync::mpsc::{Receiver, Sender}; +use tokio::sync::{OwnedSemaphorePermit, Semaphore, mpsc, oneshot}; +use tonic::transport::{Channel, Endpoint}; +use tracing::{error, warn}; + +use pageserver_page_api as page_api; +use utils::id::{TenantId, TimelineId}; +use utils::shard::ShardIndex; + +/// Max number of concurrent clients per channel. +/// +/// TODO: tune these constants, and make them configurable. +/// TODO: consider separate limits for unary and streaming clients, so we don't fill up channels +/// with only streams. +const CLIENTS_PER_CHANNEL: usize = 16; + +/// Maximum number of concurrent clients per `ClientPool`. +const CLIENT_LIMIT: usize = 64; + +/// Max number of pipelined requests per gRPC GetPage stream. +const STREAM_QUEUE_DEPTH: usize = 2; + +/// A gRPC channel pool, for a single Pageserver. A channel is shared by many clients (via HTTP/2 +/// stream multiplexing), up to `CLIENTS_PER_CHANNEL`. The pool does not limit the number of +/// channels, and instead relies on `ClientPool` to limit the number of concurrent clients. +/// +/// The pool is always wrapped in an outer `Arc`, to allow long-lived guards across tasks/threads. +/// +/// TODO: reap idle channels. +/// TODO: consider prewarming a set of channels, to avoid initial connection latency. +/// TODO: consider adding a circuit breaker for errors and fail fast. +pub struct ChannelPool { + /// Pageserver endpoint to connect to. + endpoint: Endpoint, + /// Open channels. + channels: Mutex>, + /// Channel ID generator. + next_channel_id: AtomicUsize, +} + +type ChannelID = usize; + +struct ChannelEntry { + /// The gRPC channel (i.e. TCP connection). Shared by multiple clients. + channel: Channel, + /// Number of clients using this channel. + clients: usize, +} + +impl ChannelPool { + /// Creates a new channel pool for the given Pageserver endpoint. + pub fn new(endpoint: E) -> anyhow::Result> + where + E: TryInto + Send + Sync + 'static, + >::Error: std::error::Error + Send + Sync, + { + Ok(Arc::new(Self { + endpoint: endpoint.try_into()?, + channels: Mutex::default(), + next_channel_id: AtomicUsize::default(), + })) + } + + /// Acquires a gRPC channel for a client. Multiple clients may acquire the same channel. + /// + /// This never blocks (except for mutex acquisition). The channel is connected lazily on first + /// use, and the `ChannelPool` does not have a channel limit. Channels will be re-established + /// automatically on failure (TODO: verify). + /// + /// Callers should not clone the returned channel, and must hold onto the returned guard as long + /// as the channel is in use. It is unfortunately not possible to enforce this: the Protobuf + /// client requires an owned `Channel` and we don't have access to the channel's internal + /// refcount. + /// + /// This is not performance-sensitive. It is only called when creating a new client, and clients + /// are pooled and reused by `ClientPool`. The total number of channels will also be small. O(n) + /// performance is therefore okay. + pub fn get(self: &Arc) -> ChannelGuard { + let mut channels = self.channels.lock().unwrap(); + + // Try to find an existing channel with available capacity. We check entries in BTreeMap + // order, to fill up the lower-ordered channels first. The ClientPool also prefers clients + // with lower-ordered channel IDs first. This will cluster clients in lower-ordered + // channels, and free up higher-ordered channels such that they can be reaped. + for (&id, entry) in channels.iter_mut() { + assert!(entry.clients <= CLIENTS_PER_CHANNEL, "channel overflow"); + if entry.clients < CLIENTS_PER_CHANNEL { + entry.clients += 1; + return ChannelGuard { + pool: Arc::downgrade(self), + id, + channel: Some(entry.channel.clone()), + }; + } + } + + // Create a new channel. We connect lazily on first use, such that we don't block here and + // other clients can join onto the same channel while it's connecting. + let channel = self.endpoint.connect_lazy(); + + let id = self.next_channel_id.fetch_add(1, Ordering::Relaxed); + let entry = ChannelEntry { + channel: channel.clone(), + clients: 1, // account for the guard below + }; + channels.insert(id, entry); + + ChannelGuard { + pool: Arc::downgrade(self), + id, + channel: Some(channel), + } + } +} + +/// Tracks a channel acquired from the pool. The owned inner channel can be obtained with `take()`, +/// since the gRPC client requires an owned `Channel`. +pub struct ChannelGuard { + pool: Weak, + id: ChannelID, + channel: Option, +} + +impl ChannelGuard { + /// Returns the inner owned channel. Panics if called more than once. The caller must hold onto + /// the guard as long as the channel is in use, and should not clone it. + pub fn take(&mut self) -> Channel { + self.channel.take().expect("channel already taken") + } +} + +/// Returns the channel to the pool. +impl Drop for ChannelGuard { + fn drop(&mut self) { + let Some(pool) = self.pool.upgrade() else { + return; // pool was dropped + }; + let mut channels = pool.channels.lock().unwrap(); + let entry = channels.get_mut(&self.id).expect("unknown channel"); + assert!(entry.clients > 0, "channel underflow"); + entry.clients -= 1; + } +} + +/// A pool of gRPC clients for a single tenant shard. Each client acquires a channel from the inner +/// `ChannelPool`. A client is only given out to single caller at a time. The pool limits the total +/// number of concurrent clients to `CLIENT_LIMIT` via semaphore. +/// +/// The pool is always wrapped in an outer `Arc`, to allow long-lived guards across tasks/threads. +/// +/// TODO: reap idle clients. +pub struct ClientPool { + /// Tenant ID. + tenant_id: TenantId, + /// Timeline ID. + timeline_id: TimelineId, + /// Shard ID. + shard_id: ShardIndex, + /// Authentication token, if any. + auth_token: Option, + /// Channel pool to acquire channels from. + channel_pool: Arc, + /// Limits the max number of concurrent clients for this pool. + limiter: Arc, + /// Idle pooled clients. Acquired clients are removed from here and returned on drop. + /// + /// The first client in the map will be acquired next. The map is sorted by client ID, which in + /// turn is sorted by its channel ID, such that we prefer acquiring idle clients from + /// lower-ordered channels. This allows us to free up and reap higher-numbered channels as idle + /// clients are reaped. + idle: Mutex>, + /// Unique client ID generator. + next_client_id: AtomicUsize, +} + +type ClientID = (ChannelID, usize); + +struct ClientEntry { + /// The pooled gRPC client. + client: page_api::Client, + /// The channel guard for the channel used by the client. + channel_guard: ChannelGuard, +} + +impl ClientPool { + /// Creates a new client pool for the given tenant shard. Channels are acquired from the given + /// `ChannelPool`, which must point to a Pageserver that hosts the tenant shard. + pub fn new( + channel_pool: Arc, + tenant_id: TenantId, + timeline_id: TimelineId, + shard_id: ShardIndex, + auth_token: Option, + ) -> Arc { + Arc::new(Self { + tenant_id, + timeline_id, + shard_id, + auth_token, + channel_pool, + idle: Mutex::default(), + limiter: Arc::new(Semaphore::new(CLIENT_LIMIT)), + next_client_id: AtomicUsize::default(), + }) + } + + /// Gets a client from the pool, or creates a new one if necessary. Connections are established + /// lazily and do not block, but this call can block if the pool is at `CLIENT_LIMIT`. The + /// client is returned to the pool when the guard is dropped. + /// + /// This is moderately performance-sensitive. It is called for every unary request, but these + /// establish a new gRPC stream per request so they're already expensive. GetPage requests use + /// the `StreamPool` instead. + pub async fn get(self: &Arc) -> anyhow::Result { + let permit = self + .limiter + .clone() + .acquire_owned() + .await + .expect("never closed"); + + // Fast path: acquire an idle client from the pool. + if let Some((id, entry)) = self.idle.lock().unwrap().pop_first() { + return Ok(ClientGuard { + pool: Arc::downgrade(self), + id, + client: Some(entry.client), + channel_guard: Some(entry.channel_guard), + permit, + }); + } + + // Slow path: construct a new client. + let mut channel_guard = self.channel_pool.get(); + let client = page_api::Client::new( + channel_guard.take(), + self.tenant_id, + self.timeline_id, + self.shard_id, + self.auth_token.clone(), + None, + )?; + + Ok(ClientGuard { + pool: Arc::downgrade(self), + id: ( + channel_guard.id, + self.next_client_id.fetch_add(1, Ordering::Relaxed), + ), + client: Some(client), + channel_guard: Some(channel_guard), + permit, + }) + } +} + +/// A client acquired from the pool. The inner client can be accessed via Deref. The client is +/// returned to the pool when dropped. +pub struct ClientGuard { + pool: Weak, + id: ClientID, + client: Option, // Some until dropped + channel_guard: Option, // Some until dropped + permit: OwnedSemaphorePermit, +} + +impl Deref for ClientGuard { + type Target = page_api::Client; + + fn deref(&self) -> &Self::Target { + self.client.as_ref().expect("not dropped") + } +} + +impl DerefMut for ClientGuard { + fn deref_mut(&mut self) -> &mut Self::Target { + self.client.as_mut().expect("not dropped") + } +} + +/// Returns the client to the pool. +impl Drop for ClientGuard { + fn drop(&mut self) { + let Some(pool) = self.pool.upgrade() else { + return; // pool was dropped + }; + let entry = ClientEntry { + client: self.client.take().expect("dropped once"), + channel_guard: self.channel_guard.take().expect("dropped once"), + }; + pool.idle.lock().unwrap().insert(self.id, entry); + + _ = self.permit; // returned on drop, referenced for visibility + } +} + +/// A pool of bidirectional gRPC streams. Currently only used for GetPage streams. Each stream +/// acquires a client from the inner `ClientPool` for the stream's lifetime. +/// +/// Individual streams are not exposed to callers -- instead, the returned guard can be used to send +/// a single request and await the response. Internally, requests are multiplexed across streams and +/// channels. This allows proper queue depth enforcement and response routing. +/// +/// TODO: reap idle streams. +/// TODO: consider making this generic over request and response types; not currently needed. +pub struct StreamPool { + /// The client pool to acquire clients from. + client_pool: Arc, + /// All pooled streams. + /// + /// Incoming requests will be sent over an existing stream with available capacity. If all + /// streams are full, a new one is spun up and added to the pool (up to the `ClientPool` limit). + /// Each stream has an associated Tokio task that processes requests and responses. + streams: Arc>>, + /// Limits the max number of concurrent requests (not streams). + limiter: Arc, + /// Stream ID generator. + next_stream_id: AtomicUsize, +} + +type StreamID = usize; +type RequestSender = Sender<(page_api::GetPageRequest, ResponseSender)>; +type RequestReceiver = Receiver<(page_api::GetPageRequest, ResponseSender)>; +type ResponseSender = oneshot::Sender>; + +struct StreamEntry { + /// Sends caller requests to the stream task. The stream task exits when this is dropped. + sender: RequestSender, + /// Number of in-flight requests on this stream. This is an atomic to allow decrementing it on + /// completion without acquiring the `StreamPool::streams` lock. + queue_depth: Arc, +} + +impl StreamPool { + /// Creates a new stream pool, using the given client pool. + /// + /// NB: the stream pool should use a dedicated client pool. Otherwise, long-lived streams may + /// fill up the client pool and starve out unary requests. Client pools can share the same + /// `ChannelPool` though, since the channel pool is unbounded. + pub fn new(client_pool: Arc) -> Arc { + Arc::new(Self { + client_pool, + streams: Arc::default(), + limiter: Arc::new(Semaphore::new(CLIENT_LIMIT * STREAM_QUEUE_DEPTH)), + next_stream_id: AtomicUsize::default(), + }) + } + + /// Acquires an available stream from the pool, or spins up a new stream async if all streams + /// are full. Returns a guard that can be used to send a single request on the stream and await + /// the response, with queue depth quota already acquired. Blocks if the pool is at capacity + /// (i.e. `CLIENT_LIMIT * STREAM_QUEUE_DEPTH` requests in flight). + /// + /// This is very performance-sensitive, as it is on the GetPage hot path. + /// + /// TODO: this must do something more sophisticated for performance. We want: + /// + /// * Cheap, concurrent access in the common case where we can use a pooled stream. + /// * Quick acquisition of pooled streams with available capacity. + /// * Prefer streams that belong to lower-numbered channels, to reap idle channels. + /// * Prefer filling up existing streams' queue depth before spinning up new streams. + /// * Don't hold a lock while spinning up new streams. + /// * Allow concurrent clients to join onto streams while they're spun up. + /// * Allow spinning up multiple streams concurrently, but don't overshoot limits. + /// + /// For now, we just do something simple and functional, but very inefficient (linear scan). + pub async fn get(&self) -> StreamGuard { + let permit = self + .limiter + .clone() + .acquire_owned() + .await + .expect("never closed"); + let mut streams = self.streams.lock().unwrap(); + + // Look for a pooled stream with available capacity. + for entry in streams.values() { + assert!( + entry.queue_depth.load(Ordering::Relaxed) <= STREAM_QUEUE_DEPTH, + "stream queue overflow" + ); + if entry + .queue_depth + .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |queue_depth| { + // Increment the queue depth via compare-and-swap. + // TODO: review ordering. + (queue_depth < STREAM_QUEUE_DEPTH).then_some(queue_depth + 1) + }) + .is_ok() + { + return StreamGuard { + sender: entry.sender.clone(), + queue_depth: entry.queue_depth.clone(), + permit, + }; + } + } + + // No available stream, spin up a new one. We install the stream entry in the pool first and + // return the guard, while spinning up the stream task async. This allows other callers to + // join onto this stream and also create additional streams concurrently if this fills up. + let id = self.next_stream_id.fetch_add(1, Ordering::Relaxed); + let queue_depth = Arc::new(AtomicUsize::new(1)); // reserve quota for this caller + let (req_tx, req_rx) = mpsc::channel(STREAM_QUEUE_DEPTH); + let entry = StreamEntry { + sender: req_tx.clone(), + queue_depth: queue_depth.clone(), + }; + streams.insert(id, entry); + + // NB: make sure we don't overshoot the client limit. The semaphore limit is CLIENT_LIMIT * + // STREAM_QUEUE_DEPTH, but if we were to misaccount queue depth we'd try to spin up more + // streams than CLIENT_LIMIT and block on the client pool ~forever. This should not happen + // because we only acquire queue depth under lock and after acquiring a semaphore permit. + assert!(streams.len() <= CLIENT_LIMIT, "stream overflow"); + + let client_pool = self.client_pool.clone(); + let streams = self.streams.clone(); + + tokio::spawn(async move { + if let Err(err) = Self::run_stream(client_pool, req_rx).await { + error!("stream failed: {err}"); + } + // Remove stream from pool on exit. + let entry = streams.lock().unwrap().remove(&id); + assert!(entry.is_some(), "unknown stream ID: {id}"); + }); + + StreamGuard { + sender: req_tx, + queue_depth, + permit, + } + } + + /// Runs a stream task. This acquires a client from the `ClientPool` and establishes a + /// bidirectional GetPage stream, then forwards requests and responses between callers and the + /// stream. It does not track or enforce queue depths -- that's done by `get()` since it must be + /// atomic with pool stream acquisition. + /// + /// The task exits when the request channel is closed, or on a stream error. The caller is + /// responsible for removing the stream from the pool on exit. + async fn run_stream( + client_pool: Arc, + mut caller_rx: RequestReceiver, + ) -> anyhow::Result<()> { + // Acquire a client from the pool and create a stream. + let mut client = client_pool.get().await?; + + let (req_tx, req_rx) = mpsc::channel(STREAM_QUEUE_DEPTH); + let req_stream = tokio_stream::wrappers::ReceiverStream::new(req_rx); + let mut resp_stream = client.get_pages(req_stream).await?; + + // Track caller response channels by request ID. If the task returns early, these response + // channels will be dropped and the waiting callers will receive an error. + let mut callers = HashMap::with_capacity(STREAM_QUEUE_DEPTH); + + // Process requests and responses. + loop { + // NB: this can trip if the server doesn't respond to a request, so only debug_assert. + debug_assert!(callers.len() <= STREAM_QUEUE_DEPTH, "stream queue overflow"); + + tokio::select! { + // Receive requests from callers and send them to the stream. + req = caller_rx.recv() => { + // Shut down if request channel is closed. + let Some((req, resp_tx)) = req else { + return Ok(()); + }; + + // Store the response channel by request ID. + if callers.contains_key(&req.request_id) { + // Error on request ID duplicates. Ignore callers that went away. + _ = resp_tx.send(Err(tonic::Status::invalid_argument( + format!("duplicate request ID: {}", req.request_id), + ))); + continue; + } + callers.insert(req.request_id, resp_tx); + + // Send the request on the stream. Bail out if the send fails. + req_tx.send(req).await.map_err(|_| { + tonic::Status::unavailable("stream closed") + })?; + } + + // Receive responses from the stream and send them to callers. + resp = resp_stream.next() => { + // Shut down if the stream is closed, and bail out on stream errors. + let Some(resp) = resp.transpose()? else { + return Ok(()) + }; + + // Send the response to the caller. Ignore errors if the caller went away. + let Some(resp_tx) = callers.remove(&resp.request_id) else { + warn!("received response for unknown request ID: {}", resp.request_id); + continue; + }; + _ = resp_tx.send(Ok(resp)); + } + } + } + } +} + +/// A pooled stream reference. Can be used to send a single request, to properly enforce queue +/// depth. Queue depth is already reserved and will be returned on drop. +pub struct StreamGuard { + sender: RequestSender, + queue_depth: Arc, + permit: OwnedSemaphorePermit, +} + +impl StreamGuard { + /// Sends a request on the stream and awaits the response. Consumes the guard, since it's only + /// valid for a single request (to enforce queue depth). This also drops the guard on return and + /// returns the queue depth quota to the pool. + /// + /// The `GetPageRequest::request_id` must be unique across in-flight requests. + /// + /// NB: errors are often returned as `GetPageResponse::status_code` instead of `tonic::Status` + /// to avoid tearing down the stream for per-request errors. Callers must check this. + pub async fn send( + self, + req: page_api::GetPageRequest, + ) -> tonic::Result { + let (resp_tx, resp_rx) = oneshot::channel(); + + self.sender + .send((req, resp_tx)) + .await + .map_err(|_| tonic::Status::unavailable("stream closed"))?; + + resp_rx + .await + .map_err(|_| tonic::Status::unavailable("stream closed"))? + } +} + +impl Drop for StreamGuard { + fn drop(&mut self) { + // Release the queue depth reservation on drop. This can prematurely decrement it if dropped + // before the response is received, but that's okay. + let prev_queue_depth = self.queue_depth.fetch_sub(1, Ordering::SeqCst); + assert!(prev_queue_depth > 0, "stream queue underflow"); + + _ = self.permit; // returned on drop, referenced for visibility + } +} From 09ff22a4d42ee0c97724e61dc08f325b2ea39111 Mon Sep 17 00:00:00 2001 From: Suhas Thalanki <54014218+thesuhas@users.noreply.github.com> Date: Tue, 8 Jul 2025 17:12:26 -0400 Subject: [PATCH 12/28] fix(compute): removing `NEON_EXT_INT_UPD` log statement added for debugging verbosity (#12509) Removes the `NEON_EXT_INT_UPD` log statement that was added for debugging verbosity. --- compute_tools/src/compute.rs | 8 -------- 1 file changed, 8 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index f25aff1110..ec6e6c1634 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -2433,19 +2433,11 @@ LIMIT 100", // If the value is -1, we never suspend so set the value to default collection. // If the value is 0, it means default, we will just continue to use the default. if spec.suspend_timeout_seconds == -1 || spec.suspend_timeout_seconds == 0 { - info!( - "[NEON_EXT_INT_UPD] Spec Timeout: {}, New Timeout: {}", - spec.suspend_timeout_seconds, DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL - ); self.params.installed_extensions_collection_interval.store( DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL, std::sync::atomic::Ordering::SeqCst, ); } else { - info!( - "[NEON_EXT_INT_UPD] Spec Timeout: {}", - spec.suspend_timeout_seconds - ); self.params.installed_extensions_collection_interval.store( spec.suspend_timeout_seconds as u64, std::sync::atomic::Ordering::SeqCst, From 4dee2bfd82b221f90b934e5d68343caa3a0e531f Mon Sep 17 00:00:00 2001 From: Trung Dinh Date: Tue, 8 Jul 2025 14:14:04 -0700 Subject: [PATCH 13/28] pageserver: Introduce config to enable/disable eviction task (#12496) ## Problem We lost capability to explicitly disable the global eviction task (for testing). ## Summary of changes Add an `enabled` flag to `DiskUsageEvictionTaskConfig` to indicate whether we should run the eviction job or not. --- libs/pageserver_api/src/config.rs | 21 ++++- pageserver/src/config.rs | 85 ++++++++++++++----- pageserver/src/disk_usage_eviction_task.rs | 4 +- pageserver/src/utilization.rs | 7 +- ...er_max_throughput_getpage_at_latest_lsn.py | 18 +++- 5 files changed, 103 insertions(+), 32 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 00d6b61399..dc7e9aed7f 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -5,6 +5,7 @@ mod tests; use const_format::formatcp; use posthog_client_lite::PostHogClientConfig; +use utils::serde_percent::Percent; pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; @@ -223,7 +224,7 @@ pub struct ConfigToml { pub metric_collection_bucket: Option, #[serde(with = "humantime_serde")] pub synthetic_size_calculation_interval: Duration, - pub disk_usage_based_eviction: Option, + pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig, pub test_remote_failures: u64, pub ondemand_download_behavior_treat_error_as_warn: bool, #[serde(with = "humantime_serde")] @@ -273,6 +274,7 @@ pub struct ConfigToml { } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(default)] pub struct DiskUsageEvictionTaskConfig { pub max_usage_pct: utils::serde_percent::Percent, pub min_avail_bytes: u64, @@ -283,6 +285,21 @@ pub struct DiskUsageEvictionTaskConfig { /// Select sorting for evicted layers #[serde(default)] pub eviction_order: EvictionOrder, + pub enabled: bool, +} + +impl Default for DiskUsageEvictionTaskConfig { + fn default() -> Self { + Self { + max_usage_pct: Percent::new(80).unwrap(), + min_avail_bytes: 2_000_000_000, + period: Duration::from_secs(60), + #[cfg(feature = "testing")] + mock_statvfs: None, + eviction_order: EvictionOrder::default(), + enabled: true, + } + } } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -738,7 +755,7 @@ impl Default for ConfigToml { metric_collection_bucket: (None), - disk_usage_based_eviction: (None), + disk_usage_based_eviction: DiskUsageEvictionTaskConfig::default(), test_remote_failures: (0), diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 6e22f9f36e..99d7e0ca3a 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -28,7 +28,6 @@ use reqwest::Url; use storage_broker::Uri; use utils::id::{NodeId, TimelineId}; use utils::logging::{LogFormat, SecretString}; -use utils::serde_percent::Percent; use crate::tenant::storage_layer::inmemory_layer::IndexEntry; use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; @@ -146,7 +145,7 @@ pub struct PageServerConf { pub metric_collection_bucket: Option, pub synthetic_size_calculation_interval: Duration, - pub disk_usage_based_eviction: Option, + pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig, pub test_remote_failures: u64, @@ -460,16 +459,7 @@ impl PageServerConf { metric_collection_endpoint, metric_collection_bucket, synthetic_size_calculation_interval, - disk_usage_based_eviction: Some(disk_usage_based_eviction.unwrap_or( - DiskUsageEvictionTaskConfig { - max_usage_pct: Percent::new(80).unwrap(), - min_avail_bytes: 2_000_000_000, - period: Duration::from_secs(60), - #[cfg(feature = "testing")] - mock_statvfs: None, - eviction_order: Default::default(), - }, - )), + disk_usage_based_eviction, test_remote_failures, ondemand_download_behavior_treat_error_as_warn, background_task_maximum_delay, @@ -719,8 +709,9 @@ mod tests { use std::time::Duration; use camino::Utf8PathBuf; + use pageserver_api::config::{DiskUsageEvictionTaskConfig, EvictionOrder}; use rstest::rstest; - use utils::id::NodeId; + use utils::{id::NodeId, serde_percent::Percent}; use super::PageServerConf; @@ -820,19 +811,69 @@ mod tests { .expect("parse_and_validate"); } - #[test] - fn test_config_disk_usage_based_eviction_is_valid() { - let input = r#" + #[rstest] + #[ + case::omit_the_whole_config( + DiskUsageEvictionTaskConfig { + max_usage_pct: Percent::new(80).unwrap(), + min_avail_bytes: 2_000_000_000, + period: Duration::from_secs(60), + eviction_order: Default::default(), + #[cfg(feature = "testing")] + mock_statvfs: None, + enabled: true, + }, + r#" control_plane_api = "http://localhost:6666" - "#; + "#, + )] + #[ + case::omit_enabled_field( + DiskUsageEvictionTaskConfig { + max_usage_pct: Percent::new(80).unwrap(), + min_avail_bytes: 1_000_000_000, + period: Duration::from_secs(60), + eviction_order: EvictionOrder::RelativeAccessed { + highest_layer_count_loses_first: true, + }, + #[cfg(feature = "testing")] + mock_statvfs: None, + enabled: true, + }, + r#" + control_plane_api = "http://localhost:6666" + disk_usage_based_eviction = { max_usage_pct = 80, min_avail_bytes = 1000000000, period = "60s" } + "#, + )] + #[case::disabled( + DiskUsageEvictionTaskConfig { + max_usage_pct: Percent::new(80).unwrap(), + min_avail_bytes: 2_000_000_000, + period: Duration::from_secs(60), + eviction_order: EvictionOrder::RelativeAccessed { + highest_layer_count_loses_first: true, + }, + #[cfg(feature = "testing")] + mock_statvfs: None, + enabled: false, + }, + r#" + control_plane_api = "http://localhost:6666" + disk_usage_based_eviction = { enabled = false } + "# + )] + fn test_config_disk_usage_based_eviction_is_valid( + #[case] expected_disk_usage_based_eviction: DiskUsageEvictionTaskConfig, + #[case] input: &str, + ) { let config_toml = toml_edit::de::from_str::(input) .expect("disk_usage_based_eviction is valid"); let workdir = Utf8PathBuf::from("/nonexistent"); let config = PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir).unwrap(); - let disk_usage_based_eviction = config.disk_usage_based_eviction.unwrap(); - assert_eq!(disk_usage_based_eviction.max_usage_pct.get(), 80); - assert_eq!(disk_usage_based_eviction.min_avail_bytes, 2_000_000_000); - assert_eq!(disk_usage_based_eviction.period, Duration::from_secs(60)); - assert_eq!(disk_usage_based_eviction.eviction_order, Default::default()); + let disk_usage_based_eviction = config.disk_usage_based_eviction; + assert_eq!( + expected_disk_usage_based_eviction, + disk_usage_based_eviction + ); } } diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index e6529fb201..f1d34664a8 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -171,7 +171,8 @@ pub fn launch_disk_usage_global_eviction_task( tenant_manager: Arc, background_jobs_barrier: completion::Barrier, ) -> Option { - let Some(task_config) = &conf.disk_usage_based_eviction else { + let task_config = &conf.disk_usage_based_eviction; + if !task_config.enabled { info!("disk usage based eviction task not configured"); return None; }; @@ -1268,6 +1269,7 @@ mod filesystem_level_usage { #[cfg(feature = "testing")] mock_statvfs: None, eviction_order: pageserver_api::config::EvictionOrder::default(), + enabled: true, }, total_bytes: 100_000, avail_bytes: 0, diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs index 29d1a31aaf..ccfad7a391 100644 --- a/pageserver/src/utilization.rs +++ b/pageserver/src/utilization.rs @@ -45,9 +45,10 @@ pub(crate) fn regenerate( let (disk_wanted_bytes, shard_count) = tenant_manager.calculate_utilization()?; // Fetch the fraction of disk space which may be used - let disk_usable_pct = match conf.disk_usage_based_eviction.clone() { - Some(e) => e.max_usage_pct, - None => Percent::new(100).unwrap(), + let disk_usable_pct = if conf.disk_usage_based_eviction.enabled { + conf.disk_usage_based_eviction.max_usage_pct + } else { + Percent::new(100).unwrap() }; // Express a static value for how many shards we may schedule on one node diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py index bf998a2a0a..8e7055ef78 100644 --- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -71,7 +71,13 @@ def test_pageserver_characterize_latencies_with_1_client_and_throughput_with_man n_clients: int, ): setup_and_run_pagebench_benchmark( - neon_env_builder, zenbenchmark, pg_bin, n_tenants, pgbench_scale, duration, n_clients + neon_env_builder, + zenbenchmark, + pg_bin, + n_tenants, + pgbench_scale, + duration, + n_clients, ) @@ -86,7 +92,8 @@ def setup_and_run_pagebench_benchmark( ): def record(metric, **kwargs): zenbenchmark.record( - metric_name=f"pageserver_max_throughput_getpage_at_latest_lsn.{metric}", **kwargs + metric_name=f"pageserver_max_throughput_getpage_at_latest_lsn.{metric}", + **kwargs, ) params: dict[str, tuple[Any, dict[str, Any]]] = {} @@ -104,7 +111,7 @@ def setup_and_run_pagebench_benchmark( # configure cache sizes like in prod page_cache_size = 16384 max_file_descriptors = 500000 - neon_env_builder.pageserver_config_override = f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; disk_usage_based_eviction={{max_usage_pct=99, min_avail_bytes=0, period = '999y'}}" + neon_env_builder.pageserver_config_override = f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; disk_usage_based_eviction={{enabled = false}}" tracing_config = PageserverTracingConfig( sampling_ratio=(0, 1000), @@ -120,7 +127,10 @@ def setup_and_run_pagebench_benchmark( page_cache_size * 8192, {"unit": "byte"}, ), - "pageserver_config_override.max_file_descriptors": (max_file_descriptors, {"unit": ""}), + "pageserver_config_override.max_file_descriptors": ( + max_file_descriptors, + {"unit": ""}, + ), "pageserver_config_override.sampling_ratio": (ratio, {"unit": ""}), } ) From c848b995b296124b686b4eeec54b08aee3e539a1 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 8 Jul 2025 22:24:59 +0100 Subject: [PATCH 14/28] safekeeper: trim dead senders before adding more (#12490) ## Problem We only trim the senders if we tried to send a message to them and discovered that the channel is closed. This is problematic if the pageserver keeps connecting while there's nothing to send back for the shard. In this scenario we never trim down the senders list and can panic due to the u8 limit. ## Summary of Changes Trim down the dead senders before adding a new one. Closes LKB-178 --- safekeeper/src/send_interpreted_wal.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs index 3797ac39d1..72a436e25f 100644 --- a/safekeeper/src/send_interpreted_wal.rs +++ b/safekeeper/src/send_interpreted_wal.rs @@ -561,6 +561,20 @@ impl InterpretedWalReader { // Update internal and external state, then reset the WAL stream // if required. let senders = self.shard_senders.entry(shard_id).or_default(); + + // Clean up any shard senders that have dropped out before adding the new + // one. This avoids a build up of dead senders. + senders.retain(|sender| { + let closed = sender.tx.is_closed(); + + if closed { + let sender_id = ShardSenderId::new(shard_id, sender.sender_id); + tracing::info!("Removed shard sender {}", sender_id); + } + + !closed + }); + let new_sender_id = match senders.last() { Some(sender) => sender.sender_id.next(), None => SenderId::first() From 43dbded8c871ef51f778b84fd1b1c264ea54ad44 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 8 Jul 2025 18:32:51 -0400 Subject: [PATCH 15/28] fix(pageserver): disallow lease creation below the applied gc cutoff (#12489) ## Problem close LKB-209 ## Summary of changes - We should not allow lease creation below the applied gc cutoff. - Also removed the condition for `AttachedSingle`. We should always check the lease against the gc cutoff in all attach modes. --------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline.rs | 42 +++++++++++++++---------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 293f3c484d..4a08172337 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -118,7 +118,6 @@ use crate::pgdatadir_mapping::{ MAX_AUX_FILE_V2_DELTAS, MetricsUpdate, }; use crate::task_mgr::TaskKind; -use crate::tenant::config::AttachmentMode; use crate::tenant::gc_result::GcResult; use crate::tenant::layer_map::LayerMap; use crate::tenant::metadata::TimelineMetadata; @@ -1771,30 +1770,31 @@ impl Timeline { existing_lease.clone() } Entry::Vacant(vacant) => { - // Reject already GC-ed LSN if we are in AttachedSingle and - // not blocked by the lsn lease deadline. + // Never allow a lease to be requested for an LSN below the applied GC cutoff. The data could have been deleted. + let latest_gc_cutoff_lsn = self.get_applied_gc_cutoff_lsn(); + if lsn < *latest_gc_cutoff_lsn { + bail!( + "tried to request an lsn lease for an lsn below the latest gc cutoff. requested at {} gc cutoff {}", + lsn, + *latest_gc_cutoff_lsn + ); + } + + // We allow create lease for those below the planned gc cutoff if we are still within the grace period + // of GC blocking. let validate = { let conf = self.tenant_conf.load(); - conf.location.attach_mode == AttachmentMode::Single - && !conf.is_gc_blocked_by_lsn_lease_deadline() + !conf.is_gc_blocked_by_lsn_lease_deadline() }; - if init || validate { - let latest_gc_cutoff_lsn = self.get_applied_gc_cutoff_lsn(); - if lsn < *latest_gc_cutoff_lsn { - bail!( - "tried to request an lsn lease for an lsn below the latest gc cutoff. requested at {} gc cutoff {}", - lsn, - *latest_gc_cutoff_lsn - ); - } - if lsn < planned_cutoff { - bail!( - "tried to request an lsn lease for an lsn below the planned gc cutoff. requested at {} planned gc cutoff {}", - lsn, - planned_cutoff - ); - } + // Do not allow initial lease creation to be below the planned gc cutoff. The client (compute_ctl) determines + // whether it is a initial lease creation or a renewal. + if (init || validate) && lsn < planned_cutoff { + bail!( + "tried to request an lsn lease for an lsn below the planned gc cutoff. requested at {} planned gc cutoff {}", + lsn, + planned_cutoff + ); } let dt: DateTime = valid_until.into(); From aac1f8efb1086b6db7c599c26912920f08d479b3 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 9 Jul 2025 10:41:36 +0200 Subject: [PATCH 16/28] refactor(compaction): eliminate `CompactionError::CollectKeyspaceError` variant (#12517) The only differentiated handling of it is for `is_critical`, which in turn is a `matches!()` on several variants of the `enum CollectKeyspaceError` which is the value contained insided `CompactionError::CollectKeyspaceError`. This PR introduces a new error for `repartition()`, allowing its immediate callers to inspect it like `is_critical` did. A drive-by fix is more precise classification of WaitLsnError::BadState when mapping to `tonic::Status`. refs - https://databricks.atlassian.net/browse/LKB-182 --- pageserver/src/http/routes.rs | 1 - pageserver/src/pgdatadir_mapping.rs | 17 +++ pageserver/src/tenant.rs | 7 - pageserver/src/tenant/tasks.rs | 3 +- pageserver/src/tenant/timeline.rs | 129 ++++++++++++------- pageserver/src/tenant/timeline/compaction.rs | 43 +++++-- 6 files changed, 130 insertions(+), 70 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 0e40dbcd15..2995a37089 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2502,7 +2502,6 @@ async fn timeline_checkpoint_handler( .map_err(|e| match e { CompactionError::ShuttingDown => ApiError::ShuttingDown, - CompactionError::CollectKeySpaceError(e) => ApiError::InternalServerError(anyhow::anyhow!(e)), CompactionError::Other(e) => ApiError::InternalServerError(e), } )?; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 31f38d485f..8532a6938f 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -141,6 +141,23 @@ pub(crate) enum CollectKeySpaceError { Cancelled, } +impl CollectKeySpaceError { + pub(crate) fn is_cancel(&self) -> bool { + match self { + CollectKeySpaceError::Decode(_) => false, + CollectKeySpaceError::PageRead(e) => e.is_cancel(), + CollectKeySpaceError::Cancelled => true, + } + } + pub(crate) fn into_anyhow(self) -> anyhow::Error { + match self { + CollectKeySpaceError::Decode(e) => anyhow::Error::new(e), + CollectKeySpaceError::PageRead(e) => anyhow::Error::new(e), + CollectKeySpaceError::Cancelled => anyhow::Error::new(self), + } + } +} + impl From for CollectKeySpaceError { fn from(err: PageReconstructError) -> Self { match err { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index b0969a96c1..f576119db8 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3324,13 +3324,6 @@ impl TenantShard { match err { err if err.is_cancel() => {} CompactionError::ShuttingDown => (), - CompactionError::CollectKeySpaceError(err) => { - // CollectKeySpaceError::Cancelled and PageRead::Cancelled are handled in `err.is_cancel` branch. - self.compaction_circuit_breaker - .lock() - .unwrap() - .fail(&CIRCUIT_BREAKERS_BROKEN, err); - } CompactionError::Other(err) => { self.compaction_circuit_breaker .lock() diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 2ae6b7ff3d..bcece5589a 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -318,7 +318,6 @@ pub(crate) fn log_compaction_error( let level = match err { e if e.is_cancel() => return, ShuttingDown => return, - CollectKeySpaceError(_) => Level::ERROR, _ if task_cancelled => Level::INFO, Other(err) => { let root_cause = err.root_cause(); @@ -328,7 +327,7 @@ pub(crate) fn log_compaction_error( .is_some_and(|e| e.is_stopping()); let timeline = root_cause .downcast_ref::() - .is_some_and(|e| e.is_stopping()); + .is_some_and(|e| e.is_cancel()); let buffered_writer_flush_task_canelled = root_cause .downcast_ref::() .is_some_and(|e| e.is_cancel()); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 4a08172337..6088f40669 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -585,6 +585,28 @@ pub(crate) enum PageReconstructError { MissingKey(Box), } +impl PageReconstructError { + pub(crate) fn is_cancel(&self) -> bool { + match self { + PageReconstructError::Other(_) => false, + PageReconstructError::AncestorLsnTimeout(e) => e.is_cancel(), + PageReconstructError::Cancelled => true, + PageReconstructError::WalRedo(_) => false, + PageReconstructError::MissingKey(_) => false, + } + } + #[allow(dead_code)] // we use the is_cancel + into_anyhow pattern in quite a few places, this one will follow soon enough + pub(crate) fn into_anyhow(self) -> anyhow::Error { + match self { + PageReconstructError::Other(e) => e, + PageReconstructError::AncestorLsnTimeout(e) => e.into_anyhow(), + PageReconstructError::Cancelled => anyhow::Error::new(self), + PageReconstructError::WalRedo(e) => e, + PageReconstructError::MissingKey(_) => anyhow::Error::new(self), + } + } +} + impl From for PageReconstructError { fn from(value: anyhow::Error) -> Self { // with walingest.rs many PageReconstructError are wrapped in as anyhow::Error @@ -738,17 +760,6 @@ impl std::fmt::Display for MissingKeyError { } } -impl PageReconstructError { - /// Returns true if this error indicates a tenant/timeline shutdown alike situation - pub(crate) fn is_stopping(&self) -> bool { - use PageReconstructError::*; - match self { - Cancelled => true, - Other(_) | AncestorLsnTimeout(_) | WalRedo(_) | MissingKey(_) => false, - } - } -} - #[derive(thiserror::Error, Debug)] pub(crate) enum CreateImageLayersError { #[error("timeline shutting down")] @@ -951,13 +962,35 @@ pub enum WaitLsnError { Timeout(String), } +impl WaitLsnError { + pub(crate) fn is_cancel(&self) -> bool { + match self { + WaitLsnError::Shutdown => true, + WaitLsnError::BadState(timeline_state) => match timeline_state { + TimelineState::Loading => false, + TimelineState::Active => false, + TimelineState::Stopping => true, + TimelineState::Broken { .. } => false, + }, + WaitLsnError::Timeout(_) => false, + } + } + pub(crate) fn into_anyhow(self) -> anyhow::Error { + match self { + WaitLsnError::Shutdown => anyhow::Error::new(self), + WaitLsnError::BadState(_) => anyhow::Error::new(self), + WaitLsnError::Timeout(_) => anyhow::Error::new(self), + } + } +} + impl From for tonic::Status { fn from(err: WaitLsnError) -> Self { use tonic::Code; - let code = match &err { - WaitLsnError::Timeout(_) => Code::Internal, - WaitLsnError::BadState(_) => Code::Internal, - WaitLsnError::Shutdown => Code::Unavailable, + let code = if err.is_cancel() { + Code::Unavailable + } else { + Code::Internal }; tonic::Status::new(code, err.to_string()) } @@ -1084,6 +1117,26 @@ enum ImageLayerCreationOutcome { Skip, } +enum RepartitionError { + Other(anyhow::Error), + CollectKeyspace(CollectKeySpaceError), +} + +impl RepartitionError { + fn is_cancel(&self) -> bool { + match self { + RepartitionError::Other(_) => false, + RepartitionError::CollectKeyspace(e) => e.is_cancel(), + } + } + fn into_anyhow(self) -> anyhow::Error { + match self { + RepartitionError::Other(e) => e, + RepartitionError::CollectKeyspace(e) => e.into_anyhow(), + } + } +} + /// Public interface functions impl Timeline { /// Get the LSN where this branch was created @@ -2070,10 +2123,6 @@ impl Timeline { Err(CompactionError::Other(_)) => { self.compaction_failed.store(true, AtomicOrdering::Relaxed) } - Err(CompactionError::CollectKeySpaceError(_)) => { - // Cancelled errors are covered by the `Err(e) if e.is_cancel()` branch. - self.compaction_failed.store(true, AtomicOrdering::Relaxed) - } }; result @@ -4963,7 +5012,7 @@ impl Timeline { ctx, ) .await - .map_err(|e| FlushLayerError::from_anyhow(self, e.into()))?; + .map_err(|e| FlushLayerError::from_anyhow(self, e.into_anyhow()))?; if self.cancel.is_cancelled() { return Err(FlushLayerError::Cancelled); @@ -5213,18 +5262,18 @@ impl Timeline { partition_size: u64, flags: EnumSet, ctx: &RequestContext, - ) -> Result<((KeyPartitioning, SparseKeyPartitioning), Lsn), CompactionError> { + ) -> Result<((KeyPartitioning, SparseKeyPartitioning), Lsn), RepartitionError> { let Ok(mut guard) = self.partitioning.try_write_guard() else { // NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline. // The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()` // and hence before the compaction task starts. - return Err(CompactionError::Other(anyhow!( + return Err(RepartitionError::Other(anyhow!( "repartition() called concurrently" ))); }; let ((dense_partition, sparse_partition), partition_lsn) = &*guard.read(); if lsn < *partition_lsn { - return Err(CompactionError::Other(anyhow!( + return Err(RepartitionError::Other(anyhow!( "repartition() called with LSN going backwards, this should not happen" ))); } @@ -5245,7 +5294,10 @@ impl Timeline { )); } - let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?; + let (dense_ks, sparse_ks) = self + .collect_keyspace(lsn, ctx) + .await + .map_err(RepartitionError::CollectKeyspace)?; let dense_partitioning = dense_ks.partition( &self.shard_identity, partition_size, @@ -6010,9 +6062,6 @@ impl Drop for Timeline { pub(crate) enum CompactionError { #[error("The timeline or pageserver is shutting down")] ShuttingDown, - /// Compaction cannot be done right now; page reconstruction and so on. - #[error("Failed to collect keyspace: {0}")] - CollectKeySpaceError(#[from] CollectKeySpaceError), #[error(transparent)] Other(anyhow::Error), } @@ -6020,27 +6069,15 @@ pub(crate) enum CompactionError { impl CompactionError { /// Errors that can be ignored, i.e., cancel and shutdown. pub fn is_cancel(&self) -> bool { - matches!( - self, - Self::ShuttingDown - | Self::CollectKeySpaceError(CollectKeySpaceError::Cancelled) - | Self::CollectKeySpaceError(CollectKeySpaceError::PageRead( - PageReconstructError::Cancelled - )) - ) + matches!(self, Self::ShuttingDown) } - /// Critical errors that indicate data corruption. - pub fn is_critical(&self) -> bool { - matches!( - self, - Self::CollectKeySpaceError( - CollectKeySpaceError::Decode(_) - | CollectKeySpaceError::PageRead( - PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_), - ) - ) - ) + pub fn from_collect_keyspace(err: CollectKeySpaceError) -> Self { + if err.is_cancel() { + Self::ShuttingDown + } else { + Self::Other(err.into_anyhow()) + } } } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 2c0b98c1e2..c263df1eb2 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -16,7 +16,8 @@ use super::{ Timeline, }; -use crate::tenant::timeline::DeltaEntry; +use crate::pgdatadir_mapping::CollectKeySpaceError; +use crate::tenant::timeline::{DeltaEntry, RepartitionError}; use crate::walredo::RedoAttemptType; use anyhow::{Context, anyhow}; use bytes::Bytes; @@ -64,7 +65,7 @@ use crate::tenant::timeline::{ DeltaLayerWriter, ImageLayerCreationOutcome, ImageLayerWriter, IoConcurrency, Layer, ResidentLayer, drop_layer_manager_rlock, }; -use crate::tenant::{DeltaLayer, MaybeOffloaded}; +use crate::tenant::{DeltaLayer, MaybeOffloaded, PageReconstructError}; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; /// Maximum number of deltas before generating an image layer in bottom-most compaction. @@ -572,7 +573,7 @@ impl GcCompactionQueue { match res { Ok(res) => Ok(res), Err(CompactionError::ShuttingDown) => Err(CompactionError::ShuttingDown), - Err(CompactionError::CollectKeySpaceError(_) | CompactionError::Other(_)) => { + Err(CompactionError::Other(_)) => { // There are some cases where traditional gc might collect some layer // files causing gc-compaction cannot read the full history of the key. // This needs to be resolved in the long-term by improving the compaction @@ -1417,22 +1418,33 @@ impl Timeline { } // Suppress errors when cancelled. - Err(_) if self.cancel.is_cancelled() => {} + // + // Log other errors but continue. Failure to repartition is normal, if the timeline was just created + // as an empty timeline. Also in unit tests, when we use the timeline as a simple + // key-value store, ignoring the datadir layout. Log the error but continue. + // + // TODO: + // 1. shouldn't we return early here if we observe cancellation + // 2. Experiment: can we stop checking self.cancel here? + Err(_) if self.cancel.is_cancelled() => {} // TODO: try how we fare removing this branch Err(err) if err.is_cancel() => {} - - // Alert on critical errors that indicate data corruption. - Err(err) if err.is_critical() => { + Err(RepartitionError::CollectKeyspace( + e @ CollectKeySpaceError::Decode(_) + | e @ CollectKeySpaceError::PageRead( + PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_), + ), + )) => { + // Alert on critical errors that indicate data corruption. critical_timeline!( self.tenant_shard_id, self.timeline_id, - "could not compact, repartitioning keyspace failed: {err:?}" + "could not compact, repartitioning keyspace failed: {e:?}" ); } - - // Log other errors. No partitioning? This is normal, if the timeline was just created - // as an empty timeline. Also in unit tests, when we use the timeline as a simple - // key-value store, ignoring the datadir layout. Log the error but continue. - Err(err) => error!("could not compact, repartitioning keyspace failed: {err:?}"), + Err(e) => error!( + "could not compact, repartitioning keyspace failed: {:?}", + e.into_anyhow() + ), }; let partition_count = self.partitioning.read().0.0.parts.len(); @@ -2518,7 +2530,10 @@ impl Timeline { return Err(CompactionError::ShuttingDown); } - let (dense_ks, _sparse_ks) = self.collect_keyspace(end_lsn, ctx).await?; + let (dense_ks, _sparse_ks) = self + .collect_keyspace(end_lsn, ctx) + .await + .map_err(CompactionError::from_collect_keyspace)?; // TODO(chi): ignore sparse_keyspace for now, compact it in the future. let mut adaptor = TimelineAdaptor::new(self, (end_lsn, dense_ks)); From 5ea0bb2d4fa1a9bccac73321512554e3a148427f Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Wed, 9 Jul 2025 11:58:46 +0200 Subject: [PATCH 17/28] proxy: Drop unused metrics (#12521) * proxy_control_plane_token_acquire_seconds * proxy_allowed_ips_cache_misses * proxy_vpc_endpoint_id_cache_stats * proxy_access_blocker_flags_cache_stats * proxy_requests_auth_rate_limits_total * proxy_endpoints_auth_rate_limits * proxy_invalid_endpoints_total --- proxy/src/metrics.rs | 26 +------------------------- 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 7a21e4ecee..9d1a3d4358 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -10,7 +10,7 @@ use measured::{ Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup, MetricGroup, }; -use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec}; +use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLogVec}; use tokio::time::{self, Instant}; use crate::control_plane::messages::ColdStartInfo; @@ -36,7 +36,6 @@ impl Metrics { metrics.proxy.redis_errors_total.init_all_dense(); metrics.proxy.redis_events_count.init_all_dense(); metrics.proxy.retries_metric.init_all_dense(); - metrics.proxy.invalid_endpoints_total.init_all_dense(); metrics.proxy.connection_failures_total.init_all_dense(); SELF.set(metrics) @@ -80,11 +79,6 @@ pub struct ProxyMetrics { )] pub console_request_latency: HistogramVec, - /// Time it takes to acquire a token to call console plane. - // largest bucket = 3^16 * 0.05ms = 2.15s - #[metric(metadata = Thresholds::exponential_buckets(0.00005, 3.0))] - pub control_plane_token_acquire_seconds: Histogram<16>, - /// Size of the HTTP request body lengths. // smallest bucket = 16 bytes // largest bucket = 4^12 * 16 bytes = 256MB @@ -98,19 +92,10 @@ pub struct ProxyMetrics { /// Number of opened connections to a database. pub http_pool_opened_connections: Gauge, - /// Number of cache hits/misses for allowed ips. - pub allowed_ips_cache_misses: CounterVec>, - /// Number of allowed ips #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))] pub allowed_ips_number: Histogram<10>, - /// Number of cache hits/misses for VPC endpoint IDs. - pub vpc_endpoint_id_cache_stats: CounterVec>, - - /// Number of cache hits/misses for access blocker flags. - pub access_blocker_flags_cache_stats: CounterVec>, - /// Number of allowed VPC endpoints IDs #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))] pub allowed_vpc_endpoint_ids: Histogram<10>, @@ -139,21 +124,12 @@ pub struct ProxyMetrics { /// Number of TLS handshake failures pub tls_handshake_failures: Counter, - /// Number of connection requests affected by authentication rate limits - pub requests_auth_rate_limits_total: Counter, - /// HLL approximate cardinality of endpoints that are connecting pub connecting_endpoints: HyperLogLogVec, 32>, /// Number of endpoints affected by errors of a given classification pub endpoints_affected_by_errors: HyperLogLogVec, 32>, - /// Number of endpoints affected by authentication rate limits - pub endpoints_auth_rate_limits: HyperLogLog<32>, - - /// Number of invalid endpoints (per protocol, per rejected). - pub invalid_endpoints_total: CounterVec, - /// Number of retries (per outcome, per retry_type). #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]))] pub retries_metric: HistogramVec, From 39159955306d505f9bae5d3677192daba9d005be Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 9 Jul 2025 13:42:46 +0200 Subject: [PATCH 18/28] pageserver/client_grpc: add rich Pageserver gRPC client (#12462) ## Problem For the communicator, we need a rich Pageserver gRPC client. Touches #11735. Requires #12434. ## Summary of changes This patch adds an initial rich Pageserver gRPC client. It supports: * Sharded tenants across multiple Pageservers. * Pooling of connections, clients, and streams for efficient resource use. * Concurrent use by many callers. * Internal handling of GetPage bidirectional streams, with pipelining and error handling. * Automatic retries. * Observability. The client is still under development. In particular, it needs GetPage batch splitting, shard map updates, and performance optimization. This will be addressed in follow-up PRs. --- Cargo.lock | 2 + libs/compute_api/src/spec.rs | 2 +- libs/pageserver_api/src/shard.rs | 6 +- libs/utils/src/shard.rs | 6 + pageserver/client_grpc/Cargo.toml | 2 + pageserver/client_grpc/src/client.rs | 303 +++++++++++++++++++++++++++ pageserver/client_grpc/src/lib.rs | 17 +- pageserver/client_grpc/src/retry.rs | 151 +++++++++++++ pageserver/page_api/src/model.rs | 15 ++ pageserver/src/page_service.rs | 2 + 10 files changed, 491 insertions(+), 15 deletions(-) create mode 100644 pageserver/client_grpc/src/client.rs create mode 100644 pageserver/client_grpc/src/retry.rs diff --git a/Cargo.lock b/Cargo.lock index 893932fb9d..558e8e2295 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4498,7 +4498,9 @@ name = "pageserver_client_grpc" version = "0.1.0" dependencies = [ "anyhow", + "compute_api", "futures", + "pageserver_api", "pageserver_page_api", "tokio", "tokio-stream", diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 60311aa3e6..0eeab2bebc 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -442,7 +442,7 @@ pub struct JwksSettings { } /// Protocol used to connect to a Pageserver. Parsed from the connstring scheme. -#[derive(Clone, Copy, Debug, Default)] +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] pub enum PageserverProtocol { /// The original protocol based on libpq and COPY. Uses postgresql:// or postgres:// scheme. #[default] diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 5a13aace64..d6f4cd5e66 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -332,7 +332,11 @@ fn hash_combine(mut a: u32, mut b: u32) -> u32 { /// /// The mapping of key to shard is not stable across changes to ShardCount: this is intentional /// and will be handled at higher levels when shards are split. -fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Key) -> ShardNumber { +pub fn key_to_shard_number( + count: ShardCount, + stripe_size: ShardStripeSize, + key: &Key, +) -> ShardNumber { // Fast path for un-sharded tenants or broadcast keys if count < ShardCount(2) || key_is_shard0(key) { return ShardNumber(0); diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs index f2b81373e2..5a0edf8cea 100644 --- a/libs/utils/src/shard.rs +++ b/libs/utils/src/shard.rs @@ -171,6 +171,12 @@ impl std::fmt::Display for ShardNumber { } } +impl std::fmt::Display for ShardCount { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + impl std::fmt::Display for ShardSlug<'_> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( diff --git a/pageserver/client_grpc/Cargo.toml b/pageserver/client_grpc/Cargo.toml index 5a3a2761c2..0a8bcad2ef 100644 --- a/pageserver/client_grpc/Cargo.toml +++ b/pageserver/client_grpc/Cargo.toml @@ -6,7 +6,9 @@ license.workspace = true [dependencies] anyhow.workspace = true +compute_api.workspace = true futures.workspace = true +pageserver_api.workspace = true pageserver_page_api.workspace = true tokio.workspace = true tokio-stream.workspace = true diff --git a/pageserver/client_grpc/src/client.rs b/pageserver/client_grpc/src/client.rs new file mode 100644 index 0000000000..5bccdeede3 --- /dev/null +++ b/pageserver/client_grpc/src/client.rs @@ -0,0 +1,303 @@ +use std::collections::HashMap; +use std::sync::Arc; + +use anyhow::anyhow; +use tracing::instrument; + +use crate::pool::{ChannelPool, ClientGuard, ClientPool, StreamGuard, StreamPool}; +use crate::retry::Retry; +use compute_api::spec::PageserverProtocol; +use pageserver_api::key::{Key, rel_block_to_key}; +use pageserver_api::shard::{ShardStripeSize, key_to_shard_number}; +use pageserver_page_api as page_api; +use utils::id::{TenantId, TimelineId}; +use utils::shard::{ShardCount, ShardIndex, ShardNumber}; + +/// A rich Pageserver gRPC client for a single tenant timeline. This client is more capable than the +/// basic `page_api::Client` gRPC client, and supports: +/// +/// * Sharded tenants across multiple Pageservers. +/// * Pooling of connections, clients, and streams for efficient resource use. +/// * Concurrent use by many callers. +/// * Internal handling of GetPage bidirectional streams, with pipelining and error handling. +/// * Automatic retries. +/// * Observability. +/// +/// TODO: this client does not support base backups or LSN leases, as these are only used by +/// compute_ctl. Consider adding this, but LSN leases need concurrent requests on all shards. +pub struct PageserverClient { + // TODO: support swapping out the shard map, e.g. via an ArcSwap. + shards: Shards, + retry: Retry, +} + +impl PageserverClient { + /// Creates a new Pageserver client for a given tenant and timeline. Uses the Pageservers given + /// in the shard map, which must be complete and must use gRPC URLs. + pub fn new( + tenant_id: TenantId, + timeline_id: TimelineId, + shard_map: HashMap, + stripe_size: ShardStripeSize, + auth_token: Option, + ) -> anyhow::Result { + let shards = Shards::new(tenant_id, timeline_id, shard_map, stripe_size, auth_token)?; + Ok(Self { + shards, + retry: Retry, + }) + } + + /// Returns whether a relation exists. + #[instrument(skip_all, fields(rel=%req.rel, lsn=%req.read_lsn))] + pub async fn check_rel_exists( + &self, + req: page_api::CheckRelExistsRequest, + ) -> tonic::Result { + self.retry + .with(async || { + // Relation metadata is only available on shard 0. + let mut client = self.shards.get_zero().client().await?; + client.check_rel_exists(req).await + }) + .await + } + + /// Returns the total size of a database, as # of bytes. + #[instrument(skip_all, fields(db_oid=%req.db_oid, lsn=%req.read_lsn))] + pub async fn get_db_size( + &self, + req: page_api::GetDbSizeRequest, + ) -> tonic::Result { + self.retry + .with(async || { + // Relation metadata is only available on shard 0. + let mut client = self.shards.get_zero().client().await?; + client.get_db_size(req).await + }) + .await + } + + /// Fetches a page. The `request_id` must be unique across all in-flight requests. + /// + /// Unlike the `page_api::Client`, this client automatically converts `status_code` into + /// `tonic::Status` errors. All responses will have `GetPageStatusCode::Ok`. + #[instrument(skip_all, fields( + req_id = %req.request_id, + rel = %req.rel, + blkno = %req.block_numbers[0], + blks = %req.block_numbers.len(), + lsn = %req.read_lsn, + ))] + pub async fn get_page( + &self, + req: page_api::GetPageRequest, + ) -> tonic::Result { + // TODO: this needs to split batch requests across shards and reassemble responses into a + // single response. It must also re-split the batch in case the shard map changes. For now, + // just use the first page. + let key = rel_block_to_key( + req.rel, + req.block_numbers + .first() + .copied() + .ok_or_else(|| tonic::Status::invalid_argument("no block numbers provided"))?, + ); + + self.retry + .with(async || { + let stream = self.shards.get_for_key(key).stream().await; + let resp = stream.send(req.clone()).await?; + + if resp.status_code != page_api::GetPageStatusCode::Ok { + return Err(tonic::Status::new( + resp.status_code.into(), + resp.reason.unwrap_or_else(|| String::from("unknown error")), + )); + } + + Ok(resp) + }) + .await + } + + /// Returns the size of a relation, as # of blocks. + #[instrument(skip_all, fields(rel=%req.rel, lsn=%req.read_lsn))] + pub async fn get_rel_size( + &self, + req: page_api::GetRelSizeRequest, + ) -> tonic::Result { + self.retry + .with(async || { + // Relation metadata is only available on shard 0. + let mut client = self.shards.get_zero().client().await?; + client.get_rel_size(req).await + }) + .await + } + + /// Fetches an SLRU segment. + #[instrument(skip_all, fields(kind=%req.kind, segno=%req.segno, lsn=%req.read_lsn))] + pub async fn get_slru_segment( + &self, + req: page_api::GetSlruSegmentRequest, + ) -> tonic::Result { + self.retry + .with(async || { + // SLRU segments are only available on shard 0. + let mut client = self.shards.get_zero().client().await?; + client.get_slru_segment(req).await + }) + .await + } +} + +/// Tracks the tenant's shards. +struct Shards { + /// The shard count. + /// + /// NB: this is 0 for unsharded tenants, following `ShardIndex::unsharded()` convention. + count: ShardCount, + /// The stripe size. Only used for sharded tenants. + stripe_size: ShardStripeSize, + /// Shards by shard index. + /// + /// NB: unsharded tenants use count 0, like `ShardIndex::unsharded()`. + /// + /// INVARIANT: every shard 0..count is present. + /// INVARIANT: shard 0 is always present. + map: HashMap, +} + +impl Shards { + /// Creates a new set of shards based on a shard map. + fn new( + tenant_id: TenantId, + timeline_id: TimelineId, + shard_map: HashMap, + stripe_size: ShardStripeSize, + auth_token: Option, + ) -> anyhow::Result { + let count = match shard_map.len() { + 0 => return Err(anyhow!("no shards provided")), + 1 => ShardCount::new(0), // NB: unsharded tenants use 0, like `ShardIndex::unsharded()` + n if n > u8::MAX as usize => return Err(anyhow!("too many shards: {n}")), + n => ShardCount::new(n as u8), + }; + + let mut map = HashMap::new(); + for (shard_id, url) in shard_map { + // The shard index must match the computed shard count, even for unsharded tenants. + if shard_id.shard_count != count { + return Err(anyhow!("invalid shard index {shard_id}, expected {count}")); + } + // The shard index' number and count must be consistent. + if !shard_id.is_unsharded() && shard_id.shard_number.0 >= shard_id.shard_count.0 { + return Err(anyhow!("invalid shard index {shard_id}")); + } + // The above conditions guarantee that we have all shards 0..count: len() matches count, + // shard number < count, and numbers are unique (via hashmap). + let shard = Shard::new(url, tenant_id, timeline_id, shard_id, auth_token.clone())?; + map.insert(shard_id, shard); + } + + Ok(Self { + count, + stripe_size, + map, + }) + } + + /// Looks up the given shard. + #[allow(clippy::result_large_err)] // TODO: check perf impact + fn get(&self, shard_id: ShardIndex) -> tonic::Result<&Shard> { + self.map + .get(&shard_id) + .ok_or_else(|| tonic::Status::not_found(format!("unknown shard {shard_id}"))) + } + + /// Looks up the shard that owns the given key. + fn get_for_key(&self, key: Key) -> &Shard { + let shard_number = key_to_shard_number(self.count, self.stripe_size, &key); + self.get(ShardIndex::new(shard_number, self.count)) + .expect("must exist") + } + + /// Returns shard 0. + fn get_zero(&self) -> &Shard { + self.get(ShardIndex::new(ShardNumber(0), self.count)) + .expect("always present") + } +} + +/// A single shard. +/// +/// TODO: consider separate pools for normal and bulk traffic, with different settings. +struct Shard { + /// Dedicated channel pool for this shard. Shared by all clients/streams in this shard. + _channel_pool: Arc, + /// Unary gRPC client pool for this shard. Uses the shared channel pool. + client_pool: Arc, + /// GetPage stream pool for this shard. Uses a dedicated client pool, but shares the channel + /// pool with unary clients. + stream_pool: Arc, +} + +impl Shard { + /// Creates a new shard. It has its own dedicated resource pools. + fn new( + url: String, + tenant_id: TenantId, + timeline_id: TimelineId, + shard_id: ShardIndex, + auth_token: Option, + ) -> anyhow::Result { + // Sanity-check that the URL uses gRPC. + if PageserverProtocol::from_connstring(&url)? != PageserverProtocol::Grpc { + return Err(anyhow!("invalid shard URL {url}: must use gRPC")); + } + + // Use a common channel pool for all clients, to multiplex unary and stream requests across + // the same TCP connections. The channel pool is unbounded (but client pools are bounded). + let channel_pool = ChannelPool::new(url)?; + + // Dedicated client pool for unary requests. + let client_pool = ClientPool::new( + channel_pool.clone(), + tenant_id, + timeline_id, + shard_id, + auth_token.clone(), + ); + + // Stream pool with dedicated client pool. If this shared a client pool with unary requests, + // long-lived streams could fill up the client pool and starve out unary requests. It shares + // the same underlying channel pool with unary clients though, which is unbounded. + let stream_pool = StreamPool::new(ClientPool::new( + channel_pool.clone(), + tenant_id, + timeline_id, + shard_id, + auth_token, + )); + + Ok(Self { + _channel_pool: channel_pool, + client_pool, + stream_pool, + }) + } + + /// Returns a pooled client for this shard. + async fn client(&self) -> tonic::Result { + self.client_pool + .get() + .await + .map_err(|err| tonic::Status::internal(format!("failed to get client: {err}"))) + } + + /// Returns a pooled stream for this shard. + async fn stream(&self) -> StreamGuard { + self.stream_pool.get().await + } +} diff --git a/pageserver/client_grpc/src/lib.rs b/pageserver/client_grpc/src/lib.rs index c900e1a939..2a59f9868c 100644 --- a/pageserver/client_grpc/src/lib.rs +++ b/pageserver/client_grpc/src/lib.rs @@ -1,14 +1,5 @@ -//! A rich Pageserver gRPC client. This client is more capable than the basic `page_api::Client` -//! gRPC client, and supports: -//! -//! * Sharded tenants across multiple Pageservers. -//! * Pooling of connections, clients, and streams for efficient resource use. -//! * Concurrent use by many callers. -//! * Internal handling of GetPage bidirectional streams. -//! * Automatic retries. -//! * Observability. -//! -//! The client is under development, this package is just a shell. - -#[allow(unused)] +mod client; mod pool; +mod retry; + +pub use client::PageserverClient; diff --git a/pageserver/client_grpc/src/retry.rs b/pageserver/client_grpc/src/retry.rs new file mode 100644 index 0000000000..b0473204d7 --- /dev/null +++ b/pageserver/client_grpc/src/retry.rs @@ -0,0 +1,151 @@ +use std::time::Duration; + +use tokio::time::Instant; +use tracing::{error, info, warn}; + +use utils::backoff::exponential_backoff_duration; + +/// A retry handler for Pageserver gRPC requests. +/// +/// This is used instead of backoff::retry for better control and observability. +pub struct Retry; + +impl Retry { + /// The per-request timeout. + // TODO: tune these, and/or make them configurable. Should we retry forever? + const REQUEST_TIMEOUT: Duration = Duration::from_secs(10); + /// The total timeout across all attempts + const TOTAL_TIMEOUT: Duration = Duration::from_secs(60); + /// The initial backoff duration. + const BASE_BACKOFF: Duration = Duration::from_millis(10); + /// The maximum backoff duration. + const MAX_BACKOFF: Duration = Duration::from_secs(10); + /// If true, log successful requests. For debugging. + const LOG_SUCCESS: bool = false; + + /// Runs the given async closure with timeouts and retries (exponential backoff). Logs errors, + /// using the current tracing span for context. + /// + /// Only certain gRPC status codes are retried, see [`Self::should_retry`]. For default + /// timeouts, see [`Self::REQUEST_TIMEOUT`] and [`Self::TOTAL_TIMEOUT`]. + pub async fn with(&self, mut f: F) -> tonic::Result + where + F: FnMut() -> O, + O: Future>, + { + let started = Instant::now(); + let deadline = started + Self::TOTAL_TIMEOUT; + let mut last_error = None; + let mut retries = 0; + loop { + // Set up a future to wait for the backoff (if any) and run the request with a timeout. + let backoff_and_try = async { + // NB: sleep() always sleeps 1ms, even when given a 0 argument. See: + // https://github.com/tokio-rs/tokio/issues/6866 + if let Some(backoff) = Self::backoff_duration(retries) { + tokio::time::sleep(backoff).await; + } + + let request_started = Instant::now(); + tokio::time::timeout(Self::REQUEST_TIMEOUT, f()) + .await + .map_err(|_| { + tonic::Status::deadline_exceeded(format!( + "request timed out after {:.3}s", + request_started.elapsed().as_secs_f64() + )) + })? + }; + + // Wait for the backoff and request, or bail out if the total timeout is exceeded. + let result = tokio::select! { + result = backoff_and_try => result, + + _ = tokio::time::sleep_until(deadline) => { + let last_error = last_error.unwrap_or_else(|| { + tonic::Status::deadline_exceeded(format!( + "request timed out after {:.3}s", + started.elapsed().as_secs_f64() + )) + }); + error!( + "giving up after {:.3}s and {retries} retries, last error {:?}: {}", + started.elapsed().as_secs_f64(), last_error.code(), last_error.message(), + ); + return Err(last_error); + } + }; + + match result { + // Success, return the result. + Ok(result) => { + if retries > 0 || Self::LOG_SUCCESS { + info!( + "request succeeded after {retries} retries in {:.3}s", + started.elapsed().as_secs_f64(), + ); + } + + return Ok(result); + } + + // Error, retry or bail out. + Err(status) => { + let (code, message) = (status.code(), status.message()); + let attempt = retries + 1; + + if !Self::should_retry(code) { + // NB: include the attempt here too. This isn't necessarily the first + // attempt, because the error may change between attempts. + error!( + "request failed with {code:?}: {message}, not retrying (attempt {attempt})" + ); + return Err(status); + } + + warn!("request failed with {code:?}: {message}, retrying (attempt {attempt})"); + + retries += 1; + last_error = Some(status); + } + } + } + } + + /// Returns the backoff duration for the given retry attempt, or None for no backoff. + fn backoff_duration(retry: usize) -> Option { + let backoff = exponential_backoff_duration( + retry as u32, + Self::BASE_BACKOFF.as_secs_f64(), + Self::MAX_BACKOFF.as_secs_f64(), + ); + (!backoff.is_zero()).then_some(backoff) + } + + /// Returns true if the given status code should be retries. + fn should_retry(code: tonic::Code) -> bool { + match code { + tonic::Code::Ok => panic!("unexpected Ok status code"), + + // These codes are transient, so retry them. + tonic::Code::Aborted => true, + tonic::Code::Cancelled => true, + tonic::Code::DeadlineExceeded => true, // maybe transient slowness + tonic::Code::Internal => true, // maybe transient failure? + tonic::Code::ResourceExhausted => true, + tonic::Code::Unavailable => true, + + // The following codes will like continue to fail, so don't retry. + tonic::Code::AlreadyExists => false, + tonic::Code::DataLoss => false, + tonic::Code::FailedPrecondition => false, + tonic::Code::InvalidArgument => false, + tonic::Code::NotFound => false, + tonic::Code::OutOfRange => false, + tonic::Code::PermissionDenied => false, + tonic::Code::Unauthenticated => false, + tonic::Code::Unimplemented => false, + tonic::Code::Unknown => false, + } + } +} diff --git a/pageserver/page_api/src/model.rs b/pageserver/page_api/src/model.rs index 4497fc6fc7..c5b6f06879 100644 --- a/pageserver/page_api/src/model.rs +++ b/pageserver/page_api/src/model.rs @@ -602,6 +602,21 @@ impl TryFrom for GetPageStatusCode { } } +impl From for tonic::Code { + fn from(status_code: GetPageStatusCode) -> Self { + use tonic::Code; + + match status_code { + GetPageStatusCode::Unknown => Code::Unknown, + GetPageStatusCode::Ok => Code::Ok, + GetPageStatusCode::NotFound => Code::NotFound, + GetPageStatusCode::InvalidRequest => Code::InvalidArgument, + GetPageStatusCode::InternalError => Code::Internal, + GetPageStatusCode::SlowDown => Code::ResourceExhausted, + } + } +} + // Fetches the size of a relation at a given LSN, as # of blocks. Only valid on shard 0, other // shards will error. #[derive(Clone, Copy, Debug)] diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 6b614deac8..70fdb2e789 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -3353,6 +3353,8 @@ impl GrpcPageServiceHandler { /// NB: errors returned from here are intercepted in get_pages(), and may be converted to a /// GetPageResponse with an appropriate status code to avoid terminating the stream. /// + /// TODO: verify that the requested pages belong to this shard. + /// /// TODO: get_vectored() currently enforces a batch limit of 32. Postgres will typically send /// batches up to effective_io_concurrency = 100. Either we have to accept large batches, or /// split them up in the client or server. From 7049003cf790768eddf38a2e004d046e282adfb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 9 Jul 2025 14:02:44 +0200 Subject: [PATCH 19/28] storcon: print viability of --timelines-onto-safekeepers (#12485) The `--timelines-onto-safekeepers` flag is very consequential in the sense that it controls every single timeline creation. However, we don't have any automatic insight whether enabling the option will break things or not. The main way things can break is by misconfigured safekeepers, say they are marked as paused in the storcon db. The best input so far we can obtain via manually connecting via storcon_cli and listing safekeepers, but this is cumbersome and manual so prone to human error. So at storcon startup, do a simulated "test creation" in which we call `timelines_onto_safekeepers` with the configuration provided to us, and print whether it was successful or not. No actual timeline is created, and nothing is written into the storcon db. The heartbeat info will not have reached us at that point yet, but that's okay, because we still fall back to safekeepers that don't have any heartbeat. Also print some general scheduling policy stats on initial safekeeper load. Part of #11670. --- libs/pageserver_api/src/models.rs | 2 +- storage_controller/src/service.rs | 24 +++++++++++++++++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 16545364c1..6735320484 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -384,7 +384,7 @@ pub struct SafekeepersInfo { pub safekeepers: Vec, } -#[derive(Serialize, Deserialize, Clone)] +#[derive(Serialize, Deserialize, Clone, Debug)] pub struct SafekeeperInfo { pub id: NodeId, pub hostname: String, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 403ae15b59..ed6643d641 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1677,7 +1677,21 @@ impl Service { .collect::>>()?; let safekeepers: HashMap = safekeepers.into_iter().map(|n| (n.get_id(), n)).collect(); - tracing::info!("Loaded {} safekeepers from database.", safekeepers.len()); + let count_policy = |policy| { + safekeepers + .iter() + .filter(|sk| sk.1.scheduling_policy() == policy) + .count() + }; + let active_sk_count = count_policy(SkSchedulingPolicy::Active); + let activating_sk_count = count_policy(SkSchedulingPolicy::Activating); + let pause_sk_count = count_policy(SkSchedulingPolicy::Pause); + let decom_sk_count = count_policy(SkSchedulingPolicy::Decomissioned); + tracing::info!( + "Loaded {} safekeepers from database. Active {active_sk_count}, activating {activating_sk_count}, \ + paused {pause_sk_count}, decomissioned {decom_sk_count}.", + safekeepers.len() + ); metrics::METRICS_REGISTRY .metrics_group .storage_controller_safekeeper_nodes @@ -1969,6 +1983,14 @@ impl Service { } }); + // Check that there is enough safekeepers configured that we can create new timelines + let test_sk_res = this.safekeepers_for_new_timeline().await; + tracing::info!( + timeline_safekeeper_count = config.timeline_safekeeper_count, + timelines_onto_safekeepers = config.timelines_onto_safekeepers, + "viability test result (test timeline creation on safekeepers): {test_sk_res:?}", + ); + Ok(this) } From 4ee0da0a2056d31d6d2194d862354e43193cbe23 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 9 Jul 2025 15:49:21 +0300 Subject: [PATCH 20/28] Check prefetch response before assignment to slot (#12371) ## Problem See [Slack Channel](https://databricks.enterprise.slack.com/archives/C091LHU6NNB) Dropping connection without resetting prefetch state can cause request/response mismatch. And lack of check response correctness in communicator_prefetch_lookupv can cause data corruption. ## Summary of changes 1. Validate response before assignment to prefetch slot. 2. Consume prefetch requests before sending any other requests. --------- Co-authored-by: Kosntantin Knizhnik Co-authored-by: Konstantin Knizhnik --- pgxn/neon/communicator.c | 219 ++++++++++++++++++++++++--------------- 1 file changed, 135 insertions(+), 84 deletions(-) diff --git a/pgxn/neon/communicator.c b/pgxn/neon/communicator.c index 7c84be7d15..bd53855eab 100644 --- a/pgxn/neon/communicator.c +++ b/pgxn/neon/communicator.c @@ -65,6 +65,7 @@ #include "port/pg_iovec.h" #include "postmaster/interrupt.h" #include "replication/walsender.h" +#include "storage/ipc.h" #include "utils/timeout.h" #include "bitmap.h" @@ -412,6 +413,47 @@ compact_prefetch_buffers(void) return false; } +/* + * Check that prefetch response matches the slot + */ +static void +check_getpage_response(PrefetchRequest* slot, NeonResponse* resp) +{ + if (resp->tag != T_NeonGetPageResponse && resp->tag != T_NeonErrorResponse) + { + neon_shard_log(slot->shard_no, PANIC, "Unexpected prefetch response %d, ring_receive=%ld, ring_flush=%ld, ring_unused=%ld", + resp->tag, MyPState->ring_receive, MyPState->ring_flush, MyPState->ring_unused); + } + if (neon_protocol_version >= 3) + { + NRelFileInfo rinfo = BufTagGetNRelFileInfo(slot->buftag); + if (resp->tag == T_NeonGetPageResponse) + { + NeonGetPageResponse * getpage_resp = (NeonGetPageResponse *)resp; + if (resp->reqid != slot->reqid || + resp->lsn != slot->request_lsns.request_lsn || + resp->not_modified_since != slot->request_lsns.not_modified_since || + !RelFileInfoEquals(getpage_resp->req.rinfo, rinfo) || + getpage_resp->req.forknum != slot->buftag.forkNum || + getpage_resp->req.blkno != slot->buftag.blockNum) + { + NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, + "Receive unexpected getpage response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(getpage_resp->req.rinfo), getpage_resp->req.forknum, getpage_resp->req.blkno, + slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since), RelFileInfoFmt(rinfo), slot->buftag.forkNum, slot->buftag.blockNum); + } + } + else if (resp->reqid != slot->reqid || + resp->lsn != slot->request_lsns.request_lsn || + resp->not_modified_since != slot->request_lsns.not_modified_since) + { + elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), + slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)); + } + } +} + /* * If there might be responses still in the TCP buffer, then we should try to * use those, to reduce any TCP backpressure on the OS/PS side. @@ -446,15 +488,18 @@ communicator_prefetch_pump_state(void) if (response == NULL) break; + check_getpage_response(slot, response); + /* The slot should still be valid */ if (slot->status != PRFS_REQUESTED || slot->response != NULL || slot->my_ring_index != MyPState->ring_receive) - neon_shard_log(slot->shard_no, ERROR, + { + neon_shard_log(slot->shard_no, PANIC, "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu", slot->status, slot->response, (long) slot->my_ring_index, (long) MyPState->ring_receive); - + } /* update prefetch state */ MyPState->n_responses_buffered += 1; MyPState->n_requests_inflight -= 1; @@ -593,6 +638,21 @@ readahead_buffer_resize(int newsize, void *extra) } +/* + * Callback to be called on backend exit to ensure correct state of compute-PS communication + * in case of backend cancel + */ +static void +prefetch_on_exit(int code, Datum arg) +{ + if (code != 0) /* do disconnect only on abnormal backend termination */ + { + shardno_t shard_no = DatumGetInt32(arg); + prefetch_on_ps_disconnect(); + page_server->disconnect(shard_no); + } +} + /* * Make sure that there are no responses still in the buffer. @@ -605,6 +665,11 @@ consume_prefetch_responses(void) { if (MyPState->ring_receive < MyPState->ring_unused) prefetch_wait_for(MyPState->ring_unused - 1); + /* + * We know for sure we're not working on any prefetch pages after + * this. + */ + END_PREFETCH_RECEIVE_WORK(); } static void @@ -722,10 +787,12 @@ prefetch_read(PrefetchRequest *slot) if (slot->status != PRFS_REQUESTED || slot->response != NULL || slot->my_ring_index != MyPState->ring_receive) - neon_shard_log(slot->shard_no, ERROR, + { + neon_shard_log(slot->shard_no, PANIC, "Incorrect prefetch read: status=%d response=%p my=%lu receive=%lu", slot->status, slot->response, (long)slot->my_ring_index, (long)MyPState->ring_receive); + } /* * Copy the request info so that if an error happens and the prefetch @@ -741,14 +808,18 @@ prefetch_read(PrefetchRequest *slot) MemoryContextSwitchTo(old); if (response) { + check_getpage_response(slot, response); + /* The slot should still be valid */ if (slot->status != PRFS_REQUESTED || slot->response != NULL || slot->my_ring_index != MyPState->ring_receive) - neon_shard_log(shard_no, ERROR, + { + neon_shard_log(shard_no, PANIC, "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu", slot->status, slot->response, (long) slot->my_ring_index, (long) MyPState->ring_receive); + } /* update prefetch state */ MyPState->n_responses_buffered += 1; @@ -820,11 +891,10 @@ communicator_prefetch_receive(BufferTag tag) void prefetch_on_ps_disconnect(void) { - bool save_readpage_reentrant_guard = readpage_reentrant_guard; MyPState->ring_flush = MyPState->ring_unused; - /* Prohibit callig of prefetch_pump_state */ - START_PREFETCH_RECEIVE_WORK(); + /* Nothing should cancel disconnect: we should not leave connection in opaque state */ + HOLD_INTERRUPTS(); while (MyPState->ring_receive < MyPState->ring_unused) { @@ -854,9 +924,6 @@ prefetch_on_ps_disconnect(void) MyNeonCounters->getpage_prefetch_discards_total += 1; } - /* Restore guard */ - readpage_reentrant_guard = save_readpage_reentrant_guard; - /* * We can have gone into retry due to network error, so update stats with * the latest available @@ -865,6 +932,8 @@ prefetch_on_ps_disconnect(void) MyPState->n_requests_inflight; MyNeonCounters->getpage_prefetches_buffered = MyPState->n_responses_buffered; + + RESUME_INTERRUPTS(); } /* @@ -1027,16 +1096,11 @@ communicator_prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumbe /* * Ignore errors */ - if (slot->response->tag != T_NeonGetPageResponse) + if (slot->response->tag == T_NeonErrorResponse) { - if (slot->response->tag != T_NeonErrorResponse) - { - NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, - "Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x", - T_NeonGetPageResponse, T_NeonErrorResponse, slot->response->tag); - } continue; } + Assert(slot->response->tag == T_NeonGetPageResponse); /* checked by check_getpage_response when response was assigned to the slot */ memcpy(buffers[i], ((NeonGetPageResponse*)slot->response)->page, BLCKSZ); @@ -1351,7 +1415,7 @@ equal_requests(NeonRequest* a, NeonRequest* b) static NeonResponse * page_server_request(void const *req) { - NeonResponse *resp; + NeonResponse *resp = NULL; BufferTag tag = {0}; shardno_t shard_no; @@ -1371,7 +1435,7 @@ page_server_request(void const *req) tag.blockNum = ((NeonGetPageRequest *) req)->blkno; break; default: - neon_log(ERROR, "Unexpected request tag: %d", messageTag(req)); + neon_log(PANIC, "Unexpected request tag: %d", messageTag(req)); } shard_no = get_shard_number(&tag); @@ -1384,9 +1448,12 @@ page_server_request(void const *req) shard_no = 0; } - do + consume_prefetch_responses(); + + PG_TRY(); { - PG_TRY(); + before_shmem_exit(prefetch_on_exit, Int32GetDatum(shard_no)); + do { while (!page_server->send(shard_no, (NeonRequest *) req) || !page_server->flush(shard_no)) @@ -1394,30 +1461,24 @@ page_server_request(void const *req) /* do nothing */ } MyNeonCounters->pageserver_open_requests++; - consume_prefetch_responses(); resp = page_server->receive(shard_no); MyNeonCounters->pageserver_open_requests--; - } - PG_CATCH(); - { - /* - * Cancellation in this code needs to be handled better at some - * point, but this currently seems fine for now. - */ - page_server->disconnect(shard_no); - MyNeonCounters->pageserver_open_requests = 0; + } while (resp == NULL); + cancel_before_shmem_exit(prefetch_on_exit, Int32GetDatum(shard_no)); + } + PG_CATCH(); + { + cancel_before_shmem_exit(prefetch_on_exit, Int32GetDatum(shard_no)); + /* Nothing should cancel disconnect: we should not leave connection in opaque state */ + HOLD_INTERRUPTS(); + page_server->disconnect(shard_no); + MyNeonCounters->pageserver_open_requests = 0; + RESUME_INTERRUPTS(); - /* - * We know for sure we're not working on any prefetch pages after - * this. - */ - END_PREFETCH_RECEIVE_WORK(); + PG_RE_THROW(); + } + PG_END_TRY(); - PG_RE_THROW(); - } - PG_END_TRY(); - - } while (resp == NULL); return resp; } @@ -1502,7 +1563,7 @@ nm_pack_request(NeonRequest *msg) case T_NeonDbSizeResponse: case T_NeonGetSlruSegmentResponse: default: - neon_log(ERROR, "unexpected neon message tag 0x%02x", msg->tag); + neon_log(PANIC, "unexpected neon message tag 0x%02x", msg->tag); break; } return s; @@ -1654,7 +1715,7 @@ nm_unpack_response(StringInfo s) case T_NeonDbSizeRequest: case T_NeonGetSlruSegmentRequest: default: - neon_log(ERROR, "unexpected neon message tag 0x%02x", tag); + neon_log(PANIC, "unexpected neon message tag 0x%02x", tag); break; } @@ -1983,7 +2044,7 @@ communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum, neon_request_lsns *r !RelFileInfoEquals(exists_resp->req.rinfo, request.rinfo) || exists_resp->req.forknum != request.forknum) { - NEON_PANIC_CONNECTION_STATE(-1, PANIC, + NEON_PANIC_CONNECTION_STATE(0, PANIC, "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to exits request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}", resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(exists_resp->req.rinfo), exists_resp->req.forknum, request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), request.forknum); @@ -2014,7 +2075,7 @@ communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum, neon_request_lsns *r break; default: - NEON_PANIC_CONNECTION_STATE(-1, PANIC, + NEON_PANIC_CONNECTION_STATE(0, PANIC, "Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x", T_NeonExistsResponse, T_NeonErrorResponse, resp->tag); } @@ -2158,6 +2219,7 @@ Retry: Assert(memcmp(&hashkey.buftag, &slot->buftag, sizeof(BufferTag)) == 0); Assert(hashkey.buftag.blockNum == base_blockno + i); + /* We already checked that response match request when storing it in slot */ resp = slot->response; switch (resp->tag) @@ -2165,21 +2227,6 @@ Retry: case T_NeonGetPageResponse: { NeonGetPageResponse* getpage_resp = (NeonGetPageResponse *) resp; - if (neon_protocol_version >= 3) - { - if (resp->reqid != slot->reqid || - resp->lsn != slot->request_lsns.request_lsn || - resp->not_modified_since != slot->request_lsns.not_modified_since || - !RelFileInfoEquals(getpage_resp->req.rinfo, rinfo) || - getpage_resp->req.forknum != forkNum || - getpage_resp->req.blkno != base_blockno + i) - { - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u}", - resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(getpage_resp->req.rinfo), getpage_resp->req.forknum, getpage_resp->req.blkno, - slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since), RelFileInfoFmt(rinfo), forkNum, base_blockno + i); - } - } memcpy(buffer, getpage_resp->page, BLCKSZ); /* @@ -2192,17 +2239,6 @@ Retry: break; } case T_NeonErrorResponse: - if (neon_protocol_version >= 3) - { - if (resp->reqid != slot->reqid || - resp->lsn != slot->request_lsns.request_lsn || - resp->not_modified_since != slot->request_lsns.not_modified_since) - { - elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", - resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), - slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)); - } - } ereport(ERROR, (errcode(ERRCODE_IO_ERROR), errmsg(NEON_TAG "[shard %d, reqid %lx] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", @@ -2257,7 +2293,7 @@ communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum, neon_request_lsns * !RelFileInfoEquals(relsize_resp->req.rinfo, request.rinfo) || relsize_resp->req.forknum != forknum) { - NEON_PANIC_CONNECTION_STATE(-1, PANIC, + NEON_PANIC_CONNECTION_STATE(0, PANIC, "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}", resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(relsize_resp->req.rinfo), relsize_resp->req.forknum, request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), forknum); @@ -2288,7 +2324,7 @@ communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum, neon_request_lsns * break; default: - NEON_PANIC_CONNECTION_STATE(-1, PANIC, + NEON_PANIC_CONNECTION_STATE(0, PANIC, "Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x", T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag); } @@ -2327,7 +2363,7 @@ communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns) if (!equal_requests(resp, &request.hdr) || dbsize_resp->req.dbNode != dbNode) { - NEON_PANIC_CONNECTION_STATE(-1, PANIC, + NEON_PANIC_CONNECTION_STATE(0, PANIC, "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u} to get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u}", resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), dbsize_resp->req.dbNode, request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), dbNode); @@ -2356,7 +2392,7 @@ communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns) break; default: - NEON_PANIC_CONNECTION_STATE(-1, PANIC, + NEON_PANIC_CONNECTION_STATE(0, PANIC, "Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x", T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag); } @@ -2372,7 +2408,7 @@ communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *re { int n_blocks; shardno_t shard_no = 0; /* All SLRUs are at shard 0 */ - NeonResponse *resp; + NeonResponse *resp = NULL; NeonGetSlruSegmentRequest request; request = (NeonGetSlruSegmentRequest) { @@ -2383,14 +2419,29 @@ communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *re .segno = segno }; - do + consume_prefetch_responses(); + + PG_TRY(); { - while (!page_server->send(shard_no, &request.hdr) || !page_server->flush(shard_no)); + before_shmem_exit(prefetch_on_exit, Int32GetDatum(shard_no)); + do + { + while (!page_server->send(shard_no, &request.hdr) || !page_server->flush(shard_no)); + resp = page_server->receive(shard_no); + } while (resp == NULL); + cancel_before_shmem_exit(prefetch_on_exit, Int32GetDatum(shard_no)); + } + PG_CATCH(); + { + cancel_before_shmem_exit(prefetch_on_exit, Int32GetDatum(shard_no)); + /* Nothing should cancel disconnect: we should not leave connection in opaque state */ + HOLD_INTERRUPTS(); + page_server->disconnect(shard_no); + RESUME_INTERRUPTS(); - consume_prefetch_responses(); - - resp = page_server->receive(shard_no); - } while (resp == NULL); + PG_RE_THROW(); + } + PG_END_TRY(); switch (resp->tag) { @@ -2403,7 +2454,7 @@ communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *re slru_resp->req.kind != kind || slru_resp->req.segno != segno) { - NEON_PANIC_CONNECTION_STATE(-1, PANIC, + NEON_PANIC_CONNECTION_STATE(0, PANIC, "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u} to get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%lluu}", resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), slru_resp->req.kind, slru_resp->req.segno, request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), kind, (unsigned long long) segno); @@ -2435,7 +2486,7 @@ communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *re break; default: - NEON_PANIC_CONNECTION_STATE(-1, PANIC, + NEON_PANIC_CONNECTION_STATE(0, PANIC, "Expected GetSlruSegment (0x%02x) or Error (0x%02x) response to GetSlruSegmentRequest, but got 0x%02x", T_NeonGetSlruSegmentResponse, T_NeonErrorResponse, resp->tag); } From e7d18bc1884ae7f5448fff99bd02f6d9390d5566 Mon Sep 17 00:00:00 2001 From: Mikhail Date: Wed, 9 Jul 2025 13:55:10 +0100 Subject: [PATCH 21/28] Replica promotion in compute_ctl (#12183) Add `/promote` method for `compute_ctl` promoting secondary replica to primary, depends on secondary being prewarmed. Add `compute-ctl` mode to `test_replica_promotes`, testing happy path only (no corner cases yet) Add openapi spec for `/promote` and `/lfc` handlers https://github.com/neondatabase/cloud/issues/19011 Resolves: https://github.com/neondatabase/cloud/issues/29807 --- Cargo.lock | 1 + compute_tools/Cargo.toml | 2 +- compute_tools/src/compute.rs | 7 +- compute_tools/src/compute_promote.rs | 132 +++++++++++++++ compute_tools/src/http/openapi_spec.yaml | 145 +++++++++++++++++ compute_tools/src/http/routes/mod.rs | 1 + compute_tools/src/http/routes/promote.rs | 14 ++ compute_tools/src/http/server.rs | 3 +- compute_tools/src/lib.rs | 1 + libs/compute_api/src/responses.rs | 30 +++- test_runner/fixtures/endpoint/http.py | 15 +- test_runner/regress/test_replica_promotes.py | 162 ++++++++++++------- 12 files changed, 448 insertions(+), 65 deletions(-) create mode 100644 compute_tools/src/compute_promote.rs create mode 100644 compute_tools/src/http/routes/promote.rs diff --git a/Cargo.lock b/Cargo.lock index 558e8e2295..c49a2daba7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1348,6 +1348,7 @@ dependencies = [ "p256 0.13.2", "pageserver_page_api", "postgres", + "postgres-types", "postgres_initdb", "postgres_versioninfo", "regex", diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 1a03022d89..910bae3bda 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -66,7 +66,7 @@ url.workspace = true uuid.workspace = true walkdir.workspace = true x509-cert.workspace = true - +postgres-types.workspace = true postgres_versioninfo.workspace = true postgres_initdb.workspace = true compute_api.workspace = true diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index ec6e6c1634..0496d38e67 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -3,7 +3,7 @@ use chrono::{DateTime, Utc}; use compute_api::privilege::Privilege; use compute_api::responses::{ ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus, LfcOffloadState, - LfcPrewarmState, TlsConfig, + LfcPrewarmState, PromoteState, TlsConfig, }; use compute_api::spec::{ ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PageserverProtocol, PgIdent, @@ -29,8 +29,7 @@ use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; use std::sync::{Arc, Condvar, Mutex, RwLock}; use std::time::{Duration, Instant}; use std::{env, fs}; -use tokio::task::JoinHandle; -use tokio::{spawn, time}; +use tokio::{spawn, sync::watch, task::JoinHandle, time}; use tracing::{Instrument, debug, error, info, instrument, warn}; use url::Url; use utils::id::{TenantId, TimelineId}; @@ -175,6 +174,7 @@ pub struct ComputeState { /// WAL flush LSN that is set after terminating Postgres and syncing safekeepers if /// mode == ComputeMode::Primary. None otherwise pub terminate_flush_lsn: Option, + pub promote_state: Option>, pub metrics: ComputeMetrics, } @@ -192,6 +192,7 @@ impl ComputeState { lfc_prewarm_state: LfcPrewarmState::default(), lfc_offload_state: LfcOffloadState::default(), terminate_flush_lsn: None, + promote_state: None, } } diff --git a/compute_tools/src/compute_promote.rs b/compute_tools/src/compute_promote.rs new file mode 100644 index 0000000000..42256faa22 --- /dev/null +++ b/compute_tools/src/compute_promote.rs @@ -0,0 +1,132 @@ +use crate::compute::ComputeNode; +use anyhow::{Context, Result, bail}; +use compute_api::{ + responses::{LfcPrewarmState, PromoteState, SafekeepersLsn}, + spec::ComputeMode, +}; +use std::{sync::Arc, time::Duration}; +use tokio::time::sleep; +use utils::lsn::Lsn; + +impl ComputeNode { + /// Returns only when promote fails or succeeds. If a network error occurs + /// and http client disconnects, this does not stop promotion, and subsequent + /// calls block until promote finishes. + /// Called by control plane on secondary after primary endpoint is terminated + pub async fn promote(self: &Arc, safekeepers_lsn: SafekeepersLsn) -> PromoteState { + let cloned = self.clone(); + let start_promotion = || { + let (tx, rx) = tokio::sync::watch::channel(PromoteState::NotPromoted); + tokio::spawn(async move { + tx.send(match cloned.promote_impl(safekeepers_lsn).await { + Ok(_) => PromoteState::Completed, + Err(err) => { + tracing::error!(%err, "promoting"); + PromoteState::Failed { + error: err.to_string(), + } + } + }) + }); + rx + }; + + let mut task; + // self.state is unlocked after block ends so we lock it in promote_impl + // and task.changed() is reached + { + task = self + .state + .lock() + .unwrap() + .promote_state + .get_or_insert_with(start_promotion) + .clone() + } + task.changed().await.expect("promote sender dropped"); + task.borrow().clone() + } + + // Why do we have to supply safekeepers? + // For secondary we use primary_connection_conninfo so safekeepers field is empty + async fn promote_impl(&self, safekeepers_lsn: SafekeepersLsn) -> Result<()> { + { + let state = self.state.lock().unwrap(); + let mode = &state.pspec.as_ref().unwrap().spec.mode; + if *mode != ComputeMode::Replica { + bail!("{} is not replica", mode.to_type_str()); + } + + // we don't need to query Postgres so not self.lfc_prewarm_state() + match &state.lfc_prewarm_state { + LfcPrewarmState::NotPrewarmed | LfcPrewarmState::Prewarming => { + bail!("prewarm not requested or pending") + } + LfcPrewarmState::Failed { error } => { + tracing::warn!(%error, "replica prewarm failed") + } + _ => {} + } + } + + let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf) + .await + .context("connecting to postgres")?; + + let primary_lsn = safekeepers_lsn.wal_flush_lsn; + let mut last_wal_replay_lsn: Lsn = Lsn::INVALID; + const RETRIES: i32 = 20; + for i in 0..=RETRIES { + let row = client + .query_one("SELECT pg_last_wal_replay_lsn()", &[]) + .await + .context("getting last replay lsn")?; + let lsn: u64 = row.get::(0).into(); + last_wal_replay_lsn = lsn.into(); + if last_wal_replay_lsn >= primary_lsn { + break; + } + tracing::info!("Try {i}, replica lsn {last_wal_replay_lsn}, primary lsn {primary_lsn}"); + sleep(Duration::from_secs(1)).await; + } + if last_wal_replay_lsn < primary_lsn { + bail!("didn't catch up with primary in {RETRIES} retries"); + } + + // using $1 doesn't work with ALTER SYSTEM SET + let safekeepers_sql = format!( + "ALTER SYSTEM SET neon.safekeepers='{}'", + safekeepers_lsn.safekeepers + ); + client + .query(&safekeepers_sql, &[]) + .await + .context("setting safekeepers")?; + client + .query("SELECT pg_reload_conf()", &[]) + .await + .context("reloading postgres config")?; + let row = client + .query_one("SELECT * FROM pg_promote()", &[]) + .await + .context("pg_promote")?; + if !row.get::(0) { + bail!("pg_promote() returned false"); + } + + let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf) + .await + .context("connecting to postgres")?; + let row = client + .query_one("SHOW transaction_read_only", &[]) + .await + .context("getting transaction_read_only")?; + if row.get::(0) == "on" { + bail!("replica in read only mode after promotion"); + } + + let mut state = self.state.lock().unwrap(); + state.pspec.as_mut().unwrap().spec.mode = ComputeMode::Primary; + Ok(()) + } +} diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index bbdb7d0917..eaf33d1f82 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -83,6 +83,87 @@ paths: schema: $ref: "#/components/schemas/DbsAndRoles" + /promote: + post: + tags: + - Promotion + summary: Promote secondary replica to primary + description: "" + operationId: promoteReplica + requestBody: + description: Promote requests data + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/SafekeepersLsn" + responses: + 200: + description: Promote succeeded or wasn't started + content: + application/json: + schema: + $ref: "#/components/schemas/PromoteState" + 500: + description: Promote failed + content: + application/json: + schema: + $ref: "#/components/schemas/PromoteState" + + /lfc/prewarm: + post: + summary: Request LFC Prewarm + parameters: + - name: from_endpoint + in: query + schema: + type: string + description: "" + operationId: lfcPrewarm + responses: + 202: + description: LFC prewarm started + 429: + description: LFC prewarm ongoing + get: + tags: + - Prewarm + summary: Get LFC prewarm state + description: "" + operationId: getLfcPrewarmState + responses: + 200: + description: Prewarm state + content: + application/json: + schema: + $ref: "#/components/schemas/LfcPrewarmState" + + /lfc/offload: + post: + summary: Request LFC offload + description: "" + operationId: lfcOffload + responses: + 202: + description: LFC offload started + 429: + description: LFC offload ongoing + get: + tags: + - Prewarm + summary: Get LFC offloading state + description: "" + operationId: getLfcOffloadState + responses: + 200: + description: Offload state + content: + application/json: + schema: + $ref: "#/components/schemas/LfcOffloadState" + /database_schema: get: tags: @@ -497,6 +578,70 @@ components: type: string example: "1.0.0" + SafekeepersLsn: + type: object + required: + - safekeepers + - wal_flush_lsn + properties: + safekeepers: + description: Primary replica safekeepers + type: string + wal_flush_lsn: + description: Primary last WAL flush LSN + type: string + + LfcPrewarmState: + type: object + required: + - status + - total + - prewarmed + - skipped + properties: + status: + description: Lfc prewarm status + enum: [not_prewarmed, prewarming, completed, failed] + type: string + error: + description: Lfc prewarm error, if any + type: string + total: + description: Total pages processed + type: integer + prewarmed: + description: Total pages prewarmed + type: integer + skipped: + description: Pages processed but not prewarmed + type: integer + + LfcOffloadState: + type: object + required: + - status + properties: + status: + description: Lfc offload status + enum: [not_offloaded, offloading, completed, failed] + type: string + error: + description: Lfc offload error, if any + type: string + + PromoteState: + type: object + required: + - status + properties: + status: + description: Promote result + enum: [not_promoted, completed, failed] + type: string + error: + description: Promote error, if any + type: string + InstalledExtensions: type: object properties: diff --git a/compute_tools/src/http/routes/mod.rs b/compute_tools/src/http/routes/mod.rs index 432e66a830..dd71f663eb 100644 --- a/compute_tools/src/http/routes/mod.rs +++ b/compute_tools/src/http/routes/mod.rs @@ -14,6 +14,7 @@ pub(in crate::http) mod insights; pub(in crate::http) mod lfc; pub(in crate::http) mod metrics; pub(in crate::http) mod metrics_json; +pub(in crate::http) mod promote; pub(in crate::http) mod status; pub(in crate::http) mod terminate; diff --git a/compute_tools/src/http/routes/promote.rs b/compute_tools/src/http/routes/promote.rs new file mode 100644 index 0000000000..bc5f93b4da --- /dev/null +++ b/compute_tools/src/http/routes/promote.rs @@ -0,0 +1,14 @@ +use crate::http::JsonResponse; +use axum::Form; +use http::StatusCode; + +pub(in crate::http) async fn promote( + compute: axum::extract::State>, + Form(safekeepers_lsn): Form, +) -> axum::response::Response { + let state = compute.promote(safekeepers_lsn).await; + if let compute_api::responses::PromoteState::Failed { error } = state { + return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, error); + } + JsonResponse::success(StatusCode::OK, state) +} diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs index d5d2427971..17939e39d4 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -23,7 +23,7 @@ use super::{ middleware::authorize::Authorize, routes::{ check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions, - grants, insights, lfc, metrics, metrics_json, status, terminate, + grants, insights, lfc, metrics, metrics_json, promote, status, terminate, }, }; use crate::compute::ComputeNode; @@ -87,6 +87,7 @@ impl From<&Server> for Router> { let authenticated_router = Router::>::new() .route("/lfc/prewarm", get(lfc::prewarm_state).post(lfc::prewarm)) .route("/lfc/offload", get(lfc::offload_state).post(lfc::offload)) + .route("/promote", post(promote::promote)) .route("/check_writability", post(check_writability::is_writable)) .route("/configure", post(configure::configure)) .route("/database_schema", get(database_schema::get_schema_dump)) diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index 3899a1ca76..2d5d4565b7 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -12,6 +12,7 @@ pub mod logger; pub mod catalog; pub mod compute; pub mod compute_prewarm; +pub mod compute_promote; pub mod disk_quota; pub mod extension_server; pub mod installed_extensions; diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index a54411b06a..e10c381fb4 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -46,7 +46,7 @@ pub struct ExtensionInstallResponse { pub version: ExtVersion, } -#[derive(Serialize, Default, Debug, Clone)] +#[derive(Serialize, Default, Debug, Clone, PartialEq)] #[serde(tag = "status", rename_all = "snake_case")] pub enum LfcPrewarmState { #[default] @@ -58,6 +58,17 @@ pub enum LfcPrewarmState { }, } +impl Display for LfcPrewarmState { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + LfcPrewarmState::NotPrewarmed => f.write_str("NotPrewarmed"), + LfcPrewarmState::Prewarming => f.write_str("Prewarming"), + LfcPrewarmState::Completed => f.write_str("Completed"), + LfcPrewarmState::Failed { error } => write!(f, "Error({error})"), + } + } +} + #[derive(Serialize, Default, Debug, Clone, PartialEq)] #[serde(tag = "status", rename_all = "snake_case")] pub enum LfcOffloadState { @@ -70,6 +81,23 @@ pub enum LfcOffloadState { }, } +#[derive(Serialize, Debug, Clone, PartialEq)] +#[serde(tag = "status", rename_all = "snake_case")] +/// Response of /promote +pub enum PromoteState { + NotPromoted, + Completed, + Failed { error: String }, +} + +#[derive(Deserialize, Serialize, Default, Debug, Clone)] +#[serde(rename_all = "snake_case")] +/// Result of /safekeepers_lsn +pub struct SafekeepersLsn { + pub safekeepers: String, + pub wal_flush_lsn: utils::lsn::Lsn, +} + /// Response of the /status API #[derive(Serialize, Debug, Deserialize)] #[serde(rename_all = "snake_case")] diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index 294c52321b..1d278095ce 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -2,11 +2,12 @@ from __future__ import annotations import urllib.parse from enum import StrEnum -from typing import TYPE_CHECKING, final +from typing import TYPE_CHECKING, Any, final import requests from requests.adapters import HTTPAdapter from requests.auth import AuthBase +from requests.exceptions import ReadTimeout from typing_extensions import override from fixtures.log_helper import log @@ -102,6 +103,18 @@ class EndpointHttpClient(requests.Session): wait_until(offloaded) + def promote(self, safekeepers_lsn: dict[str, Any], disconnect: bool = False): + url = f"http://localhost:{self.external_port}/promote" + if disconnect: + try: # send first request to start promote and disconnect + self.post(url, data=safekeepers_lsn, timeout=0.001) + except ReadTimeout: + pass # wait on second request which returns on promotion finish + res = self.post(url, data=safekeepers_lsn) + res.raise_for_status() + json: dict[str, str] = res.json() + return json + def database_schema(self, database: str): res = self.get( f"http://localhost:{self.external_port}/database_schema?database={urllib.parse.quote(database, safe='')}", diff --git a/test_runner/regress/test_replica_promotes.py b/test_runner/regress/test_replica_promotes.py index 4486901bae..1f26269f40 100644 --- a/test_runner/regress/test_replica_promotes.py +++ b/test_runner/regress/test_replica_promotes.py @@ -1,29 +1,51 @@ """ -File with secondary->primary promotion testing. - -This far, only contains a test that we don't break and that the data is persisted. +Secondary -> primary promotion testing """ +from enum import StrEnum from typing import cast import psycopg2 +import pytest from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import Endpoint, NeonEnv, wait_replica_caughtup -from fixtures.pg_version import PgVersion +from fixtures.utils import USE_LFC +from psycopg2.extensions import cursor as Cursor from pytest import raises def stop_and_check_lsn(ep: Endpoint, expected_lsn: Lsn | None): ep.stop(mode="immediate-terminate") lsn = ep.terminate_flush_lsn - if expected_lsn is not None: + assert (lsn is not None) == (expected_lsn is not None), f"{lsn=}, {expected_lsn=}" + if lsn is not None: assert lsn >= expected_lsn, f"{expected_lsn=} < {lsn=}" - else: - assert lsn == expected_lsn, f"{expected_lsn=} != {lsn=}" -def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion): +def get_lsn_triple(cur: Cursor) -> tuple[str, str, str]: + cur.execute( + """ + SELECT pg_current_wal_insert_lsn(), + pg_current_wal_lsn(), + pg_current_wal_flush_lsn() + """ + ) + return cast("tuple[str, str, str]", cur.fetchone()) + + +class PromoteMethod(StrEnum): + COMPUTE_CTL = "compute-ctl" + POSTGRES = "postgres" + + +METHOD_OPTIONS = [e for e in PromoteMethod] +METHOD_IDS = [e.value for e in PromoteMethod] + + +@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") +@pytest.mark.parametrize("method", METHOD_OPTIONS, ids=METHOD_IDS) +def test_replica_promote(neon_simple_env: NeonEnv, method: PromoteMethod): """ Test that a replica safely promotes, and can commit data updates which show up when the primary boots up after the promoted secondary endpoint @@ -38,29 +60,26 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion): with primary.connect() as primary_conn: primary_cur = primary_conn.cursor() + primary_cur.execute("create extension neon") primary_cur.execute( "create table t(pk bigint GENERATED ALWAYS AS IDENTITY, payload integer)" ) primary_cur.execute("INSERT INTO t(payload) SELECT generate_series(1, 100)") - primary_cur.execute( - """ - SELECT pg_current_wal_insert_lsn(), - pg_current_wal_lsn(), - pg_current_wal_flush_lsn() - """ - ) - lsn_triple = cast("tuple[str, str, str]", primary_cur.fetchone()) + + lsn_triple = get_lsn_triple(primary_cur) log.info(f"Primary: Current LSN after workload is {lsn_triple}") expected_primary_lsn: Lsn = Lsn(lsn_triple[2]) primary_cur.execute("show neon.safekeepers") safekeepers = primary_cur.fetchall()[0][0] - wait_replica_caughtup(primary, secondary) + if method == PromoteMethod.COMPUTE_CTL: + primary.http_client().offload_lfc() + else: + wait_replica_caughtup(primary, secondary) with secondary.connect() as secondary_conn: secondary_cur = secondary_conn.cursor() secondary_cur.execute("select count(*) from t") - assert secondary_cur.fetchone() == (100,) with raises(psycopg2.Error): @@ -71,28 +90,30 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion): secondary_cur.execute("select count(*) from t") assert secondary_cur.fetchone() == (100,) + primary_endpoint_id = primary.endpoint_id stop_and_check_lsn(primary, expected_primary_lsn) # Reconnect to the secondary to make sure we get a read-write connection promo_conn = secondary.connect() promo_cur = promo_conn.cursor() - promo_cur.execute(f"alter system set neon.safekeepers='{safekeepers}'") - promo_cur.execute("select pg_reload_conf()") + if method == PromoteMethod.COMPUTE_CTL: + client = secondary.http_client() + client.prewarm_lfc(primary_endpoint_id) + # control plane knows safekeepers, simulate it by querying primary + assert (lsn := primary.terminate_flush_lsn) + safekeepers_lsn = {"safekeepers": safekeepers, "wal_flush_lsn": lsn} + assert client.promote(safekeepers_lsn)["status"] == "completed" + else: + promo_cur.execute(f"alter system set neon.safekeepers='{safekeepers}'") + promo_cur.execute("select pg_reload_conf()") + promo_cur.execute("SELECT * FROM pg_promote()") + assert promo_cur.fetchone() == (True,) - promo_cur.execute("SELECT * FROM pg_promote()") - assert promo_cur.fetchone() == (True,) - promo_cur.execute( - """ - SELECT pg_current_wal_insert_lsn(), - pg_current_wal_lsn(), - pg_current_wal_flush_lsn() - """ - ) - log.info(f"Secondary: LSN after promotion is {promo_cur.fetchone()}") + lsn_triple = get_lsn_triple(promo_cur) + log.info(f"Secondary: LSN after promotion is {lsn_triple}") # Reconnect to the secondary to make sure we get a read-write connection - with secondary.connect() as new_primary_conn: - new_primary_cur = new_primary_conn.cursor() + with secondary.connect() as conn, conn.cursor() as new_primary_cur: new_primary_cur.execute("select count(*) from t") assert new_primary_cur.fetchone() == (100,) @@ -101,43 +122,34 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion): ) assert new_primary_cur.fetchall() == [(it,) for it in range(101, 201)] - new_primary_cur = new_primary_conn.cursor() + new_primary_cur = conn.cursor() new_primary_cur.execute("select payload from t") assert new_primary_cur.fetchall() == [(it,) for it in range(1, 201)] new_primary_cur.execute("select count(*) from t") assert new_primary_cur.fetchone() == (200,) - new_primary_cur.execute( - """ - SELECT pg_current_wal_insert_lsn(), - pg_current_wal_lsn(), - pg_current_wal_flush_lsn() - """ - ) - log.info(f"Secondary: LSN after workload is {new_primary_cur.fetchone()}") - with secondary.connect() as second_viewpoint_conn: - new_primary_cur = second_viewpoint_conn.cursor() + lsn_triple = get_lsn_triple(new_primary_cur) + log.info(f"Secondary: LSN after workload is {lsn_triple}") + expected_promoted_lsn = Lsn(lsn_triple[2]) + + with secondary.connect() as conn, conn.cursor() as new_primary_cur: new_primary_cur.execute("select payload from t") assert new_primary_cur.fetchall() == [(it,) for it in range(1, 201)] - # wait_for_last_flush_lsn(env, secondary, env.initial_tenant, env.initial_timeline) - - # secondaries don't sync safekeepers on finish so LSN will be None - stop_and_check_lsn(secondary, None) + if method == PromoteMethod.COMPUTE_CTL: + # compute_ctl's /promote switches replica type to Primary so it syncs + # safekeepers on finish + stop_and_check_lsn(secondary, expected_promoted_lsn) + else: + # on testing postgres, we don't update replica type, secondaries don't + # sync so lsn should be None + stop_and_check_lsn(secondary, None) primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary2") - with primary.connect() as new_primary: - new_primary_cur = new_primary.cursor() - new_primary_cur.execute( - """ - SELECT pg_current_wal_insert_lsn(), - pg_current_wal_lsn(), - pg_current_wal_flush_lsn() - """ - ) - lsn_triple = cast("tuple[str, str, str]", new_primary_cur.fetchone()) + with primary.connect() as new_primary, new_primary.cursor() as new_primary_cur: + lsn_triple = get_lsn_triple(new_primary_cur) expected_primary_lsn = Lsn(lsn_triple[2]) log.info(f"New primary: Boot LSN is {lsn_triple}") @@ -146,5 +158,39 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion): new_primary_cur.execute("INSERT INTO t (payload) SELECT generate_series(201, 300)") new_primary_cur.execute("select count(*) from t") assert new_primary_cur.fetchone() == (300,) - stop_and_check_lsn(primary, expected_primary_lsn) + + +@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") +def test_replica_promote_handler_disconnects(neon_simple_env: NeonEnv): + """ + Test that if a handler disconnects from /promote route of compute_ctl, promotion still happens + once, and no error is thrown + """ + env: NeonEnv = neon_simple_env + primary: Endpoint = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + secondary: Endpoint = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + + with primary.connect() as conn, conn.cursor() as cur: + cur.execute("create extension neon") + cur.execute("create table t(pk bigint GENERATED ALWAYS AS IDENTITY, payload integer)") + cur.execute("INSERT INTO t(payload) SELECT generate_series(1, 100)") + cur.execute("show neon.safekeepers") + safekeepers = cur.fetchall()[0][0] + + primary.http_client().offload_lfc() + primary_endpoint_id = primary.endpoint_id + primary.stop(mode="immediate-terminate") + assert (lsn := primary.terminate_flush_lsn) + + client = secondary.http_client() + client.prewarm_lfc(primary_endpoint_id) + safekeepers_lsn = {"safekeepers": safekeepers, "wal_flush_lsn": lsn} + assert client.promote(safekeepers_lsn, disconnect=True)["status"] == "completed" + + with secondary.connect() as conn, conn.cursor() as cur: + cur.execute("select count(*) from t") + assert cur.fetchone() == (100,) + cur.execute("INSERT INTO t (payload) SELECT generate_series(101, 200) RETURNING payload") + cur.execute("select count(*) from t") + assert cur.fetchone() == (200,) From 8f3351fa91c5170dbeb42b0fd84fd5ceec7b3ead Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 9 Jul 2025 16:17:22 +0200 Subject: [PATCH 22/28] pageserver/client_grpc: split GetPage batches across shards (#12469) ## Problem The rich gRPC Pageserver client needs to split GetPage batches that straddle multiple shards. Touches #11735. Requires #12462. ## Summary of changes Adds a `GetPageSplitter` which splits `GetPageRequest` that span multiple shards, and then reassembles the responses. Dispatches per-shard requests in parallel. --- Cargo.lock | 1 + pageserver/client_grpc/Cargo.toml | 1 + pageserver/client_grpc/src/client.rs | 90 ++++++++++---- pageserver/client_grpc/src/lib.rs | 1 + pageserver/client_grpc/src/split.rs | 172 +++++++++++++++++++++++++++ 5 files changed, 240 insertions(+), 25 deletions(-) create mode 100644 pageserver/client_grpc/src/split.rs diff --git a/Cargo.lock b/Cargo.lock index c49a2daba7..caed814d5f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4499,6 +4499,7 @@ name = "pageserver_client_grpc" version = "0.1.0" dependencies = [ "anyhow", + "bytes", "compute_api", "futures", "pageserver_api", diff --git a/pageserver/client_grpc/Cargo.toml b/pageserver/client_grpc/Cargo.toml index 0a8bcad2ef..84e27abb84 100644 --- a/pageserver/client_grpc/Cargo.toml +++ b/pageserver/client_grpc/Cargo.toml @@ -6,6 +6,7 @@ license.workspace = true [dependencies] anyhow.workspace = true +bytes.workspace = true compute_api.workspace = true futures.workspace = true pageserver_api.workspace = true diff --git a/pageserver/client_grpc/src/client.rs b/pageserver/client_grpc/src/client.rs index 5bccdeede3..c21ce2e47d 100644 --- a/pageserver/client_grpc/src/client.rs +++ b/pageserver/client_grpc/src/client.rs @@ -2,13 +2,15 @@ use std::collections::HashMap; use std::sync::Arc; use anyhow::anyhow; +use futures::stream::FuturesUnordered; +use futures::{FutureExt as _, StreamExt as _}; use tracing::instrument; use crate::pool::{ChannelPool, ClientGuard, ClientPool, StreamGuard, StreamPool}; use crate::retry::Retry; +use crate::split::GetPageSplitter; use compute_api::spec::PageserverProtocol; -use pageserver_api::key::{Key, rel_block_to_key}; -use pageserver_api::shard::{ShardStripeSize, key_to_shard_number}; +use pageserver_api::shard::ShardStripeSize; use pageserver_page_api as page_api; use utils::id::{TenantId, TimelineId}; use utils::shard::{ShardCount, ShardIndex, ShardNumber}; @@ -78,10 +80,11 @@ impl PageserverClient { .await } - /// Fetches a page. The `request_id` must be unique across all in-flight requests. + /// Fetches pages. The `request_id` must be unique across all in-flight requests. Automatically + /// splits requests that straddle shard boundaries, and assembles the responses. /// - /// Unlike the `page_api::Client`, this client automatically converts `status_code` into - /// `tonic::Status` errors. All responses will have `GetPageStatusCode::Ok`. + /// Unlike `page_api::Client`, this automatically converts `status_code` into `tonic::Status` + /// errors. All responses will have `GetPageStatusCode::Ok`. #[instrument(skip_all, fields( req_id = %req.request_id, rel = %req.rel, @@ -93,22 +96,55 @@ impl PageserverClient { &self, req: page_api::GetPageRequest, ) -> tonic::Result { - // TODO: this needs to split batch requests across shards and reassemble responses into a - // single response. It must also re-split the batch in case the shard map changes. For now, - // just use the first page. - let key = rel_block_to_key( - req.rel, - req.block_numbers - .first() - .copied() - .ok_or_else(|| tonic::Status::invalid_argument("no block numbers provided"))?, - ); + // Make sure we have at least one page. + if req.block_numbers.is_empty() { + return Err(tonic::Status::invalid_argument("no block number")); + } - self.retry + // Fast path: request is for a single shard. + if let Some(shard_id) = + GetPageSplitter::is_single_shard(&req, self.shards.count, self.shards.stripe_size) + { + return self.get_page_for_shard(shard_id, req).await; + } + + // Request spans multiple shards. Split it, dispatch concurrent per-shard requests, and + // reassemble the responses. + // + // TODO: when we support shard map updates, we need to detect when it changes and re-split + // the request on errors. + let mut splitter = GetPageSplitter::split(req, self.shards.count, self.shards.stripe_size); + + let mut shard_requests: FuturesUnordered<_> = splitter + .drain_requests() + .map(|(shard_id, shard_req)| { + // NB: each request will retry internally. + self.get_page_for_shard(shard_id, shard_req) + .map(move |result| result.map(|resp| (shard_id, resp))) + }) + .collect(); + + while let Some((shard_id, shard_response)) = shard_requests.next().await.transpose()? { + splitter.add_response(shard_id, shard_response)?; + } + + splitter.assemble_response() + } + + /// Fetches pages that belong to the given shard. + #[instrument(skip_all, fields(shard = %shard_id))] + async fn get_page_for_shard( + &self, + shard_id: ShardIndex, + req: page_api::GetPageRequest, + ) -> tonic::Result { + let resp = self + .retry .with(async || { - let stream = self.shards.get_for_key(key).stream().await; + let stream = self.shards.get(shard_id)?.stream().await; let resp = stream.send(req.clone()).await?; + // Convert per-request errors into a tonic::Status. if resp.status_code != page_api::GetPageStatusCode::Ok { return Err(tonic::Status::new( resp.status_code.into(), @@ -118,7 +154,18 @@ impl PageserverClient { Ok(resp) }) - .await + .await?; + + // Make sure we got the right number of pages. + // NB: check outside of the retry loop, since we don't want to retry this. + let (expected, actual) = (req.block_numbers.len(), resp.page_images.len()); + if expected != actual { + return Err(tonic::Status::internal(format!( + "expected {expected} pages for shard {shard_id}, got {actual}", + ))); + } + + Ok(resp) } /// Returns the size of a relation, as # of blocks. @@ -216,13 +263,6 @@ impl Shards { .ok_or_else(|| tonic::Status::not_found(format!("unknown shard {shard_id}"))) } - /// Looks up the shard that owns the given key. - fn get_for_key(&self, key: Key) -> &Shard { - let shard_number = key_to_shard_number(self.count, self.stripe_size, &key); - self.get(ShardIndex::new(shard_number, self.count)) - .expect("must exist") - } - /// Returns shard 0. fn get_zero(&self) -> &Shard { self.get(ShardIndex::new(ShardNumber(0), self.count)) diff --git a/pageserver/client_grpc/src/lib.rs b/pageserver/client_grpc/src/lib.rs index 2a59f9868c..3fc7178be2 100644 --- a/pageserver/client_grpc/src/lib.rs +++ b/pageserver/client_grpc/src/lib.rs @@ -1,5 +1,6 @@ mod client; mod pool; mod retry; +mod split; pub use client::PageserverClient; diff --git a/pageserver/client_grpc/src/split.rs b/pageserver/client_grpc/src/split.rs new file mode 100644 index 0000000000..5bbcaab393 --- /dev/null +++ b/pageserver/client_grpc/src/split.rs @@ -0,0 +1,172 @@ +use std::collections::HashMap; + +use bytes::Bytes; + +use pageserver_api::key::rel_block_to_key; +use pageserver_api::shard::{ShardStripeSize, key_to_shard_number}; +use pageserver_page_api as page_api; +use utils::shard::{ShardCount, ShardIndex}; + +/// Splits GetPageRequests that straddle shard boundaries and assembles the responses. +/// TODO: add tests for this. +pub struct GetPageSplitter { + /// The original request ID. Used for all shard requests. + request_id: page_api::RequestID, + /// Split requests by shard index. + requests: HashMap, + /// Maps the offset in `GetPageRequest::block_numbers` to the owning shard. Used to assemble + /// the response pages in the same order as the original request. + block_shards: Vec, + /// Page responses by shard index. Will be assembled into a single response. + responses: HashMap>, +} + +impl GetPageSplitter { + /// Checks if the given request only touches a single shard, and returns the shard ID. This is + /// the common case, so we check first in order to avoid unnecessary allocations and overhead. + /// The caller must ensure that the request has at least one block number, or this will panic. + pub fn is_single_shard( + req: &page_api::GetPageRequest, + count: ShardCount, + stripe_size: ShardStripeSize, + ) -> Option { + // Fast path: unsharded tenant. + if count.is_unsharded() { + return Some(ShardIndex::unsharded()); + } + + // Find the base shard index for the first page, and compare with the rest. + let key = rel_block_to_key(req.rel, *req.block_numbers.first().expect("no pages")); + let shard_number = key_to_shard_number(count, stripe_size, &key); + + req.block_numbers + .iter() + .skip(1) // computed above + .all(|&blkno| { + let key = rel_block_to_key(req.rel, blkno); + key_to_shard_number(count, stripe_size, &key) == shard_number + }) + .then_some(ShardIndex::new(shard_number, count)) + } + + /// Splits the given request. + pub fn split( + req: page_api::GetPageRequest, + count: ShardCount, + stripe_size: ShardStripeSize, + ) -> Self { + // The caller should make sure we don't split requests unnecessarily. + debug_assert!( + Self::is_single_shard(&req, count, stripe_size).is_none(), + "unnecessary request split" + ); + + // Split the requests by shard index. + let mut requests = HashMap::with_capacity(2); // common case + let mut block_shards = Vec::with_capacity(req.block_numbers.len()); + for blkno in req.block_numbers { + let key = rel_block_to_key(req.rel, blkno); + let shard_number = key_to_shard_number(count, stripe_size, &key); + let shard_id = ShardIndex::new(shard_number, count); + + let shard_req = requests + .entry(shard_id) + .or_insert_with(|| page_api::GetPageRequest { + request_id: req.request_id, + request_class: req.request_class, + rel: req.rel, + read_lsn: req.read_lsn, + block_numbers: Vec::new(), + }); + shard_req.block_numbers.push(blkno); + block_shards.push(shard_id); + } + + Self { + request_id: req.request_id, + responses: HashMap::with_capacity(requests.len()), + requests, + block_shards, + } + } + + /// Drains the per-shard requests, moving them out of the hashmap to avoid extra allocations. + pub fn drain_requests( + &mut self, + ) -> impl Iterator { + self.requests.drain() + } + + /// Adds a response from the given shard. + #[allow(clippy::result_large_err)] + pub fn add_response( + &mut self, + shard_id: ShardIndex, + response: page_api::GetPageResponse, + ) -> tonic::Result<()> { + // The caller should already have converted status codes into tonic::Status. + assert_eq!(response.status_code, page_api::GetPageStatusCode::Ok); + + // Make sure the response matches the request ID. + if response.request_id != self.request_id { + return Err(tonic::Status::internal(format!( + "response ID {} does not match request ID {}", + response.request_id, self.request_id + ))); + } + + // Add the response data to the map. + let old = self.responses.insert(shard_id, response.page_images); + + if old.is_some() { + return Err(tonic::Status::internal(format!( + "duplicate response for shard {shard_id}", + ))); + } + + Ok(()) + } + + /// Assembles the shard responses into a single response. Responses must be present for all + /// relevant shards, and the total number of pages must match the original request. + #[allow(clippy::result_large_err)] + pub fn assemble_response(self) -> tonic::Result { + let mut response = page_api::GetPageResponse { + request_id: self.request_id, + status_code: page_api::GetPageStatusCode::Ok, + reason: None, + page_images: Vec::with_capacity(self.block_shards.len()), + }; + + // Set up per-shard page iterators we can pull from. + let mut shard_responses = HashMap::with_capacity(self.responses.len()); + for (shard_id, responses) in self.responses { + shard_responses.insert(shard_id, responses.into_iter()); + } + + // Reassemble the responses in the same order as the original request. + for shard_id in &self.block_shards { + let page = shard_responses + .get_mut(shard_id) + .ok_or_else(|| { + tonic::Status::internal(format!("missing response for shard {shard_id}")) + })? + .next() + .ok_or_else(|| { + tonic::Status::internal(format!("missing page from shard {shard_id}")) + })?; + response.page_images.push(page); + } + + // Make sure there are no additional pages. + for (shard_id, mut pages) in shard_responses { + if pages.next().is_some() { + return Err(tonic::Status::internal(format!( + "extra pages returned from shard {shard_id}" + ))); + } + } + + Ok(response) + } +} From bc6a756f1c41145c7eba01b105c7d3a3e08829cf Mon Sep 17 00:00:00 2001 From: Mikhail Date: Wed, 9 Jul 2025 15:29:45 +0100 Subject: [PATCH 23/28] ci: lint openapi specs using redocly (#12510) We need to lint specs for pageserver, endpoint storage, and safekeeper #0000 --- .github/workflows/build_and_test.yml | 18 ++++++++++++ Makefile | 9 ++++++ compute_tools/src/http/openapi_spec.yaml | 29 ------------------- pageserver/src/http/openapi_spec.yml | 37 +++++++++--------------- 4 files changed, 41 insertions(+), 52 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 864abad574..cc9534f05d 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -87,6 +87,24 @@ jobs: uses: ./.github/workflows/build-build-tools-image.yml secrets: inherit + lint-openapi-spec: + runs-on: ubuntu-22.04 + needs: [ meta, check-permissions ] + # We do need to run this in `.*-rc-pr` because of hotfixes. + if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} + steps: + - name: Harden the runner (Audit all outbound calls) + uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 + with: + egress-policy: audit + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - run: make lint-openapi-spec + check-codestyle-python: needs: [ meta, check-permissions, build-build-tools-image ] # No need to run on `main` because we this in the merge queue. We do need to run this in `.*-rc-pr` because of hotfixes. diff --git a/Makefile b/Makefile index 4b31e26810..d07ac907b4 100644 --- a/Makefile +++ b/Makefile @@ -220,6 +220,15 @@ neon-pgindent: postgres-v17-pg-bsd-indent neon-pg-ext-v17 setup-pre-commit-hook: ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit +.PHONY: lint-openapi-spec +lint-openapi-spec: + # operation-2xx-response: pageserver timeline delete returns 404 on success + find . -iname "openapi_spec.y*ml" -exec\ + docker run --rm -v ${PWD}:/spec ghcr.io/redocly/cli:1.34.4\ + --skip-rule=operation-operationId --skip-rule=operation-summary --extends=minimal\ + --skip-rule=no-server-example.com --skip-rule=operation-2xx-response\ + lint {} \+ + # Targets for building PostgreSQL are defined in postgres.mk. # # But if the caller has indicated that PostgreSQL is already diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index eaf33d1f82..3c58b284b3 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -416,15 +416,6 @@ components: total_startup_ms: type: integer - Info: - type: object - description: Information about VM/Pod. - required: - - num_cpus - properties: - num_cpus: - type: integer - DbsAndRoles: type: object description: Databases and Roles @@ -642,26 +633,6 @@ components: description: Promote error, if any type: string - InstalledExtensions: - type: object - properties: - extensions: - description: Contains list of installed extensions. - type: array - items: - type: object - properties: - extname: - type: string - version: - type: string - items: - type: string - n_databases: - type: integer - owned_by_superuser: - type: integer - SetRoleGrantsRequest: type: object required: diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index e8d1367d6c..3ffc80f19a 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -116,26 +116,6 @@ paths: schema: type: string - /v1/tenant/{tenant_id}/timeline: - parameters: - - name: tenant_id - in: path - required: true - schema: - type: string - get: - description: Get timelines for tenant - responses: - "200": - description: TimelineInfo - content: - application/json: - schema: - type: array - items: - $ref: "#/components/schemas/TimelineInfo" - - /v1/tenant/{tenant_id}/timeline/{timeline_id}: parameters: - name: tenant_id @@ -618,7 +598,7 @@ paths: schema: $ref: "#/components/schemas/SecondaryProgress" - /v1/tenant/{tenant_id}/timeline/: + /v1/tenant/{tenant_id}/timeline: parameters: - name: tenant_id in: path @@ -685,6 +665,17 @@ paths: application/json: schema: $ref: "#/components/schemas/Error" + get: + description: Get timelines for tenant + responses: + "200": + description: TimelineInfo + content: + application/json: + schema: + type: array + items: + $ref: "#/components/schemas/TimelineInfo" /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor: parameters: @@ -767,7 +758,7 @@ paths: $ref: "#/components/schemas/ServiceUnavailableError" - /v1/tenant/: + /v1/tenant: get: description: Get tenants list responses: @@ -847,7 +838,7 @@ paths: items: $ref: "#/components/schemas/TenantInfo" - /v1/tenant/{tenant_id}/config/: + /v1/tenant/{tenant_id}/config: parameters: - name: tenant_id in: path From 5c0de4ee8ccc39f67d718c9575897b6ca0c388b8 Mon Sep 17 00:00:00 2001 From: Alexander Lakhin Date: Wed, 9 Jul 2025 18:22:54 +0300 Subject: [PATCH 24/28] Fix parameter name in workload for test_multiple_subscription_branching (#12522) ## Problem As discovered in https://github.com/neondatabase/neon/issues/12394, test_multiple_subscription_branching generates skewed data distribution, that leads to test failures when the unevenly filled last table receives even more data. for table t0: pub_res = (42001,), sub_res = (42001,) for table t1: pub_res = (29001,), sub_res = (29001,) for table t2: pub_res = (21001,), sub_res = (21001,) for table t3: pub_res = (21001,), sub_res = (21001,) for table t4: pub_res = (1711001,), sub_res = (1711001,) ## Summary of changes Fix the name of the workload parameter to generate data as expected. --- test_runner/regress/test_subscriber_branching.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/regress/test_subscriber_branching.py b/test_runner/regress/test_subscriber_branching.py index 83bebc19be..63772f7cd4 100644 --- a/test_runner/regress/test_subscriber_branching.py +++ b/test_runner/regress/test_subscriber_branching.py @@ -332,7 +332,7 @@ def test_multiple_subscription_branching(neon_simple_env: NeonEnv): last_insert_lsn = query_scalar(cursor, "select pg_current_wal_insert_lsn();") - def start_publisher_workload(table_num: int, duration: int): + def start_publisher_workload(i: int, duration: int): start = time.time() with endpoint.cursor(dbname="publisher_db") as cur: while time.time() - start < duration: From 78a6daa874b216711e3afa204bbbb0eb1af3b64b Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Wed, 9 Jul 2025 17:28:04 +0200 Subject: [PATCH 25/28] Add retrying in Random ops test if parent branch is not found. (#12513) ## Problem Due to a lag in replication, we sometimes cannot get the parent branch definition just after completion of the Public API restore call. This leads to the test failures. https://databricks.atlassian.net/browse/LKB-279 ## Summary of changes The workaround is implemented. Now test retries up to 30 seconds, waiting for the branch definition to appear. --------- Co-authored-by: Alexey Masterov --- test_runner/random_ops/test_random_ops.py | 103 ++++++++++++---------- 1 file changed, 55 insertions(+), 48 deletions(-) diff --git a/test_runner/random_ops/test_random_ops.py b/test_runner/random_ops/test_random_ops.py index d3815c40bb..5c43b06bc5 100644 --- a/test_runner/random_ops/test_random_ops.py +++ b/test_runner/random_ops/test_random_ops.py @@ -117,7 +117,9 @@ class NeonBranch: def create_child_branch(self) -> NeonBranch | None: return self.project.create_branch(self.id) - def create_ro_endpoint(self) -> NeonEndpoint: + def create_ro_endpoint(self) -> NeonEndpoint | None: + if not self.project.check_limit_endpoints(): + return None return NeonEndpoint( self.project, self.neon_api.create_endpoint(self.project_id, self.id, "read_only", {})["endpoint"], @@ -151,11 +153,26 @@ class NeonBranch: return self.updated_at = datetime.fromisoformat(res["branch"]["updated_at"]) parent_id: str = res["branch"]["parent_id"] + # XXX Retry get parent details to work around the issue + # https://databricks.atlassian.net/browse/LKB-279 + target_time = datetime.now() + timedelta(seconds=30) + while datetime.now() < target_time: + try: + parent_def = self.neon_api.get_branch_details(self.project_id, parent_id) + except HTTPError as he: + if he.response.status_code == 404: + log.info("Branch not found, waiting...") + time.sleep(1) + else: + raise HTTPError(he) from he + else: + break + else: + raise RuntimeError(f"Branch {parent_id} not found") + # Creates an object for the parent branch # After the reset operation a new parent branch is created - parent = NeonBranch( - self.project, self.neon_api.get_branch_details(self.project_id, parent_id), True - ) + parent = NeonBranch(self.project, parent_def, True) self.project.branches[parent_id] = parent self.parent = parent parent.children[self.id] = self @@ -168,29 +185,21 @@ class NeonBranch: source_timestamp: str | None = None, preserve_under_name: str | None = None, ) -> dict[str, Any] | None: + if not self.project.check_limit_branches(): + return None endpoints = [ep for ep in self.endpoints.values() if ep.type == "read_only"] # Terminate all the benchmarks running to prevent errors. Errors in benchmark during pgbench are expected for ep in endpoints: ep.terminate_benchmark() self.terminate_benchmark() - try: - res: dict[str, Any] = self.neon_api.restore_branch( - self.project_id, - self.id, - source_branch_id, - source_lsn, - source_timestamp, - preserve_under_name, - ) - except HTTPError as he: - if ( - he.response.status_code == 422 - and he.response.json()["code"] == "BRANCHES_LIMIT_EXCEEDED" - ): - log.info("Branch limit exceeded, skipping") - return None - else: - raise HTTPError(he) from he + res: dict[str, Any] = self.neon_api.restore_branch( + self.project_id, + self.id, + source_branch_id, + source_lsn, + source_timestamp, + preserve_under_name, + ) self.project.wait() self.start_benchmark() for ep in endpoints: @@ -239,19 +248,30 @@ class NeonProject: def delete(self) -> None: self.neon_api.delete_project(self.id) + def check_limit_branches(self) -> bool: + if self.limits["max_branches"] == -1 or len(self.branches) < self.limits["max_branches"]: + return True + log.info("branch limit exceeded (%s/%s)", len(self.branches), self.limits["max_branches"]) + return False + + def check_limit_endpoints(self) -> bool: + if ( + self.limits["max_read_only_endpoints"] == -1 + or self.read_only_endpoints_total < self.limits["max_read_only_endpoints"] + ): + return True + log.info( + "Maximum read only endpoint limit exceeded (%s/%s)", + self.read_only_endpoints_total, + self.limits["max_read_only_endpoints"], + ) + return False + def create_branch(self, parent_id: str | None = None) -> NeonBranch | None: self.wait() - try: - branch_def = self.neon_api.create_branch(self.id, parent_id=parent_id) - except HTTPError as he: - if ( - he.response.status_code == 422 - and he.response.json()["code"] == "BRANCHES_LIMIT_EXCEEDED" - ): - log.info("Branch limit exceeded, skipping") - return None - else: - raise HTTPError(he) from he + if not self.check_limit_branches(): + return None + branch_def = self.neon_api.create_branch(self.id, parent_id=parent_id) new_branch = NeonBranch(self, branch_def) self.wait() return new_branch @@ -388,17 +408,9 @@ def do_action(project: NeonProject, action: str) -> bool: log.info("Action: %s", action) if action == "new_branch": log.info("Trying to create a new branch") - if 0 <= project.limits["max_branches"] <= len(project.branches): - log.info( - "Maximum branch limit exceeded (%s of %s)", - len(project.branches), - project.limits["max_branches"], - ) - return False parent = project.branches[ random.choice(list(set(project.branches.keys()) - project.reset_branches)) ] - log.info("Parent: %s", parent) child = parent.create_child_branch() if child is None: return False @@ -413,16 +425,11 @@ def do_action(project: NeonProject, action: str) -> bool: log.info("Leaf branches not found, skipping") return False elif action == "new_ro_endpoint": - if 0 <= project.limits["max_read_only_endpoints"] <= project.read_only_endpoints_total: - log.info( - "Maximum read only endpoint limit exceeded (%s of %s)", - project.read_only_endpoints_total, - project.limits["max_read_only_endpoints"], - ) - return False ep = random.choice( [br for br in project.branches.values() if br.id not in project.reset_branches] ).create_ro_endpoint() + if ep is None: + return False log.info("Created the RO endpoint with id %s branch: %s", ep.id, ep.branch.id) ep.start_benchmark() elif action == "delete_ro_endpoint": From 5ec82105cccf5a8a6c1d28e6f817c703d4b77a26 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 9 Jul 2025 11:35:19 -0400 Subject: [PATCH 26/28] fix(pageserver): ensure remote size gets computed (#12520) ## Problem Follow up of #12400 ## Summary of changes We didn't set remote_size_mb to Some when initialized so it never gets computed :( Also added a new API to force refresh the properties. Signed-off-by: Alex Chi Z --- pageserver/src/feature_resolver.rs | 3 ++- pageserver/src/http/routes.rs | 20 ++++++++++++++++++++ test_runner/fixtures/pageserver/http.py | 7 +++++++ test_runner/regress/test_feature_flag.py | 9 +++++++++ 4 files changed, 38 insertions(+), 1 deletion(-) diff --git a/pageserver/src/feature_resolver.rs b/pageserver/src/feature_resolver.rs index 65cac8eea1..f0178fd9b3 100644 --- a/pageserver/src/feature_resolver.rs +++ b/pageserver/src/feature_resolver.rs @@ -409,11 +409,12 @@ impl TenantFeatureResolver { /// Refresh the cached properties and flags on the critical path. pub fn refresh_properties_and_flags(&self, tenant_shard: &TenantShard) { - let mut remote_size_mb = None; + let mut remote_size_mb = Some(0.0); for timeline in tenant_shard.list_timelines() { let size = timeline.metrics.resident_physical_size_get(); if size == 0 { remote_size_mb = None; + break; } if let Some(ref mut remote_size_mb) = remote_size_mb { *remote_size_mb += size as f64 / 1024.0 / 1024.0; diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 2995a37089..3612686b5d 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -3691,6 +3691,23 @@ async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow Ok(()) } +async fn force_refresh_feature_flag( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let state = get_state(&request); + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + tenant + .feature_resolver + .refresh_properties_and_flags(&tenant); + json_response(StatusCode::OK, ()) +} + async fn tenant_evaluate_feature_flag( request: Request, _cancel: CancellationToken, @@ -4156,6 +4173,9 @@ pub fn make_router( .get("/v1/tenant/:tenant_shard_id/feature_flag/:flag_key", |r| { api_handler(r, tenant_evaluate_feature_flag) }) + .post("/v1/tenant/:tenant_shard_id/force_refresh_feature_flag", |r| { + api_handler(r, force_refresh_feature_flag) + }) .put("/v1/feature_flag/:flag_key", |r| { testing_api_handler("force override feature flag - put", r, force_override_feature_flag_for_testing_put) }) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index d9037f2d08..79cfba8da6 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -1247,3 +1247,10 @@ class PageserverHttpClient(requests.Session, MetricsGetter): ) self.verbose_error(res) return res.json() + + def force_refresh_feature_flag(self, tenant_id: TenantId | TenantShardId): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/force_refresh_feature_flag", + ) + self.verbose_error(res) + return res.json() diff --git a/test_runner/regress/test_feature_flag.py b/test_runner/regress/test_feature_flag.py index 2712d13dcc..c6c192b6f1 100644 --- a/test_runner/regress/test_feature_flag.py +++ b/test_runner/regress/test_feature_flag.py @@ -49,3 +49,12 @@ def test_feature_flag(neon_env_builder: NeonEnvBuilder): env.initial_tenant, "test-feature-flag" )["result"] ) + + env.pageserver.http_client().force_refresh_feature_flag(env.initial_tenant) + + # Check if the properties exist + result = env.pageserver.http_client().evaluate_feature_flag_multivariate( + env.initial_tenant, "test-feature-flag" + ) + assert "tenant_remote_size_mb" in result["properties"] + assert "tenant_id" in result["properties"] From 2f71eda00fbfea3ee3382fef3024b6376676726e Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 9 Jul 2025 18:12:59 +0200 Subject: [PATCH 27/28] pageserver/client_grpc: add separate pools for bulk requests (#12475) ## Problem GetPage bulk requests such as prefetches and vacuum can head-of-line block foreground requests, causing increased latency. Touches #11735. Requires #12469. ## Summary of changes * Use dedicated channel/client/stream pools for bulk GetPage requests. * Use lower concurrency but higher queue depth for bulk pools. * Make pool limits configurable. * Require unbounded client pool for stream pool, to avoid accidental starvation. --- pageserver/client_grpc/src/client.rs | 112 +++++++++++++++----- pageserver/client_grpc/src/pool.rs | 148 +++++++++++++++------------ pageserver/page_api/src/model.rs | 15 ++- 3 files changed, 180 insertions(+), 95 deletions(-) diff --git a/pageserver/client_grpc/src/client.rs b/pageserver/client_grpc/src/client.rs index c21ce2e47d..63852868c3 100644 --- a/pageserver/client_grpc/src/client.rs +++ b/pageserver/client_grpc/src/client.rs @@ -1,4 +1,5 @@ use std::collections::HashMap; +use std::num::NonZero; use std::sync::Arc; use anyhow::anyhow; @@ -15,6 +16,32 @@ use pageserver_page_api as page_api; use utils::id::{TenantId, TimelineId}; use utils::shard::{ShardCount, ShardIndex, ShardNumber}; +/// Max number of concurrent clients per channel (i.e. TCP connection). New channels will be spun up +/// when full. +/// +/// TODO: tune all of these constants, and consider making them configurable. +/// TODO: consider separate limits for unary and streaming clients, so we don't fill up channels +/// with only streams. +const MAX_CLIENTS_PER_CHANNEL: NonZero = NonZero::new(16).unwrap(); + +/// Max number of concurrent unary request clients per shard. +const MAX_UNARY_CLIENTS: NonZero = NonZero::new(64).unwrap(); + +/// Max number of concurrent GetPage streams per shard. The max number of concurrent GetPage +/// requests is given by `MAX_STREAMS * MAX_STREAM_QUEUE_DEPTH`. +const MAX_STREAMS: NonZero = NonZero::new(64).unwrap(); + +/// Max number of pipelined requests per stream. +const MAX_STREAM_QUEUE_DEPTH: NonZero = NonZero::new(2).unwrap(); + +/// Max number of concurrent bulk GetPage streams per shard, used e.g. for prefetches. Because these +/// are more throughput-oriented, we have a smaller limit but higher queue depth. +const MAX_BULK_STREAMS: NonZero = NonZero::new(16).unwrap(); + +/// Max number of pipelined requests per bulk stream. These are more throughput-oriented and thus +/// get a larger queue depth. +const MAX_BULK_STREAM_QUEUE_DEPTH: NonZero = NonZero::new(4).unwrap(); + /// A rich Pageserver gRPC client for a single tenant timeline. This client is more capable than the /// basic `page_api::Client` gRPC client, and supports: /// @@ -87,6 +114,7 @@ impl PageserverClient { /// errors. All responses will have `GetPageStatusCode::Ok`. #[instrument(skip_all, fields( req_id = %req.request_id, + class = %req.request_class, rel = %req.rel, blkno = %req.block_numbers[0], blks = %req.block_numbers.len(), @@ -141,7 +169,11 @@ impl PageserverClient { let resp = self .retry .with(async || { - let stream = self.shards.get(shard_id)?.stream().await; + let stream = self + .shards + .get(shard_id)? + .stream(req.request_class.is_bulk()) + .await; let resp = stream.send(req.clone()).await?; // Convert per-request errors into a tonic::Status. @@ -270,17 +302,22 @@ impl Shards { } } -/// A single shard. +/// A single shard. Uses dedicated resource pools with the following structure: /// -/// TODO: consider separate pools for normal and bulk traffic, with different settings. +/// * Channel pool: unbounded. +/// * Unary client pool: MAX_UNARY_CLIENTS. +/// * Stream client pool: unbounded. +/// * Stream pool: MAX_STREAMS and MAX_STREAM_QUEUE_DEPTH. +/// * Bulk channel pool: unbounded. +/// * Bulk client pool: unbounded. +/// * Bulk stream pool: MAX_BULK_STREAMS and MAX_BULK_STREAM_QUEUE_DEPTH. struct Shard { - /// Dedicated channel pool for this shard. Shared by all clients/streams in this shard. - _channel_pool: Arc, - /// Unary gRPC client pool for this shard. Uses the shared channel pool. + /// Unary gRPC client pool. client_pool: Arc, - /// GetPage stream pool for this shard. Uses a dedicated client pool, but shares the channel - /// pool with unary clients. + /// GetPage stream pool. stream_pool: Arc, + /// GetPage stream pool for bulk requests, e.g. prefetches. + bulk_stream_pool: Arc, } impl Shard { @@ -297,34 +334,53 @@ impl Shard { return Err(anyhow!("invalid shard URL {url}: must use gRPC")); } - // Use a common channel pool for all clients, to multiplex unary and stream requests across - // the same TCP connections. The channel pool is unbounded (but client pools are bounded). - let channel_pool = ChannelPool::new(url)?; + // Common channel pool for unary and stream requests. Bounded by client/stream pools. + let channel_pool = ChannelPool::new(url.clone(), MAX_CLIENTS_PER_CHANNEL)?; - // Dedicated client pool for unary requests. + // Client pool for unary requests. let client_pool = ClientPool::new( channel_pool.clone(), tenant_id, timeline_id, shard_id, auth_token.clone(), + Some(MAX_UNARY_CLIENTS), ); - // Stream pool with dedicated client pool. If this shared a client pool with unary requests, - // long-lived streams could fill up the client pool and starve out unary requests. It shares - // the same underlying channel pool with unary clients though, which is unbounded. - let stream_pool = StreamPool::new(ClientPool::new( - channel_pool.clone(), - tenant_id, - timeline_id, - shard_id, - auth_token, - )); + // GetPage stream pool. Uses a dedicated client pool to avoid starving out unary clients, + // but shares a channel pool with it (as it's unbounded). + let stream_pool = StreamPool::new( + ClientPool::new( + channel_pool.clone(), + tenant_id, + timeline_id, + shard_id, + auth_token.clone(), + None, // unbounded, limited by stream pool + ), + Some(MAX_STREAMS), + MAX_STREAM_QUEUE_DEPTH, + ); + + // Bulk GetPage stream pool, e.g. for prefetches. Uses dedicated channel/client/stream pools + // to avoid head-of-line blocking of latency-sensitive requests. + let bulk_stream_pool = StreamPool::new( + ClientPool::new( + ChannelPool::new(url, MAX_CLIENTS_PER_CHANNEL)?, + tenant_id, + timeline_id, + shard_id, + auth_token, + None, // unbounded, limited by stream pool + ), + Some(MAX_BULK_STREAMS), + MAX_BULK_STREAM_QUEUE_DEPTH, + ); Ok(Self { - _channel_pool: channel_pool, client_pool, stream_pool, + bulk_stream_pool, }) } @@ -336,8 +392,12 @@ impl Shard { .map_err(|err| tonic::Status::internal(format!("failed to get client: {err}"))) } - /// Returns a pooled stream for this shard. - async fn stream(&self) -> StreamGuard { - self.stream_pool.get().await + /// Returns a pooled stream for this shard. If `bulk` is `true`, uses the dedicated bulk stream + /// pool (e.g. for prefetches). + async fn stream(&self, bulk: bool) -> StreamGuard { + match bulk { + false => self.stream_pool.get().await, + true => self.bulk_stream_pool.get().await, + } } } diff --git a/pageserver/client_grpc/src/pool.rs b/pageserver/client_grpc/src/pool.rs index 518e4e5b84..5a50004fd1 100644 --- a/pageserver/client_grpc/src/pool.rs +++ b/pageserver/client_grpc/src/pool.rs @@ -30,6 +30,7 @@ //! TODO: observability. use std::collections::{BTreeMap, HashMap}; +use std::num::NonZero; use std::ops::{Deref, DerefMut}; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, Mutex, Weak}; @@ -44,22 +45,10 @@ use pageserver_page_api as page_api; use utils::id::{TenantId, TimelineId}; use utils::shard::ShardIndex; -/// Max number of concurrent clients per channel. -/// -/// TODO: tune these constants, and make them configurable. -/// TODO: consider separate limits for unary and streaming clients, so we don't fill up channels -/// with only streams. -const CLIENTS_PER_CHANNEL: usize = 16; - -/// Maximum number of concurrent clients per `ClientPool`. -const CLIENT_LIMIT: usize = 64; - -/// Max number of pipelined requests per gRPC GetPage stream. -const STREAM_QUEUE_DEPTH: usize = 2; - /// A gRPC channel pool, for a single Pageserver. A channel is shared by many clients (via HTTP/2 -/// stream multiplexing), up to `CLIENTS_PER_CHANNEL`. The pool does not limit the number of -/// channels, and instead relies on `ClientPool` to limit the number of concurrent clients. +/// stream multiplexing), up to `clients_per_channel` -- a new channel will be spun up beyond this. +/// The pool does not limit the number of channels, and instead relies on `ClientPool` or +/// `StreamPool` to limit the number of concurrent clients. /// /// The pool is always wrapped in an outer `Arc`, to allow long-lived guards across tasks/threads. /// @@ -69,6 +58,8 @@ const STREAM_QUEUE_DEPTH: usize = 2; pub struct ChannelPool { /// Pageserver endpoint to connect to. endpoint: Endpoint, + /// Max number of clients per channel. Beyond this, a new channel will be created. + max_clients_per_channel: NonZero, /// Open channels. channels: Mutex>, /// Channel ID generator. @@ -86,13 +77,14 @@ struct ChannelEntry { impl ChannelPool { /// Creates a new channel pool for the given Pageserver endpoint. - pub fn new(endpoint: E) -> anyhow::Result> + pub fn new(endpoint: E, max_clients_per_channel: NonZero) -> anyhow::Result> where E: TryInto + Send + Sync + 'static, >::Error: std::error::Error + Send + Sync, { Ok(Arc::new(Self { endpoint: endpoint.try_into()?, + max_clients_per_channel, channels: Mutex::default(), next_channel_id: AtomicUsize::default(), })) @@ -120,8 +112,11 @@ impl ChannelPool { // with lower-ordered channel IDs first. This will cluster clients in lower-ordered // channels, and free up higher-ordered channels such that they can be reaped. for (&id, entry) in channels.iter_mut() { - assert!(entry.clients <= CLIENTS_PER_CHANNEL, "channel overflow"); - if entry.clients < CLIENTS_PER_CHANNEL { + assert!( + entry.clients <= self.max_clients_per_channel.get(), + "channel overflow" + ); + if entry.clients < self.max_clients_per_channel.get() { entry.clients += 1; return ChannelGuard { pool: Arc::downgrade(self), @@ -181,7 +176,7 @@ impl Drop for ChannelGuard { /// A pool of gRPC clients for a single tenant shard. Each client acquires a channel from the inner /// `ChannelPool`. A client is only given out to single caller at a time. The pool limits the total -/// number of concurrent clients to `CLIENT_LIMIT` via semaphore. +/// number of concurrent clients to `max_clients` via semaphore. /// /// The pool is always wrapped in an outer `Arc`, to allow long-lived guards across tasks/threads. /// @@ -197,8 +192,8 @@ pub struct ClientPool { auth_token: Option, /// Channel pool to acquire channels from. channel_pool: Arc, - /// Limits the max number of concurrent clients for this pool. - limiter: Arc, + /// Limits the max number of concurrent clients for this pool. None if the pool is unbounded. + limiter: Option>, /// Idle pooled clients. Acquired clients are removed from here and returned on drop. /// /// The first client in the map will be acquired next. The map is sorted by client ID, which in @@ -221,13 +216,15 @@ struct ClientEntry { impl ClientPool { /// Creates a new client pool for the given tenant shard. Channels are acquired from the given - /// `ChannelPool`, which must point to a Pageserver that hosts the tenant shard. + /// `ChannelPool`, which must point to a Pageserver that hosts the tenant shard. Allows up to + /// `max_clients` concurrent clients, or unbounded if None. pub fn new( channel_pool: Arc, tenant_id: TenantId, timeline_id: TimelineId, shard_id: ShardIndex, auth_token: Option, + max_clients: Option>, ) -> Arc { Arc::new(Self { tenant_id, @@ -236,25 +233,24 @@ impl ClientPool { auth_token, channel_pool, idle: Mutex::default(), - limiter: Arc::new(Semaphore::new(CLIENT_LIMIT)), + limiter: max_clients.map(|max| Arc::new(Semaphore::new(max.get()))), next_client_id: AtomicUsize::default(), }) } /// Gets a client from the pool, or creates a new one if necessary. Connections are established - /// lazily and do not block, but this call can block if the pool is at `CLIENT_LIMIT`. The - /// client is returned to the pool when the guard is dropped. + /// lazily and do not block, but this call can block if the pool is at `max_clients`. The client + /// is returned to the pool when the guard is dropped. /// /// This is moderately performance-sensitive. It is called for every unary request, but these /// establish a new gRPC stream per request so they're already expensive. GetPage requests use /// the `StreamPool` instead. pub async fn get(self: &Arc) -> anyhow::Result { - let permit = self - .limiter - .clone() - .acquire_owned() - .await - .expect("never closed"); + // Acquire a permit if the pool is bounded. + let mut permit = None; + if let Some(limiter) = self.limiter.clone() { + permit = Some(limiter.acquire_owned().await.expect("never closed")); + } // Fast path: acquire an idle client from the pool. if let Some((id, entry)) = self.idle.lock().unwrap().pop_first() { @@ -296,9 +292,9 @@ impl ClientPool { pub struct ClientGuard { pool: Weak, id: ClientID, - client: Option, // Some until dropped - channel_guard: Option, // Some until dropped - permit: OwnedSemaphorePermit, + client: Option, // Some until dropped + channel_guard: Option, // Some until dropped + permit: Option, // None if pool is unbounded } impl Deref for ClientGuard { @@ -341,16 +337,21 @@ impl Drop for ClientGuard { /// TODO: reap idle streams. /// TODO: consider making this generic over request and response types; not currently needed. pub struct StreamPool { - /// The client pool to acquire clients from. + /// The client pool to acquire clients from. Must be unbounded. client_pool: Arc, /// All pooled streams. /// /// Incoming requests will be sent over an existing stream with available capacity. If all - /// streams are full, a new one is spun up and added to the pool (up to the `ClientPool` limit). - /// Each stream has an associated Tokio task that processes requests and responses. + /// streams are full, a new one is spun up and added to the pool (up to `max_streams`). Each + /// stream has an associated Tokio task that processes requests and responses. streams: Arc>>, - /// Limits the max number of concurrent requests (not streams). - limiter: Arc, + /// The max number of concurrent streams, or None if unbounded. + max_streams: Option>, + /// The max number of concurrent requests per stream. + max_queue_depth: NonZero, + /// Limits the max number of concurrent requests, given by `max_streams * max_queue_depth`. + /// None if the pool is unbounded. + limiter: Option>, /// Stream ID generator. next_stream_id: AtomicUsize, } @@ -369,16 +370,27 @@ struct StreamEntry { } impl StreamPool { - /// Creates a new stream pool, using the given client pool. + /// Creates a new stream pool, using the given client pool. It will send up to `max_queue_depth` + /// concurrent requests on each stream, and use up to `max_streams` concurrent streams. /// - /// NB: the stream pool should use a dedicated client pool. Otherwise, long-lived streams may - /// fill up the client pool and starve out unary requests. Client pools can share the same - /// `ChannelPool` though, since the channel pool is unbounded. - pub fn new(client_pool: Arc) -> Arc { + /// The client pool must be unbounded. The stream pool will enforce its own limits, and because + /// streams are long-lived they can cause persistent starvation if they exhaust the client pool. + /// The stream pool should generally have its own dedicated client pool (but it can share a + /// channel pool with others since these are always unbounded). + pub fn new( + client_pool: Arc, + max_streams: Option>, + max_queue_depth: NonZero, + ) -> Arc { + assert!(client_pool.limiter.is_none(), "bounded client pool"); Arc::new(Self { client_pool, streams: Arc::default(), - limiter: Arc::new(Semaphore::new(CLIENT_LIMIT * STREAM_QUEUE_DEPTH)), + limiter: max_streams.map(|max_streams| { + Arc::new(Semaphore::new(max_streams.get() * max_queue_depth.get())) + }), + max_streams, + max_queue_depth, next_stream_id: AtomicUsize::default(), }) } @@ -402,18 +414,17 @@ impl StreamPool { /// /// For now, we just do something simple and functional, but very inefficient (linear scan). pub async fn get(&self) -> StreamGuard { - let permit = self - .limiter - .clone() - .acquire_owned() - .await - .expect("never closed"); + // Acquire a permit if the pool is bounded. + let mut permit = None; + if let Some(limiter) = self.limiter.clone() { + permit = Some(limiter.acquire_owned().await.expect("never closed")); + } let mut streams = self.streams.lock().unwrap(); // Look for a pooled stream with available capacity. for entry in streams.values() { assert!( - entry.queue_depth.load(Ordering::Relaxed) <= STREAM_QUEUE_DEPTH, + entry.queue_depth.load(Ordering::Relaxed) <= self.max_queue_depth.get(), "stream queue overflow" ); if entry @@ -421,7 +432,7 @@ impl StreamPool { .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |queue_depth| { // Increment the queue depth via compare-and-swap. // TODO: review ordering. - (queue_depth < STREAM_QUEUE_DEPTH).then_some(queue_depth + 1) + (queue_depth < self.max_queue_depth.get()).then_some(queue_depth + 1) }) .is_ok() { @@ -438,18 +449,16 @@ impl StreamPool { // join onto this stream and also create additional streams concurrently if this fills up. let id = self.next_stream_id.fetch_add(1, Ordering::Relaxed); let queue_depth = Arc::new(AtomicUsize::new(1)); // reserve quota for this caller - let (req_tx, req_rx) = mpsc::channel(STREAM_QUEUE_DEPTH); + let (req_tx, req_rx) = mpsc::channel(self.max_queue_depth.get()); let entry = StreamEntry { sender: req_tx.clone(), queue_depth: queue_depth.clone(), }; streams.insert(id, entry); - // NB: make sure we don't overshoot the client limit. The semaphore limit is CLIENT_LIMIT * - // STREAM_QUEUE_DEPTH, but if we were to misaccount queue depth we'd try to spin up more - // streams than CLIENT_LIMIT and block on the client pool ~forever. This should not happen - // because we only acquire queue depth under lock and after acquiring a semaphore permit. - assert!(streams.len() <= CLIENT_LIMIT, "stream overflow"); + if let Some(max_streams) = self.max_streams { + assert!(streams.len() <= max_streams.get(), "stream overflow"); + }; let client_pool = self.client_pool.clone(); let streams = self.streams.clone(); @@ -484,19 +493,22 @@ impl StreamPool { // Acquire a client from the pool and create a stream. let mut client = client_pool.get().await?; - let (req_tx, req_rx) = mpsc::channel(STREAM_QUEUE_DEPTH); - let req_stream = tokio_stream::wrappers::ReceiverStream::new(req_rx); + // NB: use an unbounded channel such that the stream send never blocks. Otherwise, we could + // theoretically deadlock if both the client and server block on sends (since we're not + // reading responses while sending). This is unlikely to happen due to gRPC/TCP buffers and + // low queue depths, but it was seen to happen with the libpq protocol so better safe than + // sorry. It should never buffer more than the queue depth anyway, but using an unbounded + // channel guarantees that it will never block. + let (req_tx, req_rx) = mpsc::unbounded_channel(); + let req_stream = tokio_stream::wrappers::UnboundedReceiverStream::new(req_rx); let mut resp_stream = client.get_pages(req_stream).await?; // Track caller response channels by request ID. If the task returns early, these response // channels will be dropped and the waiting callers will receive an error. - let mut callers = HashMap::with_capacity(STREAM_QUEUE_DEPTH); + let mut callers = HashMap::new(); // Process requests and responses. loop { - // NB: this can trip if the server doesn't respond to a request, so only debug_assert. - debug_assert!(callers.len() <= STREAM_QUEUE_DEPTH, "stream queue overflow"); - tokio::select! { // Receive requests from callers and send them to the stream. req = caller_rx.recv() => { @@ -515,8 +527,8 @@ impl StreamPool { } callers.insert(req.request_id, resp_tx); - // Send the request on the stream. Bail out if the send fails. - req_tx.send(req).await.map_err(|_| { + // Send the request on the stream. Bail out if the stream is closed. + req_tx.send(req).map_err(|_| { tonic::Status::unavailable("stream closed") })?; } @@ -545,7 +557,7 @@ impl StreamPool { pub struct StreamGuard { sender: RequestSender, queue_depth: Arc, - permit: OwnedSemaphorePermit, + permit: Option, // None if pool is unbounded } impl StreamGuard { diff --git a/pageserver/page_api/src/model.rs b/pageserver/page_api/src/model.rs index c5b6f06879..d0d3517d41 100644 --- a/pageserver/page_api/src/model.rs +++ b/pageserver/page_api/src/model.rs @@ -384,7 +384,7 @@ impl From for proto::GetPageRequest { pub type RequestID = u64; /// A GetPage request class. -#[derive(Clone, Copy, Debug)] +#[derive(Clone, Copy, Debug, strum_macros::Display)] pub enum GetPageClass { /// Unknown class. For backwards compatibility: used when an older client version sends a class /// that a newer server version has removed. @@ -397,6 +397,19 @@ pub enum GetPageClass { Background, } +impl GetPageClass { + /// Returns true if this is considered a bulk request (i.e. more throughput-oriented rather than + /// latency-sensitive). + pub fn is_bulk(&self) -> bool { + match self { + Self::Unknown => false, + Self::Normal => false, + Self::Prefetch => true, + Self::Background => true, + } + } +} + impl From for GetPageClass { fn from(pb: proto::GetPageClass) -> Self { match pb { From 12c26243fc5e8a4522a8f848923f6e3eb7e0ad7c Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 9 Jul 2025 11:47:21 -0500 Subject: [PATCH 28/28] Fix typo in migration testing related to pg_monitor (#12530) We should be joining on the neon_superuser roleid, not the pg_monitor roleid. Signed-off-by: Tristan Partin --- .../tests/0004-grant_pg_monitor_to_neon_superuser.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql index acb8dd417d..deb7a364af 100644 --- a/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql +++ b/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql @@ -6,7 +6,7 @@ BEGIN admin_option AS admin INTO monitor FROM pg_auth_members - WHERE roleid = 'pg_monitor'::regrole + WHERE roleid = 'neon_superuser'::regrole AND member = 'pg_monitor'::regrole; IF NOT monitor.member THEN