From 68a175d5455e0dbd01a014a89c5f289ab9777f0d Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 23 Jun 2025 17:33:45 +0200 Subject: [PATCH 01/50] test_runner: fix `test_basebackup_with_high_slru_count` gzip param (#12319) The `--gzip-probability` parameter was removed in #12250. However, `test_basebackup_with_high_slru_count` still uses it, and keeps failing. This patch removes the use of the parameter (gzip is enabled by default). --- .../pageserver/pagebench/test_large_slru_basebackup.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py index 8af52dcbd0..25dfd5277c 100644 --- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py +++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py @@ -146,8 +146,6 @@ def run_benchmark(env: NeonEnv, pg_bin: PgBin, record, duration_secs: int): ps_http.base_url, "--page-service-connstring", env.pageserver.connstr(password=None), - "--gzip-probability", - "1", "--runtime", f"{duration_secs}s", # don't specify the targets explicitly, let pagebench auto-discover them From 6c3aba7c44e070a25064b113651b934cb7460e67 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 23 Jun 2025 08:50:31 -0700 Subject: [PATCH 02/50] storcon: adjust AZ selection for heterogenous AZs (#12296) ## Problem The scheduler uses total shards per AZ to select the AZ for newly created or attached tenants. This makes bad decisions when we have different node counts per AZ -- we might have 2 very busy pageservers in one AZ, and 4 more lightly loaded pageservers in other AZs, and the scheduler picks the busy pageservers because the total shard count in their AZ is lower. 
## Summary of changes - Divide the shard count by the number of nodes in the AZ when scoring in `get_az_for_new_tenant` --------- Co-authored-by: John Spray --- storage_controller/src/scheduler.rs | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 3fa25443da..b3656c33d4 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -825,6 +825,7 @@ impl Scheduler { struct AzScore { home_shard_count: usize, scheduleable: bool, + node_count: usize, } let mut azs: HashMap<&AvailabilityZone, AzScore> = HashMap::new(); @@ -832,6 +833,7 @@ impl Scheduler { let az = azs.entry(&node.az).or_default(); az.home_shard_count += node.home_shard_count; az.scheduleable |= matches!(node.may_schedule, MaySchedule::Yes(_)); + az.node_count += 1; } // If any AZs are schedulable, then filter out the non-schedulable ones (i.e. AZs where @@ -840,10 +842,20 @@ impl Scheduler { azs.retain(|_, i| i.scheduleable); } + // We will multiply up shard counts by the max node count for scoring, before dividing + // by per-node max node count, to get a normalized score that doesn't collapse to zero + // when the absolute shard count is less than the node count. + let max_node_count = azs.values().map(|i| i.node_count).max().unwrap_or(0); + // Find the AZ with the lowest number of shards currently allocated Some( azs.into_iter() - .min_by_key(|i| (i.1.home_shard_count, i.0)) + .min_by_key(|i| { + ( + (i.1.home_shard_count * max_node_count) / i.1.node_count, + i.0, + ) + }) .unwrap() .0 .clone(), From 85164422d0c70ff6cb9ff3219db035b426d0d78b Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." 
<4198311+skyzh@users.noreply.github.com> Date: Mon, 23 Jun 2025 13:31:53 -0400 Subject: [PATCH 03/50] feat(pageserver): support force overriding feature flags (#12233) ## Problem Part of #11813 ## Summary of changes Add a test API to make it easier to manipulate the feature flags within tests. --------- Signed-off-by: Alex Chi Z --- pageserver/src/bin/pageserver.rs | 3 +- pageserver/src/feature_resolver.rs | 35 +++++++++++++++- pageserver/src/http/routes.rs | 49 ++++++++++++++++++++--- test_runner/fixtures/pageserver/http.py | 28 +++++++++++++ test_runner/regress/test_feature_flag.py | 51 ++++++++++++++++++++++++ 5 files changed, 159 insertions(+), 7 deletions(-) create mode 100644 test_runner/regress/test_feature_flag.py diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 417503089a..d137d651eb 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -583,7 +583,7 @@ fn start_pageserver( deletion_queue_client, l0_flush_global_state, basebackup_prepare_sender, - feature_resolver, + feature_resolver: feature_resolver.clone(), }, shutdown_pageserver.clone(), ); @@ -715,6 +715,7 @@ fn start_pageserver( disk_usage_eviction_state, deletion_queue.new_client(), secondary_controller, + feature_resolver, ) .context("Failed to initialize router state")?, ); diff --git a/pageserver/src/feature_resolver.rs b/pageserver/src/feature_resolver.rs index b4e6f78bf2..b0a68dfc4d 100644 --- a/pageserver/src/feature_resolver.rs +++ b/pageserver/src/feature_resolver.rs @@ -1,5 +1,6 @@ use std::{collections::HashMap, sync::Arc, time::Duration}; +use arc_swap::ArcSwap; use pageserver_api::config::NodeMetadata; use posthog_client_lite::{ CaptureEvent, FeatureResolverBackgroundLoop, PostHogClientConfig, PostHogEvaluationError, @@ -18,6 +19,7 @@ const DEFAULT_POSTHOG_REFRESH_INTERVAL: Duration = Duration::from_secs(600); pub struct FeatureResolver { inner: Option>, internal_properties: Option>>, + force_overrides_for_testing: 
Arc>>, } impl FeatureResolver { @@ -25,6 +27,7 @@ impl FeatureResolver { Self { inner: None, internal_properties: None, + force_overrides_for_testing: Arc::new(ArcSwap::new(Arc::new(HashMap::new()))), } } @@ -151,11 +154,13 @@ impl FeatureResolver { Ok(FeatureResolver { inner: Some(inner), internal_properties: Some(internal_properties), + force_overrides_for_testing: Arc::new(ArcSwap::new(Arc::new(HashMap::new()))), }) } else { Ok(FeatureResolver { inner: None, internal_properties: None, + force_overrides_for_testing: Arc::new(ArcSwap::new(Arc::new(HashMap::new()))), }) } } @@ -195,6 +200,11 @@ impl FeatureResolver { flag_key: &str, tenant_id: TenantId, ) -> Result { + let force_overrides = self.force_overrides_for_testing.load(); + if let Some(value) = force_overrides.get(flag_key) { + return Ok(value.clone()); + } + if let Some(inner) = &self.inner { let res = inner.feature_store().evaluate_multivariate( flag_key, @@ -233,6 +243,15 @@ impl FeatureResolver { flag_key: &str, tenant_id: TenantId, ) -> Result<(), PostHogEvaluationError> { + let force_overrides = self.force_overrides_for_testing.load(); + if let Some(value) = force_overrides.get(flag_key) { + return if value == "true" { + Ok(()) + } else { + Err(PostHogEvaluationError::NoConditionGroupMatched) + }; + } + if let Some(inner) = &self.inner { let res = inner.feature_store().evaluate_boolean( flag_key, @@ -264,8 +283,22 @@ impl FeatureResolver { inner.feature_store().is_feature_flag_boolean(flag_key) } else { Err(PostHogEvaluationError::NotAvailable( - "PostHog integration is not enabled".to_string(), + "PostHog integration is not enabled, cannot auto-determine the flag type" + .to_string(), )) } } + + /// Force override a feature flag for testing. This is only for testing purposes. Assume the caller only call it + /// from a single thread so it won't race. 
+ pub fn force_override_for_testing(&self, flag_key: &str, value: Option<&str>) { + let mut force_overrides = self.force_overrides_for_testing.load().as_ref().clone(); + if let Some(value) = value { + force_overrides.insert(flag_key.to_string(), value.to_string()); + } else { + force_overrides.remove(flag_key); + } + self.force_overrides_for_testing + .store(Arc::new(force_overrides)); + } } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 626986f580..2e9cd3ad70 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -59,6 +59,7 @@ use crate::config::PageServerConf; use crate::context; use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder}; use crate::deletion_queue::DeletionQueueClient; +use crate::feature_resolver::FeatureResolver; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::LocationConf; @@ -107,6 +108,7 @@ pub struct State { deletion_queue_client: DeletionQueueClient, secondary_controller: SecondaryController, latest_utilization: tokio::sync::Mutex>, + feature_resolver: FeatureResolver, } impl State { @@ -120,6 +122,7 @@ impl State { disk_usage_eviction_state: Arc, deletion_queue_client: DeletionQueueClient, secondary_controller: SecondaryController, + feature_resolver: FeatureResolver, ) -> anyhow::Result { let allowlist_routes = &[ "/v1/status", @@ -140,6 +143,7 @@ impl State { deletion_queue_client, secondary_controller, latest_utilization: Default::default(), + feature_resolver, }) } } @@ -3675,8 +3679,8 @@ async fn tenant_evaluate_feature_flag( let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; - let flag: String = must_parse_query_param(&request, "flag")?; - let as_type: String = must_parse_query_param(&request, "as")?; + let flag: String = parse_request_param(&request, "flag_key")?; + let as_type: Option = 
parse_query_param(&request, "as")?; let state = get_state(&request); @@ -3685,11 +3689,11 @@ async fn tenant_evaluate_feature_flag( .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; let properties = tenant.feature_resolver.collect_properties(tenant_shard_id.tenant_id); - if as_type == "boolean" { + if as_type.as_deref() == Some("boolean") { let result = tenant.feature_resolver.evaluate_boolean(&flag, tenant_shard_id.tenant_id); let result = result.map(|_| true).map_err(|e| e.to_string()); json_response(StatusCode::OK, json!({ "result": result, "properties": properties })) - } else if as_type == "multivariate" { + } else if as_type.as_deref() == Some("multivariate") { let result = tenant.feature_resolver.evaluate_multivariate(&flag, tenant_shard_id.tenant_id).map_err(|e| e.to_string()); json_response(StatusCode::OK, json!({ "result": result, "properties": properties })) } else { @@ -3709,6 +3713,35 @@ async fn tenant_evaluate_feature_flag( .await } +async fn force_override_feature_flag_for_testing_put( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + check_permission(&request, None)?; + + let flag: String = parse_request_param(&request, "flag_key")?; + let value: String = must_parse_query_param(&request, "value")?; + let state = get_state(&request); + state + .feature_resolver + .force_override_for_testing(&flag, Some(&value)); + json_response(StatusCode::OK, ()) +} + +async fn force_override_feature_flag_for_testing_delete( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + check_permission(&request, None)?; + + let flag: String = parse_request_param(&request, "flag_key")?; + let state = get_state(&request); + state + .feature_resolver + .force_override_for_testing(&flag, None); + json_response(StatusCode::OK, ()) +} + /// Common functionality of all the HTTP API handlers. 
/// /// - Adds a tracing span to each request (by `request_span`) @@ -4085,8 +4118,14 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/activate_post_import", |r| api_handler(r, activate_post_import_handler), ) - .get("/v1/tenant/:tenant_shard_id/feature_flag", |r| { + .get("/v1/tenant/:tenant_shard_id/feature_flag/:flag_key", |r| { api_handler(r, tenant_evaluate_feature_flag) }) + .put("/v1/feature_flag/:flag_key", |r| { + testing_api_handler("force override feature flag - put", r, force_override_feature_flag_for_testing_put) + }) + .delete("/v1/feature_flag/:flag_key", |r| { + testing_api_handler("force override feature flag - delete", r, force_override_feature_flag_for_testing_delete) + }) .any(handler_404)) } diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index c29192c25c..d9037f2d08 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -1219,3 +1219,31 @@ class PageserverHttpClient(requests.Session, MetricsGetter): ) self.verbose_error(res) return res.json() + + def force_override_feature_flag(self, flag: str, value: str | None = None): + if value is None: + res = self.delete( + f"http://localhost:{self.port}/v1/feature_flag/{flag}", + ) + else: + res = self.put( + f"http://localhost:{self.port}/v1/feature_flag/{flag}", + params={"value": value}, + ) + self.verbose_error(res) + + def evaluate_feature_flag_boolean(self, tenant_id: TenantId, flag: str) -> Any: + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/feature_flag/{flag}", + params={"as": "boolean"}, + ) + self.verbose_error(res) + return res.json() + + def evaluate_feature_flag_multivariate(self, tenant_id: TenantId, flag: str) -> Any: + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/feature_flag/{flag}", + params={"as": "multivariate"}, + ) + self.verbose_error(res) + return res.json() diff --git a/test_runner/regress/test_feature_flag.py 
b/test_runner/regress/test_feature_flag.py new file mode 100644 index 0000000000..2712d13dcc --- /dev/null +++ b/test_runner/regress/test_feature_flag.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from fixtures.utils import run_only_on_default_postgres + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnvBuilder + + +@run_only_on_default_postgres("Pageserver-only test only needs to run on one version") +def test_feature_flag(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() + env.pageserver.http_client().force_override_feature_flag("test-feature-flag", "true") + assert env.pageserver.http_client().evaluate_feature_flag_boolean( + env.initial_tenant, "test-feature-flag" + )["result"]["Ok"] + assert ( + env.pageserver.http_client().evaluate_feature_flag_multivariate( + env.initial_tenant, "test-feature-flag" + )["result"]["Ok"] + == "true" + ) + + env.pageserver.http_client().force_override_feature_flag("test-feature-flag", "false") + assert ( + env.pageserver.http_client().evaluate_feature_flag_boolean( + env.initial_tenant, "test-feature-flag" + )["result"]["Err"] + == "No condition group is matched" + ) + assert ( + env.pageserver.http_client().evaluate_feature_flag_multivariate( + env.initial_tenant, "test-feature-flag" + )["result"]["Ok"] + == "false" + ) + + env.pageserver.http_client().force_override_feature_flag("test-feature-flag", None) + assert ( + "Err" + in env.pageserver.http_client().evaluate_feature_flag_boolean( + env.initial_tenant, "test-feature-flag" + )["result"] + ) + assert ( + "Err" + in env.pageserver.http_client().evaluate_feature_flag_multivariate( + env.initial_tenant, "test-feature-flag" + )["result"] + ) From 5eecde461d407e0fb63e911d5569d1cd1ffc0a9b Mon Sep 17 00:00:00 2001 From: Aleksandr Sarantsev <99037063+ephemeralsad@users.noreply.github.com> Date: Mon, 23 Jun 2025 22:55:26 +0400 Subject: [PATCH 04/50] storcon: Fix migration for Attached(0) tenants 
(#12256) ## Problem `Attached(0)` tenant migrations can get stuck if the heatmap file has not been uploaded. ## Summary of Changes - Added a test to reproduce the issue. - Introduced a `kick_secondary_downloads` config flag: - Enabled in testing environments. - Disabled in production (and in the new test). - Updated `Attached(0)` locations to consider the number of secondaries in their intent when deciding whether to download the heatmap. --- control_plane/src/local_env.rs | 3 ++ control_plane/src/storage_controller.rs | 4 ++ storage_controller/src/main.rs | 12 +++++ storage_controller/src/reconciler.rs | 7 +-- storage_controller/src/service.rs | 21 ++++++++- storage_controller/src/tenant_shard.rs | 9 +++- test_runner/fixtures/neon_fixtures.py | 11 +++++ .../regress/test_storage_controller.py | 47 +++++++++++++++++++ 8 files changed, 108 insertions(+), 6 deletions(-) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 387fc297f0..e8abde4901 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -211,6 +211,8 @@ pub struct NeonStorageControllerConf { pub use_local_compute_notifications: bool, pub timeline_safekeeper_count: Option, + + pub kick_secondary_downloads: Option, } impl NeonStorageControllerConf { @@ -242,6 +244,7 @@ impl Default for NeonStorageControllerConf { use_https_safekeeper_api: false, use_local_compute_notifications: true, timeline_safekeeper_count: None, + kick_secondary_downloads: None, } } } diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 95f7533057..334949924c 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -557,6 +557,10 @@ impl StorageController { args.push("--use-local-compute-notifications".to_string()); } + if let Some(value) = self.config.kick_secondary_downloads { + args.push(format!("--kick-secondary-downloads={value}")); + } + if let Some(ssl_ca_file) = 
self.env.ssl_ca_cert_path() { args.push(format!("--ssl-ca-file={}", ssl_ca_file.to_str().unwrap())); } diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index fc0ba9f28c..ff134a4ebc 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -5,6 +5,9 @@ use std::time::Duration; use anyhow::{Context, anyhow}; use camino::Utf8PathBuf; + +#[cfg(feature = "testing")] +use clap::ArgAction; use clap::Parser; use futures::future::OptionFuture; use http_utils::tls_certs::ReloadingCertificateResolver; @@ -213,6 +216,13 @@ struct Cli { /// This option exists primarily for testing purposes. #[arg(long, default_value = "3", value_parser = clap::value_parser!(i64).range(1..))] timeline_safekeeper_count: i64, + + /// When set, actively checks and initiates heatmap downloads/uploads during reconciliation. + /// This speed up migrations by avoiding the default wait for the heatmap download interval. + /// Primarily useful for testing to reduce test execution time. 
+ #[cfg(feature = "testing")] + #[arg(long, default_value = "true", action=ArgAction::Set)] + kick_secondary_downloads: bool, } enum StrictMode { @@ -445,6 +455,8 @@ async fn async_main() -> anyhow::Result<()> { timelines_onto_safekeepers: args.timelines_onto_safekeepers, use_local_compute_notifications: args.use_local_compute_notifications, timeline_safekeeper_count: args.timeline_safekeeper_count, + #[cfg(feature = "testing")] + kick_secondary_downloads: args.kick_secondary_downloads, }; // Validate that we can connect to the database diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index b03a6dae04..92844c9c7b 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -856,6 +856,7 @@ impl Reconciler { &self.shard, &self.config, &self.placement_policy, + self.intent.secondary.len(), ); match self.observed.locations.get(&node.get_id()) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => { @@ -1235,11 +1236,11 @@ pub(crate) fn attached_location_conf( shard: &ShardIdentity, config: &TenantConfig, policy: &PlacementPolicy, + secondary_count: usize, ) -> LocationConfig { let has_secondaries = match policy { - PlacementPolicy::Attached(0) | PlacementPolicy::Detached | PlacementPolicy::Secondary => { - false - } + PlacementPolicy::Detached | PlacementPolicy::Secondary => false, + PlacementPolicy::Attached(0) => secondary_count > 0, PlacementPolicy::Attached(_) => true, }; diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 6ec3963c48..0eb87ffbe3 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -470,6 +470,9 @@ pub struct Config { /// Number of safekeepers to choose for a timeline when creating it. /// Safekeepers will be choosen from different availability zones. 
pub timeline_safekeeper_count: i64, + + #[cfg(feature = "testing")] + pub kick_secondary_downloads: bool, } impl From for ApiError { @@ -2064,6 +2067,7 @@ impl Service { &tenant_shard.shard, &tenant_shard.config, &PlacementPolicy::Attached(0), + tenant_shard.intent.get_secondary().len(), )), }, )]); @@ -5605,7 +5609,15 @@ impl Service { for parent_id in parent_ids { let child_ids = parent_id.split(new_shard_count); - let (pageserver, generation, policy, parent_ident, config, preferred_az) = { + let ( + pageserver, + generation, + policy, + parent_ident, + config, + preferred_az, + secondary_count, + ) = { let mut old_state = tenants .remove(&parent_id) .expect("It was present, we just split it"); @@ -5625,6 +5637,7 @@ impl Service { old_state.shard, old_state.config.clone(), old_state.preferred_az().cloned(), + old_state.intent.get_secondary().len(), ) }; @@ -5646,6 +5659,7 @@ impl Service { &child_shard, &config, &policy, + secondary_count, )), }, ); @@ -8373,6 +8387,11 @@ impl Service { /// we have this helper to move things along faster. 
#[cfg(feature = "testing")] async fn kick_secondary_download(&self, tenant_shard_id: TenantShardId) { + if !self.config.kick_secondary_downloads { + // No-op if kick_secondary_downloads functionaliuty is not configured + return; + } + let (attached_node, secondaries) = { let locked = self.inner.read().unwrap(); let Some(shard) = locked.tenants.get(&tenant_shard_id) else { diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index acd18734cf..789327bfaf 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -1381,8 +1381,13 @@ impl TenantShard { .generation .expect("Attempted to enter attached state without a generation"); - let wanted_conf = - attached_location_conf(generation, &self.shard, &self.config, &self.policy); + let wanted_conf = attached_location_conf( + generation, + &self.shard, + &self.config, + &self.policy, + self.intent.get_secondary().len(), + ); match self.observed.locations.get(&node_id) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {} Some(_) | None => { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 050d61055e..4eb85119ca 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -453,6 +453,7 @@ class NeonEnvBuilder: pageserver_get_vectored_concurrent_io: str | None = None, pageserver_tracing_config: PageserverTracingConfig | None = None, pageserver_import_config: PageserverImportConfig | None = None, + storcon_kick_secondary_downloads: bool | None = None, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -514,6 +515,8 @@ class NeonEnvBuilder: self.pageserver_tracing_config = pageserver_tracing_config self.pageserver_import_config = pageserver_import_config + self.storcon_kick_secondary_downloads = storcon_kick_secondary_downloads + self.pageserver_default_tenant_config_compaction_algorithm: dict[str, Any] | None = ( 
pageserver_default_tenant_config_compaction_algorithm ) @@ -1221,6 +1224,14 @@ class NeonEnv: else: cfg["storage_controller"] = {"use_local_compute_notifications": False} + if config.storcon_kick_secondary_downloads is not None: + # Configure whether storage controller should actively kick off secondary downloads + if "storage_controller" not in cfg: + cfg["storage_controller"] = {} + cfg["storage_controller"]["kick_secondary_downloads"] = ( + config.storcon_kick_secondary_downloads + ) + # Create config for pageserver http_auth_type = "NeonJWT" if config.auth_enabled else "Trust" pg_auth_type = "NeonJWT" if config.auth_enabled else "Trust" diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 74ba74645e..be7f0c8a3e 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -4434,6 +4434,53 @@ def test_storage_controller_graceful_migration(neon_env_builder: NeonEnvBuilder, assert initial_ps.http_client().tenant_list_locations()["tenant_shards"] == [] +def test_attached_0_graceful_migration(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_pageservers = 4 + neon_env_builder.num_azs = 2 + + neon_env_builder.storcon_kick_secondary_downloads = False + + env = neon_env_builder.init_start() + + # It is default, but we want to ensure that there are no secondary locations requested + env.storage_controller.tenant_policy_update(env.initial_tenant, {"placement": {"Attached": 0}}) + env.storage_controller.reconcile_until_idle() + + desc = env.storage_controller.tenant_describe(env.initial_tenant)["shards"][0] + src_ps_id = desc["node_attached"] + src_ps = env.get_pageserver(src_ps_id) + src_az = desc["preferred_az_id"] + + # There must be no secondary locations with Attached(0) placement policy + assert len(desc["node_secondary"]) == 0 + + # Migrate tenant shard to the same AZ node + dst_ps = [ps for ps in env.pageservers if ps.id != src_ps_id and 
ps.az_id == src_az][0] + + env.storage_controller.tenant_shard_migrate( + TenantShardId(env.initial_tenant, 0, 0), + dst_ps.id, + config=StorageControllerMigrationConfig(prewarm=True), + ) + + def tenant_shard_migrated(): + src_locations = src_ps.http_client().tenant_list_locations()["tenant_shards"] + assert len(src_locations) == 0 + log.info(f"Tenant shard migrated from {src_ps.id}") + dst_locations = dst_ps.http_client().tenant_list_locations()["tenant_shards"] + assert len(dst_locations) == 1 + assert dst_locations[0][1]["mode"] == "AttachedSingle" + log.info(f"Tenant shard migrated to {dst_ps.id}") + + # After all we expect that tenant shard exists only on dst node. + # We wait so long because [`DEFAULT_HEATMAP_PERIOD`] and [`DEFAULT_DOWNLOAD_INTERVAL`] + # are set to 60 seconds by default. + # + # TODO: we should consider making these configurable, so the test can run faster. + wait_until(tenant_shard_migrated, timeout=180, interval=5, status_interval=10) + log.info("Tenant shard migrated successfully") + + @run_only_on_default_postgres("this is like a 'unit test' against storcon db") def test_storage_controller_migrate_with_pageserver_restart( neon_env_builder: NeonEnvBuilder, make_httpserver From 0efff1db26d14000278e409586f580493e31289d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 24 Jun 2025 00:26:38 +0200 Subject: [PATCH 05/50] Allow cancellation errors in tests that allow timeline deletion errors (#12315) After merging of PR https://github.com/neondatabase/neon/pull/11712 we saw some tests be flaky, with errors showing up about the timeline having been cancelled instead of having been deleted. This is an outcome that is inherently racy with the "has been deleted" error. In some instances, https://github.com/neondatabase/neon/pull/11712 has already added the error about the timeline having been cancelled. This PR adds them to the remaining instances of https://github.com/neondatabase/neon/pull/11712, fixing the flakiness. 
--- test_runner/regress/test_storage_controller.py | 4 +++- test_runner/regress/test_timeline_detach_ancestor.py | 8 ++++++-- test_runner/regress/test_timeline_gc_blocking.py | 4 +++- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index be7f0c8a3e..70772766d7 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -3642,7 +3642,9 @@ def test_timeline_delete_mid_live_migration(neon_env_builder: NeonEnvBuilder, mi env.start() for ps in env.pageservers: - ps.allowed_errors.append(".*Timeline.* has been deleted.*") + ps.allowed_errors.extend( + [".*Timeline.* has been deleted.*", ".*Timeline.*was cancelled and cannot be used"] + ) tenant_id = TenantId.generate() timeline_id = TimelineId.generate() diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index c58f78aeb1..b5cc431afe 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -1099,7 +1099,9 @@ def test_timeline_detach_ancestor_interrupted_by_deletion( for ps in env.pageservers: ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) - ps.allowed_errors.append(".*Timeline.* has been deleted.*") + ps.allowed_errors.extend( + [".*Timeline.* has been deleted.*", ".*Timeline.*was cancelled and cannot be used"] + ) pageservers = dict((int(p.id), p) for p in env.pageservers) @@ -1221,7 +1223,9 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv for ps in env.pageservers: ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) - ps.allowed_errors.append(".*Timeline.* has been deleted.*") + ps.allowed_errors.extend( + [".*Timeline.* has been deleted.*", ".*Timeline.*was cancelled and cannot be used"] + ) pageservers = dict((int(p.id), p) for p in env.pageservers) diff --git 
a/test_runner/regress/test_timeline_gc_blocking.py b/test_runner/regress/test_timeline_gc_blocking.py index 8ef64a0742..daba8019b6 100644 --- a/test_runner/regress/test_timeline_gc_blocking.py +++ b/test_runner/regress/test_timeline_gc_blocking.py @@ -25,7 +25,9 @@ def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder, sharded: bool initial_tenant_shard_count=2 if sharded else None, ) for ps in env.pageservers: - ps.allowed_errors.append(".*Timeline.* has been deleted.*") + ps.allowed_errors.extend( + [".*Timeline.* has been deleted.*", ".*Timeline.*was cancelled and cannot be used"] + ) if sharded: http = env.storage_controller.pageserver_api() From a29772bf6eb4b43ad6d7a568a1356d4357e82bd8 Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Tue, 24 Jun 2025 11:54:43 +0200 Subject: [PATCH 06/50] Create proxy-bench periodic run in CI (#12242) Currently run for test only via pushing to the test-proxy-bench branch. Relates to the #22681 --- .github/workflows/proxy-benchmark.yml | 83 ++++++++++++ scripts/ingest_perf_test_result.py | 2 +- scripts/proxy_bench_results_ingest.py | 187 ++++++++++++++++++++++++++ 3 files changed, 271 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/proxy-benchmark.yml create mode 100644 scripts/proxy_bench_results_ingest.py diff --git a/.github/workflows/proxy-benchmark.yml b/.github/workflows/proxy-benchmark.yml new file mode 100644 index 0000000000..75ecacaced --- /dev/null +++ b/.github/workflows/proxy-benchmark.yml @@ -0,0 +1,83 @@ +name: Periodic proxy performance test on unit-perf hetzner runner + +on: + push: # TODO: remove after testing + branches: + - test-proxy-bench # Runs on pushes to branches starting with test-proxy-bench + # schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day 
of the week (0 - 6 or SUN-SAT) + # - cron: '0 5 * * *' # Runs at 5 UTC once a day + workflow_dispatch: # adds an ability to run this manually + +defaults: + run: + shell: bash -euo pipefail {0} + +concurrency: + group: ${{ github.workflow }} + cancel-in-progress: false + +permissions: + contents: read + +jobs: + run_periodic_proxybench_test: + permissions: + id-token: write # aws-actions/configure-aws-credentials + statuses: write + contents: write + pull-requests: write + runs-on: [self-hosted, unit-perf] + timeout-minutes: 60 # 1h timeout + container: + image: ghcr.io/neondatabase/build-tools:pinned-bookworm + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + options: --init + steps: + - name: Checkout proxy-bench Repo + uses: actions/checkout@v4 + with: + repository: neondatabase/proxy-bench + path: proxy-bench + + - name: Set up the environment which depends on $RUNNER_TEMP on nvme drive + id: set-env + shell: bash -euxo pipefail {0} + run: | + PROXY_BENCH_PATH=$(realpath ./proxy-bench) + { + echo "PROXY_BENCH_PATH=$PROXY_BENCH_PATH" + echo "NEON_DIR=${RUNNER_TEMP}/neon" + echo "TEST_OUTPUT=${PROXY_BENCH_PATH}/test_output" + echo "" + } >> "$GITHUB_ENV" + + - name: Run proxy-bench + run: ./${PROXY_BENCH_PATH}/run.sh + + - name: Ingest Bench Results # neon repo script + if: success() + run: | + mkdir -p $TEST_OUTPUT + python $NEON_DIR/scripts/proxy_bench_results_ingest.py --out $TEST_OUTPUT + + - name: Push Metrics to Proxy perf database + if: success() + env: + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PROXY_TEST_RESULT_CONNSTR }}" + REPORT_FROM: $TEST_OUTPUT + run: $NEON_DIR/scripts/generate_and_push_perf_report.sh + + - name: Docker cleanup + run: docker compose down + + - name: Notify Failure + if: failure() + run: echo "Proxy bench job failed" && exit 1 \ No newline at end of file diff --git a/scripts/ingest_perf_test_result.py b/scripts/ingest_perf_test_result.py index 804f8a3cde..898e1ee954 100644 --- 
a/scripts/ingest_perf_test_result.py +++ b/scripts/ingest_perf_test_result.py @@ -26,7 +26,7 @@ CREATE TABLE IF NOT EXISTS perf_test_results ( metric_unit VARCHAR(10), metric_report_type TEXT, recorded_at_timestamp TIMESTAMP WITH TIME ZONE DEFAULT NOW(), - labels JSONB with default '{}' + labels JSONB DEFAULT '{}'::jsonb ) """ diff --git a/scripts/proxy_bench_results_ingest.py b/scripts/proxy_bench_results_ingest.py new file mode 100644 index 0000000000..475d053ed2 --- /dev/null +++ b/scripts/proxy_bench_results_ingest.py @@ -0,0 +1,187 @@ +#!/usr/bin/env python3 + +import argparse +import json +import time +from typing import Any, TypedDict, cast + +import requests + +PROMETHEUS_URL = "http://localhost:9090" +SAMPLE_INTERVAL = 1 # seconds + +DEFAULT_REVISION = "unknown" +DEFAULT_PLATFORM = "unknown" +DEFAULT_SUIT = "proxy_bench" + + +class MetricConfig(TypedDict, total=False): + name: str + promql: str + unit: str + report: str + labels: dict[str, str] + is_vector: bool + label_field: str + + +METRICS: list[MetricConfig] = [ + { + "name": "latency_p99", + "promql": 'histogram_quantile(0.99, sum(rate(proxy_compute_connection_latency_seconds_bucket{outcome="success", excluded="client_and_cplane"}[5m])) by (le))', + "unit": "s", + "report": "LOWER_IS_BETTER", + "labels": {}, + }, + { + "name": "error_rate", + "promql": 'sum(rate(proxy_errors_total{type!~"user|clientdisconnect|quota"}[5m])) / sum(rate(proxy_accepted_connections_total[5m]))', + "unit": "", + "report": "LOWER_IS_BETTER", + "labels": {}, + }, + { + "name": "max_memory_kb", + "promql": "max(libmetrics_maxrss_kb)", + "unit": "kB", + "report": "LOWER_IS_BETTER", + "labels": {}, + }, + { + "name": "jemalloc_active_bytes", + "promql": "sum(jemalloc_active_bytes)", + "unit": "bytes", + "report": "LOWER_IS_BETTER", + "labels": {}, + }, + { + "name": "open_connections", + "promql": "sum by (protocol) (proxy_opened_client_connections_total - proxy_closed_client_connections_total)", + "unit": "", + "report": 
"HIGHER_IS_BETTER", + "labels": {}, + "is_vector": True, + "label_field": "protocol", + }, +] + + +class PrometheusMetric(TypedDict): + metric: dict[str, str] + value: list[str | float] + + +class PrometheusResult(TypedDict): + result: list[PrometheusMetric] + + +class PrometheusResponse(TypedDict): + data: PrometheusResult + + +def query_prometheus(promql: str) -> PrometheusResponse: + resp = requests.get(f"{PROMETHEUS_URL}/api/v1/query", params={"query": promql}) + resp.raise_for_status() + return cast("PrometheusResponse", resp.json()) + + +def extract_scalar_metric(result_json: PrometheusResponse) -> float | None: + try: + return float(result_json["data"]["result"][0]["value"][1]) + except (IndexError, KeyError, ValueError, TypeError): + return None + + +def extract_vector_metric( + result_json: PrometheusResponse, label_field: str +) -> list[tuple[str | None, float, dict[str, str]]]: + out: list[tuple[str | None, float, dict[str, str]]] = [] + for entry in result_json["data"]["result"]: + try: + value_str = entry["value"][1] + if not isinstance(value_str, (str | float)): + continue + value = float(value_str) + except (IndexError, KeyError, ValueError, TypeError): + continue + labels = entry.get("metric", {}) + label_val = labels.get(label_field, None) + out.append((label_val, value, labels)) + return out + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Collect Prometheus metrics and output in benchmark fixture format" + ) + parser.add_argument("--revision", default=DEFAULT_REVISION) + parser.add_argument("--platform", default=DEFAULT_PLATFORM) + parser.add_argument("--suit", default=DEFAULT_SUIT) + parser.add_argument("--out", default="metrics_benchmarks.json", help="Output JSON file") + parser.add_argument( + "--interval", default=SAMPLE_INTERVAL, type=int, help="Sampling interval (s)" + ) + args = parser.parse_args() + + start_time = int(time.time()) + samples: list[dict[str, Any]] = [] + + print("Collecting metrics (Ctrl+C to 
stop)...") + try: + while True: + ts = int(time.time()) + for metric in METRICS: + if metric.get("is_vector", False): + # Vector (per-label, e.g. per-protocol) + for label_val, value, labels in extract_vector_metric( + query_prometheus(metric["promql"]), metric["label_field"] + ): + entry = { + "name": f"{metric['name']}.{label_val}" + if label_val + else metric["name"], + "value": value, + "unit": metric["unit"], + "report": metric["report"], + "labels": {**metric.get("labels", {}), **labels}, + "timestamp": ts, + } + samples.append(entry) + else: + result = extract_scalar_metric(query_prometheus(metric["promql"])) + if result is not None: + entry = { + "name": metric["name"], + "value": result, + "unit": metric["unit"], + "report": metric["report"], + "labels": metric.get("labels", {}), + "timestamp": ts, + } + samples.append(entry) + time.sleep(args.interval) + except KeyboardInterrupt: + print("Collection stopped.") + + total_duration = int(time.time()) - start_time + + # Compose output + out = { + "revision": args.revision, + "platform": args.platform, + "result": [ + { + "suit": args.suit, + "total_duration": total_duration, + "data": samples, + } + ], + } + + with open(args.out, "w") as f: + json.dump(out, f, indent=2) + print(f"Wrote metrics in fixture format to {args.out}") + + +if __name__ == "__main__": + main() From 552249607da32a4c1c25a65339ce3b4533f6a728 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 24 Jun 2025 12:12:42 +0200 Subject: [PATCH 07/50] apply clippy fixes for 1.88.0 beta (#12331) The 1.88.0 stable release is near (this Thursday). We'd like to fix most warnings beforehand so that the compiler upgrade doesn't require approval from too many teams. This is therefore a preparation PR (like similar PRs before it). There is a lot of changes for this release, mostly because the `uninlined_format_args` lint has been added to the `style` lint group. 
One can read more about the lint [here](https://rust-lang.github.io/rust-clippy/master/#/uninlined_format_args). The PR is the result of `cargo +beta clippy --fix` and `cargo fmt`. One remaining warning is left for the proxy team. --------- Co-authored-by: Conrad Ludgate --- compute_tools/src/bin/fast_import.rs | 6 +- compute_tools/src/bin/fast_import/s3_uri.rs | 2 +- compute_tools/src/catalog.rs | 4 +- compute_tools/src/compute.rs | 21 ++-- compute_tools/src/config.rs | 18 ++-- compute_tools/src/extension_server.rs | 12 +-- compute_tools/src/http/routes/configure.rs | 2 +- compute_tools/src/installed_extensions.rs | 4 +- compute_tools/src/lsn_lease.rs | 2 +- compute_tools/src/pg_helpers.rs | 10 +- compute_tools/src/spec.rs | 8 +- compute_tools/src/spec_apply.rs | 95 +++++++++---------- compute_tools/src/sync_sk.rs | 2 +- control_plane/src/bin/neon_local.rs | 20 ++-- control_plane/src/endpoint.rs | 11 +-- control_plane/src/local_env.rs | 6 +- control_plane/src/pageserver.rs | 2 +- control_plane/src/safekeeper.rs | 2 +- control_plane/src/storage_controller.rs | 16 ++-- control_plane/storcon_cli/src/main.rs | 28 +++--- endpoint_storage/src/app.rs | 2 +- libs/desim/src/executor.rs | 2 +- libs/desim/src/proto.rs | 4 +- libs/http-utils/src/endpoint.rs | 8 +- libs/neon-shmem/src/lib.rs | 4 +- libs/pageserver_api/src/controller_api.rs | 3 +- libs/pageserver_api/src/keyspace.rs | 5 +- libs/pageserver_api/src/models.rs | 5 +- libs/postgres_backend/src/lib.rs | 2 +- libs/postgres_backend/tests/simple_select.rs | 4 +- libs/postgres_connection/src/lib.rs | 6 +- libs/postgres_ffi/src/waldecoder_handler.rs | 8 +- libs/postgres_ffi/src/walrecord.rs | 8 +- .../wal_craft/src/xlog_utils_test.rs | 2 +- libs/postgres_initdb/src/lib.rs | 6 +- libs/posthog_client_lite/src/lib.rs | 45 +++------ libs/pq_proto/src/lib.rs | 8 +- .../src/authentication/sasl.rs | 10 +- .../postgres-protocol2/src/message/backend.rs | 4 +- libs/proxy/postgres-types2/src/lib.rs | 2 +- 
libs/proxy/tokio-postgres2/src/error/mod.rs | 12 +-- libs/proxy/tokio-postgres2/src/row.rs | 4 +- libs/remote_storage/src/local_fs.rs | 14 +-- libs/safekeeper_api/src/membership.rs | 4 +- libs/utils/src/error.rs | 2 +- libs/utils/src/generation.rs | 2 +- libs/utils/src/id.rs | 2 +- libs/utils/src/postgres_client.rs | 2 +- libs/utils/src/shard.rs | 4 +- libs/utils/src/signals.rs | 2 +- libs/vm_monitor/src/dispatcher.rs | 3 +- libs/vm_monitor/src/filecache.rs | 2 +- .../benches/bench_interpret_wal.rs | 4 +- libs/wal_decoder/build.rs | 2 +- libs/wal_decoder/src/models/record.rs | 2 +- libs/walproposer/src/api_bindings.rs | 4 +- libs/walproposer/src/walproposer.rs | 13 +-- pageserver/client/src/mgmt_api.rs | 4 +- pageserver/compaction/src/simulator/draw.rs | 6 +- pageserver/ctl/src/draw_timeline_dir.rs | 10 +- pageserver/ctl/src/key.rs | 2 +- pageserver/ctl/src/layer_map_analyzer.rs | 3 +- pageserver/page_api/src/client.rs | 2 +- pageserver/pagebench/src/cmd/aux_files.rs | 6 +- pageserver/src/basebackup.rs | 12 +-- pageserver/src/http/routes.rs | 10 +- pageserver/src/metrics.rs | 7 +- pageserver/src/page_service.rs | 9 +- pageserver/src/pgdatadir_mapping.rs | 5 +- pageserver/src/tenant.rs | 55 +++++------ pageserver/src/tenant/checks.rs | 3 +- pageserver/src/tenant/metadata.rs | 3 +- pageserver/src/tenant/secondary/downloader.rs | 2 +- .../src/tenant/storage_layer/delta_layer.rs | 4 +- .../src/tenant/storage_layer/image_layer.rs | 7 +- .../tenant/storage_layer/inmemory_layer.rs | 2 +- .../inmemory_layer/vectored_dio_read.rs | 2 +- pageserver/src/tenant/storage_layer/layer.rs | 2 +- pageserver/src/tenant/timeline.rs | 18 ++-- pageserver/src/tenant/timeline/compaction.rs | 10 +- .../walreceiver/walreceiver_connection.rs | 3 +- .../owned_buffers_io/aligned_buffer/buffer.rs | 11 +-- .../aligned_buffer/buffer_mut.rs | 5 +- pageserver/src/walingest.rs | 12 +-- pageserver/src/walredo/apply_neon.rs | 50 +++------- proxy/src/lib.rs | 4 + safekeeper/src/control_file.rs | 10 +- 
safekeeper/src/handler.rs | 16 ++-- safekeeper/src/safekeeper.rs | 26 +++-- safekeeper/src/timeline_eviction.rs | 7 +- safekeeper/src/timeline_manager.rs | 2 +- safekeeper/src/timelines_global_map.rs | 4 +- safekeeper/src/wal_backup_partial.rs | 3 +- safekeeper/src/wal_storage.rs | 2 +- safekeeper/tests/walproposer_sim/log.rs | 2 +- .../tests/walproposer_sim/safekeeper.rs | 2 +- .../tests/walproposer_sim/simulation.rs | 2 +- .../tests/walproposer_sim/walproposer_api.rs | 7 +- storage_broker/benches/rps.rs | 2 +- storage_broker/build.rs | 2 +- storage_broker/src/lib.rs | 10 +- storage_controller/src/compute_hook.rs | 2 +- storage_controller/src/drain_utils.rs | 6 +- storage_controller/src/http.rs | 6 +- storage_controller/src/persistence.rs | 23 ++--- storage_controller/src/scheduler.rs | 6 +- storage_controller/src/service.rs | 31 +++--- storage_controller/src/tenant_shard.rs | 9 +- storage_controller/src/timeline_import.rs | 2 +- storage_scrubber/src/checks.rs | 2 +- storage_scrubber/src/lib.rs | 2 +- .../src/scan_safekeeper_metadata.rs | 4 +- 112 files changed, 404 insertions(+), 542 deletions(-) diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index e65c210b23..682525f6df 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -486,10 +486,8 @@ async fn cmd_pgdata( }; let superuser = "cloud_admin"; - let destination_connstring = format!( - "host=localhost port={} user={} dbname=neondb", - pg_port, superuser - ); + let destination_connstring = + format!("host=localhost port={pg_port} user={superuser} dbname=neondb"); let pgdata_dir = workdir.join("pgdata"); let mut proc = PostgresProcess::new(pgdata_dir.clone(), pg_bin_dir.clone(), pg_lib_dir.clone()); diff --git a/compute_tools/src/bin/fast_import/s3_uri.rs b/compute_tools/src/bin/fast_import/s3_uri.rs index cf4dab7c02..e1a85c73e7 100644 --- a/compute_tools/src/bin/fast_import/s3_uri.rs +++ 
b/compute_tools/src/bin/fast_import/s3_uri.rs @@ -69,7 +69,7 @@ impl clap::builder::TypedValueParser for S3Uri { S3Uri::from_str(value_str).map_err(|e| { clap::Error::raw( clap::error::ErrorKind::InvalidValue, - format!("Failed to parse S3 URI: {}", e), + format!("Failed to parse S3 URI: {e}"), ) }) } diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs index 082ba62b8e..bc9f64075a 100644 --- a/compute_tools/src/catalog.rs +++ b/compute_tools/src/catalog.rs @@ -22,7 +22,7 @@ pub async fn get_dbs_and_roles(compute: &Arc) -> anyhow::Result { let mut lines = stderr_reader.lines(); if let Some(line) = lines.next_line().await? { - if line.contains(&format!("FATAL: database \"{}\" does not exist", dbname)) { + if line.contains(&format!("FATAL: database \"{dbname}\" does not exist")) { return Err(SchemaDumpError::DatabaseDoesNotExist); } warn!("pg_dump stderr: {}", line) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 684d841897..f84a5f0841 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -250,8 +250,7 @@ impl ParsedSpec { // duplicate entry? 
if current == previous { return Err(format!( - "duplicate entry in safekeeper_connstrings: {}!", - current, + "duplicate entry in safekeeper_connstrings: {current}!", )); } @@ -410,7 +409,7 @@ impl ComputeNode { let options = match conn_conf.get_options() { // Allow the control plane to override any options set by the // compute - Some(options) => format!("{} {}", EXTRA_OPTIONS, options), + Some(options) => format!("{EXTRA_OPTIONS} {options}"), None => EXTRA_OPTIONS.to_string(), }; conn_conf.options(&options); @@ -1127,7 +1126,7 @@ impl ComputeNode { let sk_configs = sk_connstrs.into_iter().map(|connstr| { // Format connstr let id = connstr.clone(); - let connstr = format!("postgresql://no_user@{}", connstr); + let connstr = format!("postgresql://no_user@{connstr}"); let options = format!( "-c timeline_id={} tenant_id={}", pspec.timeline_id, pspec.tenant_id @@ -1490,7 +1489,7 @@ impl ComputeNode { let (mut client, connection) = conf.connect(NoTls).await?; tokio::spawn(async move { if let Err(e) = connection.await { - eprintln!("connection error: {}", e); + eprintln!("connection error: {e}"); } }); @@ -1633,7 +1632,7 @@ impl ComputeNode { Ok((mut client, connection)) => { tokio::spawn(async move { if let Err(e) = connection.await { - eprintln!("connection error: {}", e); + eprintln!("connection error: {e}"); } }); if let Err(e) = handle_migrations(&mut client).await { @@ -1937,7 +1936,7 @@ impl ComputeNode { let (client, connection) = connect_result.unwrap(); tokio::spawn(async move { if let Err(e) = connection.await { - eprintln!("connection error: {}", e); + eprintln!("connection error: {e}"); } }); let result = client @@ -2106,7 +2105,7 @@ LIMIT 100", db_client .simple_query(&query) .await - .with_context(|| format!("Failed to execute query: {}", query))?; + .with_context(|| format!("Failed to execute query: {query}"))?; } Ok(()) @@ -2133,7 +2132,7 @@ LIMIT 100", let version: Option = db_client .query_opt(version_query, &[&ext_name]) .await - .with_context(|| 
format!("Failed to execute query: {}", version_query))? + .with_context(|| format!("Failed to execute query: {version_query}"))? .map(|row| row.get(0)); // sanitize the inputs as postgres idents. @@ -2148,14 +2147,14 @@ LIMIT 100", db_client .simple_query(&query) .await - .with_context(|| format!("Failed to execute query: {}", query))?; + .with_context(|| format!("Failed to execute query: {query}"))?; } else { let query = format!("CREATE EXTENSION IF NOT EXISTS {ext_name} WITH VERSION {quoted_version}"); db_client .simple_query(&query) .await - .with_context(|| format!("Failed to execute query: {}", query))?; + .with_context(|| format!("Failed to execute query: {query}"))?; } Ok(ext_version) diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 933b30134f..169de5c963 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -51,7 +51,7 @@ pub fn write_postgres_conf( // Write the postgresql.conf content from the spec file as is. if let Some(conf) = &spec.cluster.postgresql_conf { - writeln!(file, "{}", conf)?; + writeln!(file, "{conf}")?; } // Add options for connecting to storage @@ -70,7 +70,7 @@ pub fn write_postgres_conf( ); // If generation is given, prepend sk list with g#number: if let Some(generation) = spec.safekeepers_generation { - write!(neon_safekeepers_value, "g#{}:", generation)?; + write!(neon_safekeepers_value, "g#{generation}:")?; } neon_safekeepers_value.push_str(&spec.safekeeper_connstrings.join(",")); writeln!( @@ -109,8 +109,8 @@ pub fn write_postgres_conf( tls::update_key_path_blocking(pgdata_path, tls_config); // these are the default, but good to be explicit. 
- writeln!(file, "ssl_cert_file = '{}'", SERVER_CRT)?; - writeln!(file, "ssl_key_file = '{}'", SERVER_KEY)?; + writeln!(file, "ssl_cert_file = '{SERVER_CRT}'")?; + writeln!(file, "ssl_key_file = '{SERVER_KEY}'")?; } // Locales @@ -191,8 +191,7 @@ pub fn write_postgres_conf( } writeln!( file, - "shared_preload_libraries='{}{}'", - libs, extra_shared_preload_libraries + "shared_preload_libraries='{libs}{extra_shared_preload_libraries}'" )?; } else { // Typically, this should be unreacheable, @@ -244,8 +243,7 @@ pub fn write_postgres_conf( } writeln!( file, - "shared_preload_libraries='{}{}'", - libs, extra_shared_preload_libraries + "shared_preload_libraries='{libs}{extra_shared_preload_libraries}'" )?; } else { // Typically, this should be unreacheable, @@ -263,7 +261,7 @@ pub fn write_postgres_conf( } } - writeln!(file, "neon.extension_server_port={}", extension_server_port)?; + writeln!(file, "neon.extension_server_port={extension_server_port}")?; if spec.drop_subscriptions_before_start { writeln!(file, "neon.disable_logical_replication_subscribers=true")?; @@ -291,7 +289,7 @@ where { let path = pgdata_path.join("compute_ctl_temp_override.conf"); let mut file = File::create(path)?; - write!(file, "{}", options)?; + write!(file, "{options}")?; let res = exec(); diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index 3764bc1525..d8d5de34a5 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -310,10 +310,7 @@ async fn download_extension_tar(remote_ext_base_url: &Url, ext_path: &str) -> Re async fn do_extension_server_request(uri: Url) -> Result { let resp = reqwest::get(uri).await.map_err(|e| { ( - format!( - "could not perform remote extensions server request: {:?}", - e - ), + format!("could not perform remote extensions server request: {e:?}"), UNKNOWN_HTTP_STATUS.to_string(), ) })?; @@ -323,7 +320,7 @@ async fn do_extension_server_request(uri: Url) -> Result match 
resp.bytes().await { Ok(resp) => Ok(resp), Err(e) => Err(( - format!("could not read remote extensions server response: {:?}", e), + format!("could not read remote extensions server response: {e:?}"), // It's fine to return and report error with status as 200 OK, // because we still failed to read the response. status.to_string(), @@ -334,10 +331,7 @@ async fn do_extension_server_request(uri: Url) -> Result Err(( - format!( - "unexpected remote extensions server response status code: {}", - status - ), + format!("unexpected remote extensions server response status code: {status}"), status.to_string(), )), } diff --git a/compute_tools/src/http/routes/configure.rs b/compute_tools/src/http/routes/configure.rs index c29e3a97da..b7325d283f 100644 --- a/compute_tools/src/http/routes/configure.rs +++ b/compute_tools/src/http/routes/configure.rs @@ -65,7 +65,7 @@ pub(in crate::http) async fn configure( if state.status == ComputeStatus::Failed { let err = state.error.as_ref().map_or("unknown error", |x| x); - let msg = format!("compute configuration failed: {:?}", err); + let msg = format!("compute configuration failed: {err:?}"); return Err(msg); } } diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index d95c168a99..411e03b7ec 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -43,7 +43,7 @@ pub async fn get_installed_extensions(mut conf: Config) -> Result Result Result> { let mut client = config.connect(NoTls)?; - let cmd = format!("lease lsn {} {} {} ", tenant_shard_id, timeline_id, lsn); + let cmd = format!("lease lsn {tenant_shard_id} {timeline_id} {lsn} "); let res = client.simple_query(&cmd)?; let msg = match res.first() { Some(msg) => msg, diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 94467a0d2f..0a3ceed2fa 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -36,9 +36,9 @@ pub fn 
escape_literal(s: &str) -> String { let res = s.replace('\'', "''").replace('\\', "\\\\"); if res.contains('\\') { - format!("E'{}'", res) + format!("E'{res}'") } else { - format!("'{}'", res) + format!("'{res}'") } } @@ -46,7 +46,7 @@ pub fn escape_literal(s: &str) -> String { /// with `'{}'` is not required, as it returns a ready-to-use config string. pub fn escape_conf_value(s: &str) -> String { let res = s.replace('\'', "''").replace('\\', "\\\\"); - format!("'{}'", res) + format!("'{res}'") } pub trait GenericOptionExt { @@ -446,7 +446,7 @@ pub async fn tune_pgbouncer( let mut pgbouncer_connstr = "host=localhost port=6432 dbname=pgbouncer user=postgres sslmode=disable".to_string(); if let Ok(pass) = std::env::var("PGBOUNCER_PASSWORD") { - pgbouncer_connstr.push_str(format!(" password={}", pass).as_str()); + pgbouncer_connstr.push_str(format!(" password={pass}").as_str()); } pgbouncer_connstr }; @@ -464,7 +464,7 @@ pub async fn tune_pgbouncer( Ok((client, connection)) => { tokio::spawn(async move { if let Err(e) = connection.await { - eprintln!("connection error: {}", e); + eprintln!("connection error: {e}"); } }); break client; diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 4b38e6e29c..43cfbb48f7 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -23,12 +23,12 @@ fn do_control_plane_request( ) -> Result { let resp = reqwest::blocking::Client::new() .get(uri) - .header("Authorization", format!("Bearer {}", jwt)) + .header("Authorization", format!("Bearer {jwt}")) .send() .map_err(|e| { ( true, - format!("could not perform request to control plane: {:?}", e), + format!("could not perform request to control plane: {e:?}"), UNKNOWN_HTTP_STATUS.to_string(), ) })?; @@ -39,7 +39,7 @@ fn do_control_plane_request( Ok(spec_resp) => Ok(spec_resp), Err(e) => Err(( true, - format!("could not deserialize control plane response: {:?}", e), + format!("could not deserialize control plane response: {e:?}"), status.to_string(), 
)), }, @@ -62,7 +62,7 @@ fn do_control_plane_request( // or some internal failure happened. Doesn't make much sense to retry in this case. _ => Err(( false, - format!("unexpected control plane response status code: {}", status), + format!("unexpected control plane response status code: {status}"), status.to_string(), )), } diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs index 0d1389dbad..fcd072263a 100644 --- a/compute_tools/src/spec_apply.rs +++ b/compute_tools/src/spec_apply.rs @@ -933,56 +933,53 @@ async fn get_operations<'a>( PerDatabasePhase::DeleteDBRoleReferences => { let ctx = ctx.read().await; - let operations = - spec.delta_operations - .iter() - .flatten() - .filter(|op| op.action == "delete_role") - .filter_map(move |op| { - if db.is_owned_by(&op.name) { - return None; - } - if !ctx.roles.contains_key(&op.name) { - return None; - } - let quoted = op.name.pg_quote(); - let new_owner = match &db { - DB::SystemDB => PgIdent::from("cloud_admin").pg_quote(), - DB::UserDB(db) => db.owner.pg_quote(), - }; - let (escaped_role, outer_tag) = op.name.pg_quote_dollar(); + let operations = spec + .delta_operations + .iter() + .flatten() + .filter(|op| op.action == "delete_role") + .filter_map(move |op| { + if db.is_owned_by(&op.name) { + return None; + } + if !ctx.roles.contains_key(&op.name) { + return None; + } + let quoted = op.name.pg_quote(); + let new_owner = match &db { + DB::SystemDB => PgIdent::from("cloud_admin").pg_quote(), + DB::UserDB(db) => db.owner.pg_quote(), + }; + let (escaped_role, outer_tag) = op.name.pg_quote_dollar(); - Some(vec![ - // This will reassign all dependent objects to the db owner - Operation { - query: format!( - "REASSIGN OWNED BY {} TO {}", - quoted, new_owner, - ), - comment: None, - }, - // Revoke some potentially blocking privileges (Neon-specific currently) - Operation { - query: format!( - include_str!("sql/pre_drop_role_revoke_privileges.sql"), - // N.B. 
this has to be properly dollar-escaped with `pg_quote_dollar()` - role_name = escaped_role, - outer_tag = outer_tag, - ), - comment: None, - }, - // This now will only drop privileges of the role - // TODO: this is obviously not 100% true because of the above case, - // there could be still some privileges that are not revoked. Maybe this - // only drops privileges that were granted *by this* role, not *to this* role, - // but this has to be checked. - Operation { - query: format!("DROP OWNED BY {}", quoted), - comment: None, - }, - ]) - }) - .flatten(); + Some(vec![ + // This will reassign all dependent objects to the db owner + Operation { + query: format!("REASSIGN OWNED BY {quoted} TO {new_owner}",), + comment: None, + }, + // Revoke some potentially blocking privileges (Neon-specific currently) + Operation { + query: format!( + include_str!("sql/pre_drop_role_revoke_privileges.sql"), + // N.B. this has to be properly dollar-escaped with `pg_quote_dollar()` + role_name = escaped_role, + outer_tag = outer_tag, + ), + comment: None, + }, + // This now will only drop privileges of the role + // TODO: this is obviously not 100% true because of the above case, + // there could be still some privileges that are not revoked. Maybe this + // only drops privileges that were granted *by this* role, not *to this* role, + // but this has to be checked. 
+ Operation { + query: format!("DROP OWNED BY {quoted}"), + comment: None, + }, + ]) + }) + .flatten(); Ok(Box::new(operations)) } diff --git a/compute_tools/src/sync_sk.rs b/compute_tools/src/sync_sk.rs index 22b7027b93..6c348644b2 100644 --- a/compute_tools/src/sync_sk.rs +++ b/compute_tools/src/sync_sk.rs @@ -27,7 +27,7 @@ pub async fn ping_safekeeper( let (client, conn) = config.connect(tokio_postgres::NoTls).await?; tokio::spawn(async move { if let Err(e) = conn.await { - eprintln!("connection error: {}", e); + eprintln!("connection error: {e}"); } }); diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 21f55336aa..b2dd1a7077 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -919,7 +919,7 @@ fn print_timeline( br_sym = "┗━"; } - print!("{} @{}: ", br_sym, ancestor_lsn); + print!("{br_sym} @{ancestor_lsn}: "); } // Finally print a timeline id and name with new line @@ -1742,7 +1742,7 @@ async fn handle_pageserver(subcmd: &PageserverCmd, env: &local_env::LocalEnv) -> StopMode::Immediate => true, }; if let Err(e) = get_pageserver(env, args.pageserver_id)?.stop(immediate) { - eprintln!("pageserver stop failed: {}", e); + eprintln!("pageserver stop failed: {e}"); exit(1); } } @@ -1751,7 +1751,7 @@ async fn handle_pageserver(subcmd: &PageserverCmd, env: &local_env::LocalEnv) -> let pageserver = get_pageserver(env, args.pageserver_id)?; //TODO what shutdown strategy should we use here? 
if let Err(e) = pageserver.stop(false) { - eprintln!("pageserver stop failed: {}", e); + eprintln!("pageserver stop failed: {e}"); exit(1); } @@ -1768,7 +1768,7 @@ async fn handle_pageserver(subcmd: &PageserverCmd, env: &local_env::LocalEnv) -> { Ok(_) => println!("Page server is up and running"), Err(err) => { - eprintln!("Page server is not available: {}", err); + eprintln!("Page server is not available: {err}"); exit(1); } } @@ -1805,7 +1805,7 @@ async fn handle_storage_controller( }, }; if let Err(e) = svc.stop(stop_args).await { - eprintln!("stop failed: {}", e); + eprintln!("stop failed: {e}"); exit(1); } } @@ -1827,7 +1827,7 @@ async fn handle_safekeeper(subcmd: &SafekeeperCmd, env: &local_env::LocalEnv) -> let safekeeper = get_safekeeper(env, args.id)?; if let Err(e) = safekeeper.start(&args.extra_opt, &args.start_timeout).await { - eprintln!("safekeeper start failed: {}", e); + eprintln!("safekeeper start failed: {e}"); exit(1); } } @@ -1839,7 +1839,7 @@ async fn handle_safekeeper(subcmd: &SafekeeperCmd, env: &local_env::LocalEnv) -> StopMode::Immediate => true, }; if let Err(e) = safekeeper.stop(immediate) { - eprintln!("safekeeper stop failed: {}", e); + eprintln!("safekeeper stop failed: {e}"); exit(1); } } @@ -1852,12 +1852,12 @@ async fn handle_safekeeper(subcmd: &SafekeeperCmd, env: &local_env::LocalEnv) -> }; if let Err(e) = safekeeper.stop(immediate) { - eprintln!("safekeeper stop failed: {}", e); + eprintln!("safekeeper stop failed: {e}"); exit(1); } if let Err(e) = safekeeper.start(&args.extra_opt, &args.start_timeout).await { - eprintln!("safekeeper start failed: {}", e); + eprintln!("safekeeper start failed: {e}"); exit(1); } } @@ -2113,7 +2113,7 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { let storage = EndpointStorage::from_env(env); if let Err(e) = storage.stop(immediate) { - eprintln!("endpoint_storage stop failed: {:#}", e); + eprintln!("endpoint_storage stop failed: {e:#}"); } for ps_conf in &env.pageservers { 
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index ae81e7abbe..dab53b0f27 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -846,10 +846,10 @@ impl Endpoint { // Launch compute_ctl let conn_str = self.connstr("cloud_admin", "postgres"); - println!("Starting postgres node at '{}'", conn_str); + println!("Starting postgres node at '{conn_str}'"); if create_test_user { let conn_str = self.connstr("test", "neondb"); - println!("Also at '{}'", conn_str); + println!("Also at '{conn_str}'"); } let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl")); cmd.args([ @@ -948,8 +948,7 @@ impl Endpoint { Err(e) => { if Instant::now().duration_since(start_at) > start_timeout { return Err(e).context(format!( - "timed out {:?} waiting to connect to compute_ctl HTTP", - start_timeout, + "timed out {start_timeout:?} waiting to connect to compute_ctl HTTP", )); } } @@ -988,7 +987,7 @@ impl Endpoint { // reqwest does not export its error construction utility functions, so let's craft the message ourselves let url = response.url().to_owned(); let msg = match response.text().await { - Ok(err_body) => format!("Error: {}", err_body), + Ok(err_body) => format!("Error: {err_body}"), Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url), }; Err(anyhow::anyhow!(msg)) @@ -1054,7 +1053,7 @@ impl Endpoint { } else { let url = response.url().to_owned(); let msg = match response.text().await { - Ok(err_body) => format!("Error: {}", err_body), + Ok(err_body) => format!("Error: {err_body}"), Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url), }; Err(anyhow::anyhow!(msg)) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index e8abde4901..34465b4d5d 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -260,7 +260,7 @@ impl Default for EndpointStorageConf { impl NeonBroker { pub fn client_url(&self) -> Url { let url = if let Some(addr) = 
self.listen_https_addr { - format!("https://{}", addr) + format!("https://{addr}") } else { format!( "http://{}", @@ -733,7 +733,7 @@ impl LocalEnv { let config_toml_path = dentry.path().join("pageserver.toml"); let config_toml: PageserverConfigTomlSubset = toml_edit::de::from_str( &std::fs::read_to_string(&config_toml_path) - .with_context(|| format!("read {:?}", config_toml_path))?, + .with_context(|| format!("read {config_toml_path:?}"))?, ) .context("parse pageserver.toml")?; let identity_toml_path = dentry.path().join("identity.toml"); @@ -743,7 +743,7 @@ impl LocalEnv { } let identity_toml: IdentityTomlSubset = toml_edit::de::from_str( &std::fs::read_to_string(&identity_toml_path) - .with_context(|| format!("read {:?}", identity_toml_path))?, + .with_context(|| format!("read {identity_toml_path:?}"))?, ) .context("parse identity.toml")?; let PageserverConfigTomlSubset { diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index a683d2daec..7fa00a6730 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -121,7 +121,7 @@ impl PageServerNode { .env .generate_auth_token(&Claims::new(None, Scope::GenerationsApi)) .unwrap(); - overrides.push(format!("control_plane_api_token='{}'", jwt_token)); + overrides.push(format!("control_plane_api_token='{jwt_token}'")); } if !conf.other.contains_key("remote_storage") { diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 28d369a315..da9dafd8e9 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -143,7 +143,7 @@ impl SafekeeperNode { let id_string = id.to_string(); // TODO: add availability_zone to the config. // Right now we just specify any value here and use it to check metrics in tests. 
- let availability_zone = format!("sk-{}", id_string); + let availability_zone = format!("sk-{id_string}"); let mut args = vec![ "-D".to_owned(), diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 334949924c..f2ac5bb2dd 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -167,7 +167,7 @@ impl StorageController { fn storage_controller_instance_dir(&self, instance_id: u8) -> PathBuf { self.env .base_data_dir - .join(format!("storage_controller_{}", instance_id)) + .join(format!("storage_controller_{instance_id}")) } fn pid_file(&self, instance_id: u8) -> Utf8PathBuf { @@ -220,7 +220,7 @@ impl StorageController { "-d", DB_NAME, "-p", - &format!("{}", postgres_port), + &format!("{postgres_port}"), ]; let pg_lib_dir = self.get_pg_lib_dir().await.unwrap(); let envs = [ @@ -263,7 +263,7 @@ impl StorageController { "-h", "localhost", "-p", - &format!("{}", postgres_port), + &format!("{postgres_port}"), "-U", &username(), "-O", @@ -425,7 +425,7 @@ impl StorageController { // from `LocalEnv`'s config file (`.neon/config`). tokio::fs::write( &pg_data_path.join("postgresql.conf"), - format!("port = {}\nfsync=off\n", postgres_port), + format!("port = {postgres_port}\nfsync=off\n"), ) .await?; @@ -477,7 +477,7 @@ impl StorageController { self.setup_database(postgres_port).await?; } - let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port); + let database_url = format!("postgresql://localhost:{postgres_port}/{DB_NAME}"); // We support running a startup SQL script to fiddle with the database before we launch storcon. // This is used by the test suite. 
@@ -508,7 +508,7 @@ impl StorageController { drop(client); conn.await??; - let addr = format!("{}:{}", host, listen_port); + let addr = format!("{host}:{listen_port}"); let address_for_peers = Uri::builder() .scheme(scheme) .authority(addr.clone()) @@ -810,9 +810,9 @@ impl StorageController { builder = builder.json(&body) } if let Some(private_key) = &self.private_key { - println!("Getting claims for path {}", path); + println!("Getting claims for path {path}"); if let Some(required_claims) = Self::get_claims_for_path(&path)? { - println!("Got claims {:?} for path {}", required_claims, path); + println!("Got claims {required_claims:?} for path {path}"); let jwt_token = encode_from_key_file(&required_claims, private_key)?; builder = builder.header( reqwest::header::AUTHORIZATION, diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 507190b1e0..0036b7d0f6 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -649,7 +649,7 @@ async fn main() -> anyhow::Result<()> { response .new_shards .iter() - .map(|s| format!("{:?}", s)) + .map(|s| format!("{s:?}")) .collect::>() .join(",") ); @@ -771,8 +771,8 @@ async fn main() -> anyhow::Result<()> { println!("Tenant {tenant_id}"); let mut table = comfy_table::Table::new(); - table.add_row(["Policy", &format!("{:?}", policy)]); - table.add_row(["Stripe size", &format!("{:?}", stripe_size)]); + table.add_row(["Policy", &format!("{policy:?}")]); + table.add_row(["Stripe size", &format!("{stripe_size:?}")]); table.add_row(["Config", &serde_json::to_string_pretty(&config).unwrap()]); println!("{table}"); println!("Shards:"); @@ -789,7 +789,7 @@ async fn main() -> anyhow::Result<()> { let secondary = shard .node_secondary .iter() - .map(|n| format!("{}", n)) + .map(|n| format!("{n}")) .collect::>() .join(","); @@ -863,7 +863,7 @@ async fn main() -> anyhow::Result<()> { } } else { // Make it obvious to the user that since they've omitted an AZ, 
we're clearing it - eprintln!("Clearing preferred AZ for tenant {}", tenant_id); + eprintln!("Clearing preferred AZ for tenant {tenant_id}"); } // Construct a request that modifies all the tenant's shards @@ -1134,8 +1134,7 @@ async fn main() -> anyhow::Result<()> { Err((tenant_shard_id, from, to, error)) => { failure += 1; println!( - "Failed to migrate {} from node {} to node {}: {}", - tenant_shard_id, from, to, error + "Failed to migrate {tenant_shard_id} from node {from} to node {to}: {error}" ); } } @@ -1277,8 +1276,7 @@ async fn main() -> anyhow::Result<()> { concurrency, } => { let mut path = format!( - "/v1/tenant/{}/timeline/{}/download_heatmap_layers", - tenant_shard_id, timeline_id, + "/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers", ); if let Some(c) = concurrency { @@ -1303,8 +1301,7 @@ async fn watch_tenant_shard( ) -> anyhow::Result<()> { if let Some(until_migrated_to) = until_migrated_to { println!( - "Waiting for tenant shard {} to be migrated to node {}", - tenant_shard_id, until_migrated_to + "Waiting for tenant shard {tenant_shard_id} to be migrated to node {until_migrated_to}" ); } @@ -1327,7 +1324,7 @@ async fn watch_tenant_shard( "attached: {} secondary: {} {}", shard .node_attached - .map(|n| format!("{}", n)) + .map(|n| format!("{n}")) .unwrap_or("none".to_string()), shard .node_secondary @@ -1341,15 +1338,12 @@ async fn watch_tenant_shard( "(reconciler idle)" } ); - println!("{}", summary); + println!("{summary}"); // Maybe drop out if we finished migration if let Some(until_migrated_to) = until_migrated_to { if shard.node_attached == Some(until_migrated_to) && !shard.is_reconciling { - println!( - "Tenant shard {} is now on node {}", - tenant_shard_id, until_migrated_to - ); + println!("Tenant shard {tenant_shard_id} is now on node {until_migrated_to}"); break; } } diff --git a/endpoint_storage/src/app.rs b/endpoint_storage/src/app.rs index f44efe6d7a..42431c0066 100644 --- a/endpoint_storage/src/app.rs +++ 
b/endpoint_storage/src/app.rs @@ -374,7 +374,7 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH let request = Request::builder() .uri(format!("/{tenant}/{timeline}/{endpoint}/sub/path/key")) .method(method) - .header("Authorization", format!("Bearer {}", token)) + .header("Authorization", format!("Bearer {token}")) .body(Body::empty()) .unwrap(); let status = ServiceExt::ready(&mut app) diff --git a/libs/desim/src/executor.rs b/libs/desim/src/executor.rs index 51b11ff97e..bdb9c6cd4b 100644 --- a/libs/desim/src/executor.rs +++ b/libs/desim/src/executor.rs @@ -71,7 +71,7 @@ impl Runtime { debug!("thread panicked: {:?}", e); let mut result = ctx.result.lock(); if result.0 == -1 { - *result = (256, format!("thread panicked: {:?}", e)); + *result = (256, format!("thread panicked: {e:?}")); } }); } diff --git a/libs/desim/src/proto.rs b/libs/desim/src/proto.rs index 31bc29e6a6..7c3de4ff4b 100644 --- a/libs/desim/src/proto.rs +++ b/libs/desim/src/proto.rs @@ -47,8 +47,8 @@ impl Debug for AnyMessage { match self { AnyMessage::None => write!(f, "None"), AnyMessage::InternalConnect => write!(f, "InternalConnect"), - AnyMessage::Just32(v) => write!(f, "Just32({})", v), - AnyMessage::ReplCell(v) => write!(f, "ReplCell({:?})", v), + AnyMessage::Just32(v) => write!(f, "Just32({v})"), + AnyMessage::ReplCell(v) => write!(f, "ReplCell({v:?})"), AnyMessage::Bytes(v) => write!(f, "Bytes({})", hex::encode(v)), AnyMessage::LSN(v) => write!(f, "LSN({})", Lsn(*v)), } diff --git a/libs/http-utils/src/endpoint.rs b/libs/http-utils/src/endpoint.rs index 64147f2dd0..f32ced1180 100644 --- a/libs/http-utils/src/endpoint.rs +++ b/libs/http-utils/src/endpoint.rs @@ -582,14 +582,14 @@ pub fn attach_openapi_ui( deepLinking: true, showExtensions: true, showCommonExtensions: true, - url: "{}", + url: "{spec_mount_path}", }}) window.ui = ui; }}; - "#, spec_mount_path))).unwrap()) + "#))).unwrap()) }) ) } @@ -696,7 +696,7 @@ mod tests { let remote_addr = 
SocketAddr::new(IpAddr::from_str("127.0.0.1").unwrap(), 80); let mut service = builder.build(remote_addr); if let Err(e) = poll_fn(|ctx| service.poll_ready(ctx)).await { - panic!("request service is not ready: {:?}", e); + panic!("request service is not ready: {e:?}"); } let mut req: Request = Request::default(); @@ -716,7 +716,7 @@ mod tests { let remote_addr = SocketAddr::new(IpAddr::from_str("127.0.0.1").unwrap(), 80); let mut service = builder.build(remote_addr); if let Err(e) = poll_fn(|ctx| service.poll_ready(ctx)).await { - panic!("request service is not ready: {:?}", e); + panic!("request service is not ready: {e:?}"); } let req: Request = Request::default(); diff --git a/libs/neon-shmem/src/lib.rs b/libs/neon-shmem/src/lib.rs index e1b14b1371..c689959b68 100644 --- a/libs/neon-shmem/src/lib.rs +++ b/libs/neon-shmem/src/lib.rs @@ -86,7 +86,7 @@ impl ShmemHandle { // somewhat smaller than that, because with anything close to that, you'll run out of // memory anyway. if max_size >= 1 << 48 { - panic!("max size {} too large", max_size); + panic!("max size {max_size} too large"); } if initial_size > max_size { panic!("initial size {initial_size} larger than max size {max_size}"); @@ -279,7 +279,7 @@ mod tests { fn assert_range(ptr: *const u8, expected: u8, range: Range) { for i in range { let b = unsafe { *(ptr.add(i)) }; - assert_eq!(expected, b, "unexpected byte at offset {}", i); + assert_eq!(expected, b, "unexpected byte at offset {i}"); } } diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 55495dd68e..ff18d40bfe 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -577,8 +577,7 @@ mod test { let err = serde_json::from_value::(create_request).unwrap_err(); assert!( err.to_string().contains("unknown field `unknown_field`"), - "expect unknown field `unknown_field` error, got: {}", - err + "expect unknown field `unknown_field` error, got: {err}" ); } 
diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index 1b48d3c462..10a242e13b 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -334,8 +334,7 @@ impl KeySpace { std::cmp::max(range.start, prev.start) < std::cmp::min(range.end, prev.end); assert!( !overlap, - "Attempt to merge ovelapping keyspaces: {:?} overlaps {:?}", - prev, range + "Attempt to merge ovelapping keyspaces: {prev:?} overlaps {range:?}" ); } @@ -1104,7 +1103,7 @@ mod tests { // total range contains at least one shard-local page let all_nonzero = fragments.iter().all(|f| f.0 > 0); if !all_nonzero { - eprintln!("Found a zero-length fragment: {:?}", fragments); + eprintln!("Found a zero-length fragment: {fragments:?}"); } assert!(all_nonzero); } else { diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 2fc32c8f49..ee6725efbe 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1182,7 +1182,7 @@ impl Display for ImageCompressionAlgorithm { ImageCompressionAlgorithm::Disabled => write!(f, "disabled"), ImageCompressionAlgorithm::Zstd { level } => { if let Some(level) = level { - write!(f, "zstd({})", level) + write!(f, "zstd({level})") } else { write!(f, "zstd") } @@ -2011,8 +2011,7 @@ mod tests { let err = serde_json::from_value::(config_request).unwrap_err(); assert!( err.to_string().contains("unknown field `unknown_field`"), - "expect unknown field `unknown_field` error, got: {}", - err + "expect unknown field `unknown_field` error, got: {err}" ); } diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 714d8ac403..091299f842 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -939,7 +939,7 @@ impl PostgresBackendReader { FeMessage::CopyFail => Err(CopyStreamHandlerEnd::CopyFail), FeMessage::Terminate => Err(CopyStreamHandlerEnd::Terminate), _ => 
Err(CopyStreamHandlerEnd::from(ConnectionError::Protocol( - ProtocolError::Protocol(format!("unexpected message in COPY stream {:?}", msg)), + ProtocolError::Protocol(format!("unexpected message in COPY stream {msg:?}")), ))), }, None => Err(CopyStreamHandlerEnd::EOF), diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs index 75ca123014..23e17799bd 100644 --- a/libs/postgres_backend/tests/simple_select.rs +++ b/libs/postgres_backend/tests/simple_select.rs @@ -61,7 +61,7 @@ async fn simple_select() { // so spawn it off to run on its own. tokio::spawn(async move { if let Err(e) = connection.await { - eprintln!("connection error: {}", e); + eprintln!("connection error: {e}"); } }); @@ -137,7 +137,7 @@ async fn simple_select_ssl() { // so spawn it off to run on its own. tokio::spawn(async move { if let Err(e) = connection.await { - eprintln!("connection error: {}", e); + eprintln!("connection error: {e}"); } }); diff --git a/libs/postgres_connection/src/lib.rs b/libs/postgres_connection/src/lib.rs index cd981b3729..2388303329 100644 --- a/libs/postgres_connection/src/lib.rs +++ b/libs/postgres_connection/src/lib.rs @@ -223,7 +223,7 @@ mod tests_pg_connection_config { assert_eq!(cfg.port(), 123); assert_eq!(cfg.raw_address(), "stub.host.example:123"); assert_eq!( - format!("{:?}", cfg), + format!("{cfg:?}"), "PgConnectionConfig { host: Domain(\"stub.host.example\"), port: 123, password: None }" ); } @@ -239,7 +239,7 @@ mod tests_pg_connection_config { assert_eq!(cfg.port(), 123); assert_eq!(cfg.raw_address(), "[::1]:123"); assert_eq!( - format!("{:?}", cfg), + format!("{cfg:?}"), "PgConnectionConfig { host: Ipv6(::1), port: 123, password: None }" ); } @@ -252,7 +252,7 @@ mod tests_pg_connection_config { assert_eq!(cfg.port(), 123); assert_eq!(cfg.raw_address(), "stub.host.example:123"); assert_eq!( - format!("{:?}", cfg), + format!("{cfg:?}"), "PgConnectionConfig { host: Domain(\"stub.host.example\"), port: 123, 
password: Some(REDACTED-STRING) }" ); } diff --git a/libs/postgres_ffi/src/waldecoder_handler.rs b/libs/postgres_ffi/src/waldecoder_handler.rs index b4d50375bd..9cd40645ec 100644 --- a/libs/postgres_ffi/src/waldecoder_handler.rs +++ b/libs/postgres_ffi/src/waldecoder_handler.rs @@ -114,7 +114,7 @@ impl WalStreamDecoderHandler for WalStreamDecoder { let hdr = XLogLongPageHeaderData::from_bytes(&mut self.inputbuf).map_err( |e| WalDecodeError { - msg: format!("long header deserialization failed {}", e), + msg: format!("long header deserialization failed {e}"), lsn: self.lsn, }, )?; @@ -130,7 +130,7 @@ impl WalStreamDecoderHandler for WalStreamDecoder { let hdr = XLogPageHeaderData::from_bytes(&mut self.inputbuf).map_err(|e| { WalDecodeError { - msg: format!("header deserialization failed {}", e), + msg: format!("header deserialization failed {e}"), lsn: self.lsn, } })?; @@ -155,7 +155,7 @@ impl WalStreamDecoderHandler for WalStreamDecoder { let xl_tot_len = (&self.inputbuf[0..4]).get_u32_le(); if (xl_tot_len as usize) < XLOG_SIZE_OF_XLOG_RECORD { return Err(WalDecodeError { - msg: format!("invalid xl_tot_len {}", xl_tot_len), + msg: format!("invalid xl_tot_len {xl_tot_len}"), lsn: self.lsn, }); } @@ -218,7 +218,7 @@ impl WalStreamDecoderHandler for WalStreamDecoder { let xlogrec = XLogRecord::from_slice(&recordbuf[0..XLOG_SIZE_OF_XLOG_RECORD]).map_err(|e| { WalDecodeError { - msg: format!("xlog record deserialization failed {}", e), + msg: format!("xlog record deserialization failed {e}"), lsn: self.lsn, } })?; diff --git a/libs/postgres_ffi/src/walrecord.rs b/libs/postgres_ffi/src/walrecord.rs index 1ccf4590a9..c0ae88363e 100644 --- a/libs/postgres_ffi/src/walrecord.rs +++ b/libs/postgres_ffi/src/walrecord.rs @@ -1199,7 +1199,7 @@ pub fn describe_postgres_wal_record(record: &Bytes) -> Result "HEAP2 MULTI_INSERT", pg_constants::XLOG_HEAP2_VISIBLE => "HEAP2 VISIBLE", _ => { - unknown_str = format!("HEAP2 UNKNOWN_0x{:02x}", info); + unknown_str = format!("HEAP2 
UNKNOWN_0x{info:02x}"); &unknown_str } } @@ -1212,7 +1212,7 @@ pub fn describe_postgres_wal_record(record: &Bytes) -> Result "HEAP UPDATE", pg_constants::XLOG_HEAP_HOT_UPDATE => "HEAP HOT_UPDATE", _ => { - unknown_str = format!("HEAP2 UNKNOWN_0x{:02x}", info); + unknown_str = format!("HEAP2 UNKNOWN_0x{info:02x}"); &unknown_str } } @@ -1223,7 +1223,7 @@ pub fn describe_postgres_wal_record(record: &Bytes) -> Result "XLOG FPI", pg_constants::XLOG_FPI_FOR_HINT => "XLOG FPI_FOR_HINT", _ => { - unknown_str = format!("XLOG UNKNOWN_0x{:02x}", info); + unknown_str = format!("XLOG UNKNOWN_0x{info:02x}"); &unknown_str } } @@ -1231,7 +1231,7 @@ pub fn describe_postgres_wal_record(record: &Bytes) -> Result { let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK; - unknown_str = format!("UNKNOWN_RM_{} INFO_0x{:02x}", rmid, info); + unknown_str = format!("UNKNOWN_RM_{rmid} INFO_0x{info:02x}"); &unknown_str } }; diff --git a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs index 4a33dbe25b..94371a35b5 100644 --- a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs +++ b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs @@ -34,7 +34,7 @@ fn test_end_of_wal(test_name: &str) { let cfg = Conf { pg_version, pg_distrib_dir: top_path.join("pg_install"), - datadir: top_path.join(format!("test_output/{}-{PG_MAJORVERSION}", test_name)), + datadir: top_path.join(format!("test_output/{test_name}-{PG_MAJORVERSION}")), }; if cfg.datadir.exists() { fs::remove_dir_all(&cfg.datadir).unwrap(); diff --git a/libs/postgres_initdb/src/lib.rs b/libs/postgres_initdb/src/lib.rs index ed54696861..4b4a597f73 100644 --- a/libs/postgres_initdb/src/lib.rs +++ b/libs/postgres_initdb/src/lib.rs @@ -31,15 +31,15 @@ pub enum Error { impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { - Error::Spawn(e) => write!(f, "Error spawning command: {:?}", e), + Error::Spawn(e) => write!(f, "Error spawning command: 
{e:?}"), Error::Failed { status, stderr } => write!( f, "Command failed with status {:?}: {}", status, String::from_utf8_lossy(stderr) ), - Error::WaitOutput(e) => write!(f, "Error waiting for command output: {:?}", e), - Error::Other(e) => write!(f, "Error: {:?}", e), + Error::WaitOutput(e) => write!(f, "Error waiting for command output: {e:?}"), + Error::Other(e) => write!(f, "Error: {e:?}"), } } } diff --git a/libs/posthog_client_lite/src/lib.rs b/libs/posthog_client_lite/src/lib.rs index 730878fb58..f21047bcfc 100644 --- a/libs/posthog_client_lite/src/lib.rs +++ b/libs/posthog_client_lite/src/lib.rs @@ -168,15 +168,13 @@ impl FeatureStore { let PostHogFlagFilterPropertyValue::String(provided) = provided else { // Left should be a string return Err(PostHogEvaluationError::Internal(format!( - "The left side of the condition is not a string: {:?}", - provided + "The left side of the condition is not a string: {provided:?}" ))); }; let PostHogFlagFilterPropertyValue::List(requested) = requested else { // Right should be a list of string return Err(PostHogEvaluationError::Internal(format!( - "The right side of the condition is not a list: {:?}", - requested + "The right side of the condition is not a list: {requested:?}" ))); }; Ok(requested.contains(provided)) @@ -185,14 +183,12 @@ impl FeatureStore { let PostHogFlagFilterPropertyValue::String(requested) = requested else { // Right should be a string return Err(PostHogEvaluationError::Internal(format!( - "The right side of the condition is not a string: {:?}", - requested + "The right side of the condition is not a string: {requested:?}" ))); }; let Ok(requested) = requested.parse::() else { return Err(PostHogEvaluationError::Internal(format!( - "Can not parse the right side of the condition as a number: {:?}", - requested + "Can not parse the right side of the condition as a number: {requested:?}" ))); }; // Left can either be a number or a string @@ -201,16 +197,14 @@ impl FeatureStore { 
PostHogFlagFilterPropertyValue::String(provided) => { let Ok(provided) = provided.parse::() else { return Err(PostHogEvaluationError::Internal(format!( - "Can not parse the left side of the condition as a number: {:?}", - provided + "Can not parse the left side of the condition as a number: {provided:?}" ))); }; provided } _ => { return Err(PostHogEvaluationError::Internal(format!( - "The left side of the condition is not a number or a string: {:?}", - provided + "The left side of the condition is not a number or a string: {provided:?}" ))); } }; @@ -218,14 +212,12 @@ impl FeatureStore { "lt" => Ok(provided < requested), "gt" => Ok(provided > requested), op => Err(PostHogEvaluationError::Internal(format!( - "Unsupported operator: {}", - op + "Unsupported operator: {op}" ))), } } _ => Err(PostHogEvaluationError::Internal(format!( - "Unsupported operator: {}", - operator + "Unsupported operator: {operator}" ))), } } @@ -373,8 +365,7 @@ impl FeatureStore { if let Some(flag_config) = self.flags.get(flag_key) { if !flag_config.active { return Err(PostHogEvaluationError::NotAvailable(format!( - "The feature flag is not active: {}", - flag_key + "The feature flag is not active: {flag_key}" ))); } let Some(ref multivariate) = flag_config.filters.multivariate else { @@ -401,8 +392,7 @@ impl FeatureStore { // This should not happen because the rollout percentage always adds up to 100, but just in case that PostHog // returned invalid spec, we return an error. 
return Err(PostHogEvaluationError::Internal(format!( - "Rollout percentage does not add up to 100: {}", - flag_key + "Rollout percentage does not add up to 100: {flag_key}" ))); } GroupEvaluationResult::Unmatched => continue, @@ -413,8 +403,7 @@ impl FeatureStore { } else { // The feature flag is not available yet Err(PostHogEvaluationError::NotAvailable(format!( - "Not found in the local evaluation spec: {}", - flag_key + "Not found in the local evaluation spec: {flag_key}" ))) } } @@ -440,8 +429,7 @@ impl FeatureStore { if let Some(flag_config) = self.flags.get(flag_key) { if !flag_config.active { return Err(PostHogEvaluationError::NotAvailable(format!( - "The feature flag is not active: {}", - flag_key + "The feature flag is not active: {flag_key}" ))); } if flag_config.filters.multivariate.is_some() { @@ -456,8 +444,7 @@ impl FeatureStore { match self.evaluate_group(group, hash_on_global_rollout_percentage, properties)? { GroupEvaluationResult::MatchedAndOverride(_) => { return Err(PostHogEvaluationError::Internal(format!( - "Boolean flag cannot have overrides: {}", - flag_key + "Boolean flag cannot have overrides: {flag_key}" ))); } GroupEvaluationResult::MatchedAndEvaluate => { @@ -471,8 +458,7 @@ impl FeatureStore { } else { // The feature flag is not available yet Err(PostHogEvaluationError::NotAvailable(format!( - "Not found in the local evaluation spec: {}", - flag_key + "Not found in the local evaluation spec: {flag_key}" ))) } } @@ -483,8 +469,7 @@ impl FeatureStore { Ok(flag_config.filters.multivariate.is_none()) } else { Err(PostHogEvaluationError::NotAvailable(format!( - "Not found in the local evaluation spec: {}", - flag_key + "Not found in the local evaluation spec: {flag_key}" ))) } } diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index e7afc64564..482dd9a298 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -198,7 +198,7 @@ impl fmt::Display for CancelKeyData { // This format is more compact and might work 
better for logs. f.debug_tuple("CancelKeyData") - .field(&format_args!("{:x}", id)) + .field(&format_args!("{id:x}")) .finish() } } @@ -291,8 +291,7 @@ impl FeMessage { let len = (&buf[1..5]).read_u32::().unwrap(); if len < 4 { return Err(ProtocolError::Protocol(format!( - "invalid message length {}", - len + "invalid message length {len}" ))); } @@ -367,8 +366,7 @@ impl FeStartupPacket { #[allow(clippy::manual_range_contains)] if len < 8 || len > MAX_STARTUP_PACKET_LENGTH { return Err(ProtocolError::Protocol(format!( - "invalid startup packet message length {}", - len + "invalid startup packet message length {len}" ))); } diff --git a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs index a7bf3da20a..b8304f9d8d 100644 --- a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs +++ b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs @@ -308,7 +308,7 @@ impl ScramSha256 { let verifier = match parsed { ServerFinalMessage::Error(e) => { - return Err(io::Error::other(format!("SCRAM error: {}", e))); + return Err(io::Error::other(format!("SCRAM error: {e}"))); } ServerFinalMessage::Verifier(verifier) => verifier, }; @@ -343,10 +343,8 @@ impl<'a> Parser<'a> { match self.it.next() { Some((_, c)) if c == target => Ok(()), Some((i, c)) => { - let m = format!( - "unexpected character at byte {}: expected `{}` but got `{}", - i, target, c - ); + let m = + format!("unexpected character at byte {i}: expected `{target}` but got `{c}"); Err(io::Error::new(io::ErrorKind::InvalidInput, m)) } None => Err(io::Error::new( @@ -412,7 +410,7 @@ impl<'a> Parser<'a> { match self.it.peek() { Some(&(i, _)) => Err(io::Error::new( io::ErrorKind::InvalidInput, - format!("unexpected trailing data at byte {}", i), + format!("unexpected trailing data at byte {i}"), )), None => Ok(()), } diff --git a/libs/proxy/postgres-protocol2/src/message/backend.rs b/libs/proxy/postgres-protocol2/src/message/backend.rs index 
d7eaef9509..3fc9a9335c 100644 --- a/libs/proxy/postgres-protocol2/src/message/backend.rs +++ b/libs/proxy/postgres-protocol2/src/message/backend.rs @@ -211,7 +211,7 @@ impl Message { tag => { return Err(io::Error::new( io::ErrorKind::InvalidInput, - format!("unknown authentication tag `{}`", tag), + format!("unknown authentication tag `{tag}`"), )); } }, @@ -238,7 +238,7 @@ impl Message { tag => { return Err(io::Error::new( io::ErrorKind::InvalidInput, - format!("unknown message tag `{}`", tag), + format!("unknown message tag `{tag}`"), )); } }; diff --git a/libs/proxy/postgres-types2/src/lib.rs b/libs/proxy/postgres-types2/src/lib.rs index 7c9874bda3..c98c45636b 100644 --- a/libs/proxy/postgres-types2/src/lib.rs +++ b/libs/proxy/postgres-types2/src/lib.rs @@ -46,7 +46,7 @@ impl fmt::Display for Type { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { match self.schema() { "public" | "pg_catalog" => {} - schema => write!(fmt, "{}.", schema)?, + schema => write!(fmt, "{schema}.")?, } fmt.write_str(self.name()) } diff --git a/libs/proxy/tokio-postgres2/src/error/mod.rs b/libs/proxy/tokio-postgres2/src/error/mod.rs index 8149bceeb9..5309bce17e 100644 --- a/libs/proxy/tokio-postgres2/src/error/mod.rs +++ b/libs/proxy/tokio-postgres2/src/error/mod.rs @@ -332,10 +332,10 @@ impl fmt::Display for DbError { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { write!(fmt, "{}: {}", self.severity, self.message)?; if let Some(detail) = &self.detail { - write!(fmt, "\nDETAIL: {}", detail)?; + write!(fmt, "\nDETAIL: {detail}")?; } if let Some(hint) = &self.hint { - write!(fmt, "\nHINT: {}", hint)?; + write!(fmt, "\nHINT: {hint}")?; } Ok(()) } @@ -398,9 +398,9 @@ impl fmt::Display for Error { Kind::Io => fmt.write_str("error communicating with the server")?, Kind::UnexpectedMessage => fmt.write_str("unexpected message from server")?, Kind::Tls => fmt.write_str("error performing TLS handshake")?, - Kind::ToSql(idx) => write!(fmt, "error serializing parameter 
{}", idx)?, - Kind::FromSql(idx) => write!(fmt, "error deserializing column {}", idx)?, - Kind::Column(column) => write!(fmt, "invalid column `{}`", column)?, + Kind::ToSql(idx) => write!(fmt, "error serializing parameter {idx}")?, + Kind::FromSql(idx) => write!(fmt, "error deserializing column {idx}")?, + Kind::Column(column) => write!(fmt, "invalid column `{column}`")?, Kind::Closed => fmt.write_str("connection closed")?, Kind::Db => fmt.write_str("db error")?, Kind::Parse => fmt.write_str("error parsing response from server")?, @@ -411,7 +411,7 @@ impl fmt::Display for Error { Kind::Timeout => fmt.write_str("timeout waiting for server")?, }; if let Some(ref cause) = self.0.cause { - write!(fmt, ": {}", cause)?; + write!(fmt, ": {cause}")?; } Ok(()) } diff --git a/libs/proxy/tokio-postgres2/src/row.rs b/libs/proxy/tokio-postgres2/src/row.rs index 5fc955eef4..36d578558f 100644 --- a/libs/proxy/tokio-postgres2/src/row.rs +++ b/libs/proxy/tokio-postgres2/src/row.rs @@ -156,7 +156,7 @@ impl Row { { match self.get_inner(&idx) { Ok(ok) => ok, - Err(err) => panic!("error retrieving column {}: {}", idx, err), + Err(err) => panic!("error retrieving column {idx}: {err}"), } } @@ -274,7 +274,7 @@ impl SimpleQueryRow { { match self.get_inner(&idx) { Ok(ok) => ok, - Err(err) => panic!("error retrieving column {}: {}", idx, err), + Err(err) => panic!("error retrieving column {idx}: {err}"), } } diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 8320d7afdc..30690b1bdb 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -400,7 +400,7 @@ impl RemoteStorage for LocalFs { key }; - let relative_key = format!("{}", relative_key); + let relative_key = format!("{relative_key}"); if relative_key.contains(REMOTE_STORAGE_PREFIX_SEPARATOR) { let first_part = relative_key .split(REMOTE_STORAGE_PREFIX_SEPARATOR) @@ -594,13 +594,9 @@ impl RemoteStorage for LocalFs { let from_path = 
from.with_base(&self.storage_root); let to_path = to.with_base(&self.storage_root); create_target_directory(&to_path).await?; - fs::copy(&from_path, &to_path).await.with_context(|| { - format!( - "Failed to copy file from '{from_path}' to '{to_path}'", - from_path = from_path, - to_path = to_path - ) - })?; + fs::copy(&from_path, &to_path) + .await + .with_context(|| format!("Failed to copy file from '{from_path}' to '{to_path}'"))?; Ok(()) } @@ -1183,7 +1179,7 @@ mod fs_tests { .write(true) .create_new(true) .open(path)?; - write!(file_for_writing, "{}", contents)?; + write!(file_for_writing, "{contents}")?; drop(file_for_writing); let file_size = path.metadata()?.len() as usize; Ok(( diff --git a/libs/safekeeper_api/src/membership.rs b/libs/safekeeper_api/src/membership.rs index 3d4d17096e..1751c54f6a 100644 --- a/libs/safekeeper_api/src/membership.rs +++ b/libs/safekeeper_api/src/membership.rs @@ -193,10 +193,10 @@ mod tests { }) .unwrap(); - println!("members: {}", members); + println!("members: {members}"); let j = serde_json::to_string(&members).expect("failed to serialize"); - println!("members json: {}", j); + println!("members json: {j}"); assert_eq!( j, r#"[{"id":42,"host":"lala.org","pg_port":5432},{"id":43,"host":"bubu.org","pg_port":5432}]"# diff --git a/libs/utils/src/error.rs b/libs/utils/src/error.rs index 7ce203e918..6fa86916c1 100644 --- a/libs/utils/src/error.rs +++ b/libs/utils/src/error.rs @@ -41,7 +41,7 @@ pub fn report_compact_sources(e: &E) -> impl std::fmt::Dis // why is E a generic parameter here? 
hope that rustc will see through a default // Error::source implementation and leave the following out if there cannot be any // sources: - Sources(self.0.source()).try_for_each(|src| write!(f, ": {}", src)) + Sources(self.0.source()).try_for_each(|src| write!(f, ": {src}")) } } diff --git a/libs/utils/src/generation.rs b/libs/utils/src/generation.rs index b5e4a4644a..8a3bef914a 100644 --- a/libs/utils/src/generation.rs +++ b/libs/utils/src/generation.rs @@ -135,7 +135,7 @@ impl Debug for Generation { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::Valid(v) => { - write!(f, "{:08x}", v) + write!(f, "{v:08x}") } Self::None => { write!(f, "") diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs index 68cb1f0209..e3037aec21 100644 --- a/libs/utils/src/id.rs +++ b/libs/utils/src/id.rs @@ -280,7 +280,7 @@ impl TryFrom> for TimelineId { value .unwrap_or_default() .parse::() - .with_context(|| format!("Could not parse timeline id from {:?}", value)) + .with_context(|| format!("Could not parse timeline id from {value:?}")) } } diff --git a/libs/utils/src/postgres_client.rs b/libs/utils/src/postgres_client.rs index 4167839e28..7596fefe38 100644 --- a/libs/utils/src/postgres_client.rs +++ b/libs/utils/src/postgres_client.rs @@ -89,7 +89,7 @@ pub fn wal_stream_connection_config( .set_password(args.auth_token.map(|s| s.to_owned())); if let Some(availability_zone) = args.availability_zone { - connstr = connstr.extend_options([format!("availability_zone={}", availability_zone)]); + connstr = connstr.extend_options([format!("availability_zone={availability_zone}")]); } Ok(connstr) diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs index c8c410a725..f2b81373e2 100644 --- a/libs/utils/src/shard.rs +++ b/libs/utils/src/shard.rs @@ -196,7 +196,7 @@ impl std::fmt::Display for TenantShardId { impl std::fmt::Debug for TenantShardId { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { // Debug is the same as 
Display: the compact hex representation - write!(f, "{}", self) + write!(f, "{self}") } } @@ -284,7 +284,7 @@ impl std::fmt::Display for ShardIndex { impl std::fmt::Debug for ShardIndex { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { // Debug is the same as Display: the compact hex representation - write!(f, "{}", self) + write!(f, "{self}") } } diff --git a/libs/utils/src/signals.rs b/libs/utils/src/signals.rs index 426bb65916..bdaa3cb665 100644 --- a/libs/utils/src/signals.rs +++ b/libs/utils/src/signals.rs @@ -29,7 +29,7 @@ impl ShutdownSignals { SIGINT => Signal::Interrupt, SIGTERM => Signal::Terminate, SIGQUIT => Signal::Quit, - other => panic!("unknown signal: {}", other), + other => panic!("unknown signal: {other}"), }; handler(signal)?; diff --git a/libs/vm_monitor/src/dispatcher.rs b/libs/vm_monitor/src/dispatcher.rs index 7b7201ab77..7bd6adc2f8 100644 --- a/libs/vm_monitor/src/dispatcher.rs +++ b/libs/vm_monitor/src/dispatcher.rs @@ -90,8 +90,7 @@ impl Dispatcher { Err(e) => { sink.send(Message::Text(Utf8Bytes::from( serde_json::to_string(&ProtocolResponse::Error(format!( - "Received protocol version range {} which does not overlap with {}", - agent_range, monitor_range + "Received protocol version range {agent_range} which does not overlap with {monitor_range}" ))) .unwrap(), ))) diff --git a/libs/vm_monitor/src/filecache.rs b/libs/vm_monitor/src/filecache.rs index bc42347e5a..55bbdea169 100644 --- a/libs/vm_monitor/src/filecache.rs +++ b/libs/vm_monitor/src/filecache.rs @@ -285,7 +285,7 @@ impl FileCacheState { // why we're constructing the query here. 
self.client .query( - &format!("ALTER SYSTEM SET neon.file_cache_size_limit = {};", num_mb), + &format!("ALTER SYSTEM SET neon.file_cache_size_limit = {num_mb};"), &[], ) .await diff --git a/libs/wal_decoder/benches/bench_interpret_wal.rs b/libs/wal_decoder/benches/bench_interpret_wal.rs index ed6ba4d267..ff860a92e2 100644 --- a/libs/wal_decoder/benches/bench_interpret_wal.rs +++ b/libs/wal_decoder/benches/bench_interpret_wal.rs @@ -64,7 +64,7 @@ async fn download_bench_data( let temp_dir_parent: Utf8PathBuf = env::current_dir().unwrap().try_into()?; let temp_dir = camino_tempfile::tempdir_in(temp_dir_parent)?; - eprintln!("Downloading benchmark data to {:?}", temp_dir); + eprintln!("Downloading benchmark data to {temp_dir:?}"); let listing = client .list(None, ListingMode::NoDelimiter, None, cancel) @@ -120,7 +120,7 @@ struct BenchmarkMetadata { } async fn load_bench_data(path: &Utf8Path, input_size: usize) -> anyhow::Result { - eprintln!("Loading benchmark data from {:?}", path); + eprintln!("Loading benchmark data from {path:?}"); let mut entries = tokio::fs::read_dir(path).await?; let mut ordered_segment_paths = Vec::new(); diff --git a/libs/wal_decoder/build.rs b/libs/wal_decoder/build.rs index d5b7ad02ad..e8acb52256 100644 --- a/libs/wal_decoder/build.rs +++ b/libs/wal_decoder/build.rs @@ -6,6 +6,6 @@ fn main() -> Result<(), Box> { // the build then. Anyway, per cargo docs build script shouldn't output to // anywhere but $OUT_DIR. 
tonic_build::compile_protos("proto/interpreted_wal.proto") - .unwrap_or_else(|e| panic!("failed to compile protos {:?}", e)); + .unwrap_or_else(|e| panic!("failed to compile protos {e:?}")); Ok(()) } diff --git a/libs/wal_decoder/src/models/record.rs b/libs/wal_decoder/src/models/record.rs index 73516c5220..51659ed904 100644 --- a/libs/wal_decoder/src/models/record.rs +++ b/libs/wal_decoder/src/models/record.rs @@ -128,6 +128,6 @@ pub fn describe_wal_record(rec: &NeonWalRecord) -> Result Ok(format!("{:?}", rec)), + _ => Ok(format!("{rec:?}")), } } diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs index b89f1877fd..7c6abf252e 100644 --- a/libs/walproposer/src/api_bindings.rs +++ b/libs/walproposer/src/api_bindings.rs @@ -376,7 +376,7 @@ impl Level { FATAL => Level::Fatal, PANIC => Level::Panic, WPEVENT => Level::WPEvent, - _ => panic!("unknown log level {}", elevel), + _ => panic!("unknown log level {elevel}"), } } } @@ -446,7 +446,7 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState { impl std::fmt::Display for Level { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(f, "{:?}", self) + write!(f, "{self:?}") } } diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index c853658ddf..93bb0d5eb0 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -380,7 +380,7 @@ mod tests { } fn conn_send_query(&self, _: &mut crate::bindings::Safekeeper, query: &str) -> bool { - println!("conn_send_query: {}", query); + println!("conn_send_query: {query}"); true } @@ -399,13 +399,13 @@ mod tests { ) -> crate::bindings::PGAsyncReadResult { println!("conn_async_read"); let reply = self.next_safekeeper_reply(); - println!("conn_async_read result: {:?}", reply); + println!("conn_async_read result: {reply:?}"); vec.extend_from_slice(reply); crate::bindings::PGAsyncReadResult_PG_ASYNC_READ_SUCCESS } fn conn_blocking_write(&self, _: &mut 
crate::bindings::Safekeeper, buf: &[u8]) -> bool { - println!("conn_blocking_write: {:?}", buf); + println!("conn_blocking_write: {buf:?}"); self.check_walproposer_msg(buf); true } @@ -456,10 +456,7 @@ mod tests { timeout_millis: i64, ) -> super::WaitResult { let data = self.wait_events.get(); - println!( - "wait_event_set, timeout_millis={}, res={:?}", - timeout_millis, data - ); + println!("wait_event_set, timeout_millis={timeout_millis}, res={data:?}"); super::WaitResult::Network(data.sk, data.event_mask) } @@ -475,7 +472,7 @@ mod tests { } fn log_internal(&self, _wp: &mut crate::bindings::WalProposer, level: Level, msg: &str) { - println!("wp_log[{}] {}", level, msg); + println!("wp_log[{level}] {msg}"); } fn after_election(&self, _wp: &mut crate::bindings::WalProposer) { diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 219e63c9d4..8b091684eb 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -508,11 +508,11 @@ impl Client { .expect("Cannot build URL"); path.query_pairs_mut() - .append_pair("recurse", &format!("{}", recurse)); + .append_pair("recurse", &format!("{recurse}")); if let Some(concurrency) = concurrency { path.query_pairs_mut() - .append_pair("concurrency", &format!("{}", concurrency)); + .append_pair("concurrency", &format!("{concurrency}")); } self.request(Method::POST, path, ()).await.map(|_| ()) diff --git a/pageserver/compaction/src/simulator/draw.rs b/pageserver/compaction/src/simulator/draw.rs index 3d35d1b91e..8322fe7d6d 100644 --- a/pageserver/compaction/src/simulator/draw.rs +++ b/pageserver/compaction/src/simulator/draw.rs @@ -152,7 +152,7 @@ pub fn draw_history(history: &[LayerTraceEvent], mut output: let key_diff = key_end - key_start; if key_start >= key_end { - panic!("Invalid key range {}-{}", key_start, key_end); + panic!("Invalid key range {key_start}-{key_end}"); } let lsn_start = lsn_map.map(f.lsn_range.start); @@ -212,12 +212,12 @@ pub fn 
draw_history(history: &[LayerTraceEvent], mut output: )?; writeln!(svg, "")?; } - Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end), + Ordering::Greater => panic!("Invalid lsn range {lsn_start}-{lsn_end}"), } files_seen.insert(f); } - writeln!(svg, "{}", EndSvg)?; + writeln!(svg, "{EndSvg}")?; let mut layer_events_str = String::new(); let mut first = true; diff --git a/pageserver/ctl/src/draw_timeline_dir.rs b/pageserver/ctl/src/draw_timeline_dir.rs index 881ebd49a7..2135d302c1 100644 --- a/pageserver/ctl/src/draw_timeline_dir.rs +++ b/pageserver/ctl/src/draw_timeline_dir.rs @@ -228,7 +228,7 @@ pub fn main() -> Result<()> { let lsn_max = lsn_map.len(); if key_start >= key_end { - panic!("Invalid key range {}-{}", key_start, key_end); + panic!("Invalid key range {key_start}-{key_end}"); } let lsn_start = *lsn_map.get(&lsnr.start).unwrap(); @@ -250,7 +250,7 @@ pub fn main() -> Result<()> { ymargin = 0.05; fill = Fill::Color(rgb(0, 0, 0)); } - Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end), + Ordering::Greater => panic!("Invalid lsn range {lsn_start}-{lsn_end}"), } println!( @@ -287,10 +287,10 @@ pub fn main() -> Result<()> { ); } - println!("{}", EndSvg); + println!("{EndSvg}"); - eprintln!("num_images: {}", num_images); - eprintln!("num_deltas: {}", num_deltas); + eprintln!("num_images: {num_images}"); + eprintln!("num_deltas: {num_deltas}"); Ok(()) } diff --git a/pageserver/ctl/src/key.rs b/pageserver/ctl/src/key.rs index 600f7c412e..c4daafdfd0 100644 --- a/pageserver/ctl/src/key.rs +++ b/pageserver/ctl/src/key.rs @@ -372,7 +372,7 @@ impl std::fmt::Debug for RelTagish { f.write_char('/')?; } first = false; - write!(f, "{}", x) + write!(f, "{x}") }) } } diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index c49c8b58df..ef844fbd0f 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -224,8 +224,7 @@ pub(crate) async 
fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> { } } println!( - "Total delta layers {} image layers {} excess layers {}", - total_delta_layers, total_image_layers, total_excess_layers + "Total delta layers {total_delta_layers} image layers {total_image_layers} excess layers {total_excess_layers}" ); Ok(()) } diff --git a/pageserver/page_api/src/client.rs b/pageserver/page_api/src/client.rs index aa4774c056..3977ce7c23 100644 --- a/pageserver/page_api/src/client.rs +++ b/pageserver/page_api/src/client.rs @@ -131,7 +131,7 @@ impl Client { let domain_stream = response_stream.map(|chunk_res| { chunk_res.and_then(|proto_chunk| { proto_chunk.try_into().map_err(|e| { - tonic::Status::internal(format!("Failed to convert response chunk: {}", e)) + tonic::Status::internal(format!("Failed to convert response chunk: {e}")) }) }) }); diff --git a/pageserver/pagebench/src/cmd/aux_files.rs b/pageserver/pagebench/src/cmd/aux_files.rs index 6441c047c2..43d7a73399 100644 --- a/pageserver/pagebench/src/cmd/aux_files.rs +++ b/pageserver/pagebench/src/cmd/aux_files.rs @@ -62,7 +62,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> { let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id); let timeline_id = timeline.timeline_id; - println!("operating on timeline {}", timeline); + println!("operating on timeline {timeline}"); mgmt_api_client .set_tenant_config(&TenantConfigRequest { @@ -75,8 +75,8 @@ async fn main_impl(args: Args) -> anyhow::Result<()> { let items = (0..100) .map(|id| { ( - format!("pg_logical/mappings/{:03}.{:03}", batch, id), - format!("{:08}", id), + format!("pg_logical/mappings/{batch:03}.{id:03}"), + format!("{id:08}"), ) }) .collect::>(); diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index fe136b8bbd..c4aaff58a1 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -669,7 +669,7 @@ where } // Append dir path for each database - let path = format!("base/{}", dbnode); + let path = 
format!("base/{dbnode}"); let header = new_tar_header_dir(&path)?; self.ar .append(&header, io::empty()) @@ -677,7 +677,7 @@ where .map_err(|e| BasebackupError::Client(e, "add_dbdir,base"))?; if let Some(img) = relmap_img { - let dst_path = format!("base/{}/PG_VERSION", dbnode); + let dst_path = format!("base/{dbnode}/PG_VERSION"); let pg_version_str = match self.timeline.pg_version { 14 | 15 => self.timeline.pg_version.to_string(), @@ -689,7 +689,7 @@ where .await .map_err(|e| BasebackupError::Client(e, "add_dbdir,base/PG_VERSION"))?; - let relmap_path = format!("base/{}/pg_filenode.map", dbnode); + let relmap_path = format!("base/{dbnode}/pg_filenode.map"); let header = new_tar_header(&relmap_path, img.len() as u64)?; self.ar .append(&header, &img[..]) @@ -714,9 +714,9 @@ where let crc = crc32c::crc32c(&img[..]); buf.put_u32_le(crc); let path = if self.timeline.pg_version < 17 { - format!("pg_twophase/{:>08X}", xid) + format!("pg_twophase/{xid:>08X}") } else { - format!("pg_twophase/{:>016X}", xid) + format!("pg_twophase/{xid:>016X}") }; let header = new_tar_header(&path, buf.len() as u64)?; self.ar @@ -768,7 +768,7 @@ where //send wal segment let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE); let wal_file_name = XLogFileName(PG_TLI, segno, WAL_SEGMENT_SIZE); - let wal_file_path = format!("pg_wal/{}", wal_file_name); + let wal_file_path = format!("pg_wal/{wal_file_name}"); let header = new_tar_header(&wal_file_path, WAL_SEGMENT_SIZE as u64)?; let wal_seg = postgres_ffi::generate_wal_segment( diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 2e9cd3ad70..349bc6dba6 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -287,11 +287,11 @@ impl From for ApiError { GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. 
}) => { ApiError::ShuttingDown } - GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{}", e)), + GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{e}")), GetActiveTenantError::Cancelled => ApiError::ShuttingDown, GetActiveTenantError::NotFound(gte) => gte.into(), GetActiveTenantError::WaitForActiveTimeout { .. } => { - ApiError::ResourceUnavailable(format!("{}", e).into()) + ApiError::ResourceUnavailable(format!("{e}").into()) } GetActiveTenantError::SwitchedTenant => { // in our HTTP handlers, this error doesn't happen @@ -1015,7 +1015,7 @@ async fn get_lsn_by_timestamp_handler( let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let timestamp_raw = must_get_query_param(&request, "timestamp")?; let timestamp = humantime::parse_rfc3339(×tamp_raw) - .with_context(|| format!("Invalid time: {:?}", timestamp_raw)) + .with_context(|| format!("Invalid time: {timestamp_raw:?}")) .map_err(ApiError::BadRequest)?; let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp); @@ -1110,7 +1110,7 @@ async fn get_timestamp_of_lsn_handler( json_response(StatusCode::OK, time) } None => Err(ApiError::PreconditionFailed( - format!("Timestamp for lsn {} not found", lsn).into(), + format!("Timestamp for lsn {lsn} not found").into(), )), } } @@ -2421,7 +2421,7 @@ async fn timeline_offload_handler( } if let (false, reason) = timeline.can_offload() { return Err(ApiError::PreconditionFailed( - format!("Timeline::can_offload() check failed: {}", reason) .into(), + format!("Timeline::can_offload() check failed: {reason}") .into(), )); } offload_timeline(&tenant, &timeline) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 8d6d342cf9..7929b094b4 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1727,12 +1727,7 @@ impl Drop for SmgrOpTimer { impl SmgrOpFlushInProgress { /// The caller must guarantee that `socket_fd`` outlives this function. 
- pub(crate) async fn measure( - self, - started_at: Instant, - mut fut: Fut, - socket_fd: RawFd, - ) -> O + pub(crate) async fn measure(self, started_at: Instant, fut: Fut, socket_fd: RawFd) -> O where Fut: std::future::Future, { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 032db34983..d3a1ca681e 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -392,16 +392,14 @@ async fn page_service_conn_main( } else { let tenant_id = conn_handler.timeline_handles.as_ref().unwrap().tenant_id(); Err(io_error).context(format!( - "Postgres connection error for tenant_id={:?} client at peer_addr={}", - tenant_id, peer_addr + "Postgres connection error for tenant_id={tenant_id:?} client at peer_addr={peer_addr}" )) } } other => { let tenant_id = conn_handler.timeline_handles.as_ref().unwrap().tenant_id(); other.context(format!( - "Postgres query error for tenant_id={:?} client peer_addr={}", - tenant_id, peer_addr + "Postgres query error for tenant_id={tenant_id:?} client peer_addr={peer_addr}" )) } } @@ -2140,8 +2138,7 @@ impl PageServerHandler { if request_lsn < not_modified_since { return Err(PageStreamError::BadRequest( format!( - "invalid request with request LSN {} and not_modified_since {}", - request_lsn, not_modified_since, + "invalid request with request LSN {request_lsn} and not_modified_since {not_modified_since}", ) .into(), )); diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 58af2548ee..180a5b76e8 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1185,7 +1185,7 @@ impl Timeline { } let origin_id = k.field6 as RepOriginId; let origin_lsn = Lsn::des(&v) - .with_context(|| format!("decode replorigin value for {}: {v:?}", origin_id))?; + .with_context(|| format!("decode replorigin value for {origin_id}: {v:?}"))?; if origin_lsn != Lsn::INVALID { result.insert(origin_id, origin_lsn); } @@ -2440,8 +2440,7 @@ impl 
DatadirModification<'_> { if path == p { assert!( modifying_file.is_none(), - "duplicated entries found for {}", - path + "duplicated entries found for {path}" ); modifying_file = Some(content); } else { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d2c2fdef93..a48bf15246 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3449,7 +3449,7 @@ impl TenantShard { use pageserver_api::models::ActivatingFrom; match &*current_state { TenantState::Activating(_) | TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => { - panic!("caller is responsible for calling activate() only on Loading / Attaching tenants, got {state:?}", state = current_state); + panic!("caller is responsible for calling activate() only on Loading / Attaching tenants, got {current_state:?}"); } TenantState::Attaching => { *current_state = TenantState::Activating(ActivatingFrom::Attaching); @@ -6616,7 +6616,7 @@ mod tests { .put( *TEST_KEY, lsn, - &Value::Image(test_img(&format!("foo at {}", lsn))), + &Value::Image(test_img(&format!("foo at {lsn}"))), ctx, ) .await?; @@ -6626,7 +6626,7 @@ mod tests { .put( *TEST_KEY, lsn, - &Value::Image(test_img(&format!("foo at {}", lsn))), + &Value::Image(test_img(&format!("foo at {lsn}"))), ctx, ) .await?; @@ -6640,7 +6640,7 @@ mod tests { .put( *TEST_KEY, lsn, - &Value::Image(test_img(&format!("foo at {}", lsn))), + &Value::Image(test_img(&format!("foo at {lsn}"))), ctx, ) .await?; @@ -6650,7 +6650,7 @@ mod tests { .put( *TEST_KEY, lsn, - &Value::Image(test_img(&format!("foo at {}", lsn))), + &Value::Image(test_img(&format!("foo at {lsn}"))), ctx, ) .await?; @@ -7149,7 +7149,7 @@ mod tests { .put( test_key, lsn, - &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &Value::Image(test_img(&format!("{blknum} at {lsn}"))), ctx, ) .await?; @@ -7437,7 +7437,7 @@ mod tests { .put( gap_at_key, current_lsn, - &Value::Image(test_img(&format!("{} at {}", gap_at_key, current_lsn))), + 
&Value::Image(test_img(&format!("{gap_at_key} at {current_lsn}"))), &ctx, ) .await?; @@ -7476,7 +7476,7 @@ mod tests { .put( current_key, current_lsn, - &Value::Image(test_img(&format!("{} at {}", current_key, current_lsn))), + &Value::Image(test_img(&format!("{current_key} at {current_lsn}"))), &ctx, ) .await?; @@ -7584,7 +7584,7 @@ mod tests { while key < end_key { current_lsn += 0x10; - let image_value = format!("{} at {}", child_gap_at_key, current_lsn); + let image_value = format!("{child_gap_at_key} at {current_lsn}"); let mut writer = parent_timeline.writer().await; writer @@ -7627,7 +7627,7 @@ mod tests { .put( key, current_lsn, - &Value::Image(test_img(&format!("{} at {}", key, current_lsn))), + &Value::Image(test_img(&format!("{key} at {current_lsn}"))), &ctx, ) .await?; @@ -7748,7 +7748,7 @@ mod tests { .put( test_key, lsn, - &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &Value::Image(test_img(&format!("{blknum} at {lsn}"))), &ctx, ) .await?; @@ -7769,7 +7769,7 @@ mod tests { .put( test_key, lsn, - &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &Value::Image(test_img(&format!("{blknum} at {lsn}"))), &ctx, ) .await?; @@ -7783,7 +7783,7 @@ mod tests { test_key.field6 = blknum as u32; assert_eq!( tline.get(test_key, lsn, &ctx).await?, - test_img(&format!("{} at {}", blknum, last_lsn)) + test_img(&format!("{blknum} at {last_lsn}")) ); } @@ -7829,7 +7829,7 @@ mod tests { .put( test_key, lsn, - &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &Value::Image(test_img(&format!("{blknum} at {lsn}"))), &ctx, ) .await?; @@ -7858,11 +7858,11 @@ mod tests { .put( test_key, lsn, - &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &Value::Image(test_img(&format!("{blknum} at {lsn}"))), &ctx, ) .await?; - println!("updating {} at {}", blknum, lsn); + println!("updating {blknum} at {lsn}"); writer.finish_write(lsn); drop(writer); updated[blknum] = lsn; @@ -7873,7 +7873,7 @@ mod tests { test_key.field6 = blknum as 
u32; assert_eq!( tline.get(test_key, lsn, &ctx).await?, - test_img(&format!("{} at {}", blknum, last_lsn)) + test_img(&format!("{blknum} at {last_lsn}")) ); } @@ -7926,11 +7926,11 @@ mod tests { .put( test_key, lsn, - &Value::Image(test_img(&format!("{} {} at {}", idx, blknum, lsn))), + &Value::Image(test_img(&format!("{idx} {blknum} at {lsn}"))), &ctx, ) .await?; - println!("updating [{}][{}] at {}", idx, blknum, lsn); + println!("updating [{idx}][{blknum}] at {lsn}"); writer.finish_write(lsn); drop(writer); updated[idx][blknum] = lsn; @@ -8136,7 +8136,7 @@ mod tests { .put( test_key, lsn, - &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &Value::Image(test_img(&format!("{blknum} at {lsn}"))), &ctx, ) .await?; @@ -8153,7 +8153,7 @@ mod tests { test_key.field6 = (blknum * STEP) as u32; assert_eq!( tline.get(test_key, lsn, &ctx).await?, - test_img(&format!("{} at {}", blknum, last_lsn)) + test_img(&format!("{blknum} at {last_lsn}")) ); } @@ -8190,7 +8190,7 @@ mod tests { .put( test_key, lsn, - &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &Value::Image(test_img(&format!("{blknum} at {lsn}"))), &ctx, ) .await?; @@ -8443,7 +8443,7 @@ mod tests { .put( test_key, lsn, - &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &Value::Image(test_img(&format!("{blknum} at {lsn}"))), &ctx, ) .await?; @@ -8463,7 +8463,7 @@ mod tests { .put( test_key, lsn, - &Value::Image(test_img(&format!("{} at {}", blknum, lsn))), + &Value::Image(test_img(&format!("{blknum} at {lsn}"))), &ctx, ) .await?; @@ -9384,12 +9384,7 @@ mod tests { let end_lsn = Lsn(0x100); let image_layers = (0x20..=0x90) .step_by(0x10) - .map(|n| { - ( - Lsn(n), - vec![(key, test_img(&format!("data key at {:x}", n)))], - ) - }) + .map(|n| (Lsn(n), vec![(key, test_img(&format!("data key at {n:x}")))])) .collect(); let timeline = tenant diff --git a/pageserver/src/tenant/checks.rs b/pageserver/src/tenant/checks.rs index d5b979ab2a..83d54f09de 100644 --- 
a/pageserver/src/tenant/checks.rs +++ b/pageserver/src/tenant/checks.rs @@ -63,8 +63,7 @@ pub fn check_valid_layermap(metadata: &[LayerName]) -> Option { && overlaps_with(&layer.key_range, &other_layer.key_range) { let err = format!( - "layer violates the layer map LSN split assumption: layer {} intersects with layer {}", - layer, other_layer + "layer violates the layer map LSN split assumption: layer {layer} intersects with layer {other_layer}" ); return Some(err); } diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index bea3128265..5081d7f5a4 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -550,8 +550,7 @@ mod tests { assert_eq!( deserialized_metadata.body, expected_metadata.body, - "Metadata of the old version {} should be upgraded to the latest version {}", - METADATA_OLD_FORMAT_VERSION, METADATA_FORMAT_VERSION + "Metadata of the old version {METADATA_OLD_FORMAT_VERSION} should be upgraded to the latest version {METADATA_FORMAT_VERSION}" ); } diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index dd49c843f3..6b315dc4bc 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -1427,7 +1427,7 @@ async fn init_timeline_state( let local_meta = dentry .metadata() .await - .fatal_err(&format!("Read metadata on {}", file_path)); + .fatal_err(&format!("Read metadata on {file_path}")); let file_name = file_path.file_name().expect("created it from the dentry"); if crate::is_temporary(&file_path) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index ba763d4c3f..0f31318f0c 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -783,7 +783,7 @@ impl DeltaLayer { ctx, ) .await - .with_context(|| format!("Failed to open file '{}'", path))?; + 
.with_context(|| format!("Failed to open file '{path}'"))?; let file_id = page_cache::next_file_id(); let block_reader = FileBlockReader::new(&file, file_id); let summary_blk = block_reader.read_blk(0, ctx).await?; @@ -1401,7 +1401,7 @@ impl DeltaLayerInner { match val { Value::Image(img) => { let checkpoint = CheckPoint::decode(&img)?; - println!(" CHECKPOINT: {:?}", checkpoint); + println!(" CHECKPOINT: {checkpoint:?}"); } Value::WalRecord(_rec) => { println!(" unexpected walrecord value for checkpoint key"); diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index d6f5f48a6e..9f76f697d3 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -272,8 +272,7 @@ impl ImageLayer { conf.timeline_path(&tenant_shard_id, &timeline_id) .join(format!( - "{fname}.{:x}.{TEMP_FILE_SUFFIX}", - filename_disambiguator + "{fname}.{filename_disambiguator:x}.{TEMP_FILE_SUFFIX}" )) } @@ -370,7 +369,7 @@ impl ImageLayer { ctx, ) .await - .with_context(|| format!("Failed to open file '{}'", path))?; + .with_context(|| format!("Failed to open file '{path}'"))?; let file_id = page_cache::next_file_id(); let block_reader = FileBlockReader::new(&file, file_id); let summary_blk = block_reader.read_blk(0, ctx).await?; @@ -1475,7 +1474,7 @@ mod test { assert_eq!(l1, expect_lsn); assert_eq!(&i1, i2); } - (o1, o2) => panic!("iterators length mismatch: {:?}, {:?}", o1, o2), + (o1, o2) => panic!("iterators length mismatch: {o1:?}, {o2:?}"), } } } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 8e5b0ba648..c4d53c6405 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -511,7 +511,7 @@ fn inmem_layer_log_display( start_lsn: Lsn, end_lsn: Lsn, ) -> std::fmt::Result { - write!(f, "timeline {} in-memory ", 
timeline)?; + write!(f, "timeline {timeline} in-memory ")?; inmem_layer_display(f, start_lsn, end_lsn) } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs index ea354fc716..27fbc6f5fb 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs @@ -380,7 +380,7 @@ impl std::fmt::Debug for LogicalReadState { write!(f, "Ongoing({:?})", BufferDebug::from(b as &dyn Buffer)) } LogicalReadState::Ok(b) => write!(f, "Ok({:?})", BufferDebug::from(b as &dyn Buffer)), - LogicalReadState::Error(e) => write!(f, "Error({:?})", e), + LogicalReadState::Error(e) => write!(f, "Error({e:?})"), LogicalReadState::Undefined => write!(f, "Undefined"), } } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 3d55972017..0be13e67a8 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -105,7 +105,7 @@ impl std::fmt::Display for Layer { impl std::fmt::Debug for Layer { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self) + write!(f, "{self}") } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 1bb17af146..81f2646e5a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -178,7 +178,7 @@ pub enum LastImageLayerCreationStatus { impl std::fmt::Display for ImageLayerCreationMode { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{:?}", self) + write!(f, "{self:?}") } } @@ -632,7 +632,7 @@ pub enum ReadPathLayerId { impl std::fmt::Display for ReadPathLayerId { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - ReadPathLayerId::PersistentLayer(key) => write!(f, "{}", key), + 
ReadPathLayerId::PersistentLayer(key) => write!(f, "{key}"), ReadPathLayerId::InMemoryLayer(range) => { write!(f, "in-mem {}..{}", range.start, range.end) } @@ -708,7 +708,7 @@ impl MissingKeyError { impl std::fmt::Debug for MissingKeyError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self) + write!(f, "{self}") } } @@ -721,19 +721,19 @@ impl std::fmt::Display for MissingKeyError { )?; if let Some(ref ancestor_lsn) = self.ancestor_lsn { - write!(f, ", ancestor {}", ancestor_lsn)?; + write!(f, ", ancestor {ancestor_lsn}")?; } if let Some(ref query) = self.query { - write!(f, ", query {}", query)?; + write!(f, ", query {query}")?; } if let Some(ref read_path) = self.read_path { - write!(f, "\n{}", read_path)?; + write!(f, "\n{read_path}")?; } if let Some(ref backtrace) = self.backtrace { - write!(f, "\n{}", backtrace)?; + write!(f, "\n{backtrace}")?; } Ok(()) @@ -7179,9 +7179,7 @@ impl Timeline { if let Some(end) = layer_end_lsn { assert!( end <= last_record_lsn, - "advance last record lsn before inserting a layer, end_lsn={}, last_record_lsn={}", - end, - last_record_lsn, + "advance last record lsn before inserting a layer, end_lsn={end}, last_record_lsn={last_record_lsn}", ); } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 6039c002f7..1b8e5f4b9c 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -977,7 +977,7 @@ impl KeyHistoryRetention { tline .reconstruct_value(key, lsn, data, RedoAttemptType::GcCompaction) .await - .with_context(|| format!("verification failed for key {} at lsn {}", key, lsn))?; + .with_context(|| format!("verification failed for key {key} at lsn {lsn}"))?; Ok(()) } @@ -2647,15 +2647,15 @@ impl Timeline { use std::fmt::Write; let mut output = String::new(); if let Some((key, _, _)) = replay_history.first() { - write!(output, "key={} ", key).unwrap(); + write!(output, "key={key} 
").unwrap(); let mut cnt = 0; for (_, lsn, val) in replay_history { if val.is_image() { - write!(output, "i@{} ", lsn).unwrap(); + write!(output, "i@{lsn} ").unwrap(); } else if val.will_init() { - write!(output, "di@{} ", lsn).unwrap(); + write!(output, "di@{lsn} ").unwrap(); } else { - write!(output, "d@{} ", lsn).unwrap(); + write!(output, "d@{lsn} ").unwrap(); } cnt += 1; if cnt >= 128 { diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 343e04f5f0..e91bd5d43a 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -360,8 +360,7 @@ pub(super) async fn handle_walreceiver_connection( match raw_wal_start_lsn.cmp(&expected_wal_start) { std::cmp::Ordering::Greater => { let msg = format!( - "Gap in streamed WAL: [{}, {})", - expected_wal_start, raw_wal_start_lsn + "Gap in streamed WAL: [{expected_wal_start}, {raw_wal_start_lsn})" ); critical!("{msg}"); return Err(WalReceiverError::Other(anyhow!(msg))); diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs index 090d2ece85..85ea5c4d80 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs @@ -68,16 +68,9 @@ impl AlignedBuffer { assert!( begin <= end, - "range start must not be greater than end: {:?} <= {:?}", - begin, - end, - ); - assert!( - end <= len, - "range end out of bounds: {:?} <= {:?}", - end, - len, + "range start must not be greater than end: {begin:?} <= {end:?}", ); + assert!(end <= len, "range end out of bounds: {end:?} <= {len:?}",); let begin = self.range.start + begin; let end = self.range.start + end; diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs 
b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs index 07f949b89e..93116ea85e 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs @@ -242,10 +242,7 @@ unsafe impl bytes::BufMut for AlignedBufferMut { /// Panic with a nice error message. #[cold] fn panic_advance(idx: usize, len: usize) -> ! { - panic!( - "advance out of bounds: the len is {} but advancing by {}", - len, idx - ); + panic!("advance out of bounds: the len is {len} but advancing by {idx}"); } /// Safety: [`AlignedBufferMut`] has exclusive ownership of the io buffer, diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index ebffaf70e2..c452f48e40 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -2108,7 +2108,7 @@ mod tests { // Check relation content for blkno in 0..relsize { let lsn = Lsn(0x20); - let data = format!("foo blk {} at {}", blkno, lsn); + let data = format!("foo blk {blkno} at {lsn}"); assert_eq!( tline .get_rel_page_at_lsn( @@ -2142,7 +2142,7 @@ mod tests { for blkno in 0..1 { let lsn = Lsn(0x20); - let data = format!("foo blk {} at {}", blkno, lsn); + let data = format!("foo blk {blkno} at {lsn}"); assert_eq!( tline .get_rel_page_at_lsn( @@ -2167,7 +2167,7 @@ mod tests { ); for blkno in 0..relsize { let lsn = Lsn(0x20); - let data = format!("foo blk {} at {}", blkno, lsn); + let data = format!("foo blk {blkno} at {lsn}"); assert_eq!( tline .get_rel_page_at_lsn( @@ -2188,7 +2188,7 @@ mod tests { let lsn = Lsn(0x80); let mut m = tline.begin_modification(lsn); for blkno in 0..relsize { - let data = format!("foo blk {} at {}", blkno, lsn); + let data = format!("foo blk {blkno} at {lsn}"); walingest .put_rel_page_image(&mut m, TESTREL_A, blkno, test_img(&data), &ctx) .await?; @@ -2210,7 +2210,7 @@ mod tests { // Check relation content for blkno in 0..relsize { let lsn = Lsn(0x80); - let data = format!("foo 
blk {} at {}", blkno, lsn); + let data = format!("foo blk {blkno} at {lsn}"); assert_eq!( tline .get_rel_page_at_lsn( @@ -2414,6 +2414,6 @@ mod tests { } let duration = started_at.elapsed(); - println!("done in {:?}", duration); + println!("done in {duration:?}"); } } diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs index 0783c77622..a525579082 100644 --- a/pageserver/src/walredo/apply_neon.rs +++ b/pageserver/src/walredo/apply_neon.rs @@ -52,8 +52,7 @@ pub(crate) fn apply_in_neon( let (rel, _) = key.to_rel_block().context("invalid record")?; assert!( rel.forknum == VISIBILITYMAP_FORKNUM, - "TruncateVisibilityMap record on unexpected rel {}", - rel + "TruncateVisibilityMap record on unexpected rel {rel}" ); let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..]; map[*trunc_byte + 1..].fill(0u8); @@ -78,8 +77,7 @@ pub(crate) fn apply_in_neon( let (rel, blknum) = key.to_rel_block().context("invalid record")?; assert!( rel.forknum == VISIBILITYMAP_FORKNUM, - "ClearVisibilityMapFlags record on unexpected rel {}", - rel + "ClearVisibilityMapFlags record on unexpected rel {rel}" ); if let Some(heap_blkno) = *new_heap_blkno { // Calculate the VM block and offset that corresponds to the heap block. @@ -124,8 +122,7 @@ pub(crate) fn apply_in_neon( assert_eq!( slru_kind, SlruKind::Clog, - "ClogSetCommitted record with unexpected key {}", - key + "ClogSetCommitted record with unexpected key {key}" ); for &xid in xids { let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE; @@ -135,15 +132,11 @@ pub(crate) fn apply_in_neon( // Check that we're modifying the correct CLOG block. 
assert!( segno == expected_segno, - "ClogSetCommitted record for XID {} with unexpected key {}", - xid, - key + "ClogSetCommitted record for XID {xid} with unexpected key {key}" ); assert!( blknum == expected_blknum, - "ClogSetCommitted record for XID {} with unexpected key {}", - xid, - key + "ClogSetCommitted record for XID {xid} with unexpected key {key}" ); transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_COMMITTED, page); @@ -169,8 +162,7 @@ pub(crate) fn apply_in_neon( assert_eq!( slru_kind, SlruKind::Clog, - "ClogSetAborted record with unexpected key {}", - key + "ClogSetAborted record with unexpected key {key}" ); for &xid in xids { let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE; @@ -180,15 +172,11 @@ pub(crate) fn apply_in_neon( // Check that we're modifying the correct CLOG block. assert!( segno == expected_segno, - "ClogSetAborted record for XID {} with unexpected key {}", - xid, - key + "ClogSetAborted record for XID {xid} with unexpected key {key}" ); assert!( blknum == expected_blknum, - "ClogSetAborted record for XID {} with unexpected key {}", - xid, - key + "ClogSetAborted record for XID {xid} with unexpected key {key}" ); transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_ABORTED, page); @@ -199,8 +187,7 @@ pub(crate) fn apply_in_neon( assert_eq!( slru_kind, SlruKind::MultiXactOffsets, - "MultixactOffsetCreate record with unexpected key {}", - key + "MultixactOffsetCreate record with unexpected key {key}" ); // Compute the block and offset to modify. // See RecordNewMultiXact in PostgreSQL sources. 
@@ -213,15 +200,11 @@ pub(crate) fn apply_in_neon( let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; assert!( segno == expected_segno, - "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}", - mid, - key + "MultiXactOffsetsCreate record for multi-xid {mid} with unexpected key {key}" ); assert!( blknum == expected_blknum, - "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}", - mid, - key + "MultiXactOffsetsCreate record for multi-xid {mid} with unexpected key {key}" ); LittleEndian::write_u32(&mut page[offset..offset + 4], *moff); @@ -231,8 +214,7 @@ pub(crate) fn apply_in_neon( assert_eq!( slru_kind, SlruKind::MultiXactMembers, - "MultixactMembersCreate record with unexpected key {}", - key + "MultixactMembersCreate record with unexpected key {key}" ); for (i, member) in members.iter().enumerate() { let offset = moff + i as u32; @@ -249,15 +231,11 @@ pub(crate) fn apply_in_neon( let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; assert!( segno == expected_segno, - "MultiXactMembersCreate record for offset {} with unexpected key {}", - moff, - key + "MultiXactMembersCreate record for offset {moff} with unexpected key {key}" ); assert!( blknum == expected_blknum, - "MultiXactMembersCreate record for offset {} with unexpected key {}", - moff, - key + "MultiXactMembersCreate record for offset {moff} with unexpected key {key}" ); let mut flagsval = LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]); diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index d96f582fad..263d784e78 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -61,6 +61,10 @@ clippy::too_many_lines, clippy::unused_self )] +#![allow( + clippy::unsafe_derive_deserialize, + reason = "false positive: https://github.com/rust-lang/rust-clippy/issues/15120" +)] #![cfg_attr( any(test, feature = "testing"), allow( diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index 1bf3e4cac1..4fc62fb229 100644 --- 
a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -206,16 +206,10 @@ impl Storage for FileStorage { let buf: Vec = s.write_to_buf()?; control_partial.write_all(&buf).await.with_context(|| { - format!( - "failed to write safekeeper state into control file at: {}", - control_partial_path - ) + format!("failed to write safekeeper state into control file at: {control_partial_path}") })?; control_partial.flush().await.with_context(|| { - format!( - "failed to flush safekeeper state into control file at: {}", - control_partial_path - ) + format!("failed to flush safekeeper state into control file at: {control_partial_path}") })?; let control_path = self.timeline_dir.join(CONTROL_FILE_NAME); diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index b54bee8bfb..5e7f1d8758 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -73,7 +73,7 @@ fn parse_cmd(cmd: &str) -> anyhow::Result { let re = Regex::new(r"START_WAL_PUSH(\s+?\((.*)\))?").unwrap(); let caps = re .captures(cmd) - .context(format!("failed to parse START_WAL_PUSH command {}", cmd))?; + .context(format!("failed to parse START_WAL_PUSH command {cmd}"))?; // capture () content let options = caps.get(2).map(|m| m.as_str()).unwrap_or(""); // default values @@ -85,24 +85,20 @@ fn parse_cmd(cmd: &str) -> anyhow::Result { } let mut kvit = kvstr.split_whitespace(); let key = kvit.next().context(format!( - "failed to parse key in kv {} in command {}", - kvstr, cmd + "failed to parse key in kv {kvstr} in command {cmd}" ))?; let value = kvit.next().context(format!( - "failed to parse value in kv {} in command {}", - kvstr, cmd + "failed to parse value in kv {kvstr} in command {cmd}" ))?; let value_trimmed = value.trim_matches('\''); if key == "proto_version" { proto_version = value_trimmed.parse::().context(format!( - "failed to parse proto_version value {} in command {}", - value, cmd + "failed to parse proto_version value {value} in command {cmd}" ))?; } if key == 
"allow_timeline_creation" { allow_timeline_creation = value_trimmed.parse::().context(format!( - "failed to parse allow_timeline_creation value {} in command {}", - value, cmd + "failed to parse allow_timeline_creation value {value} in command {cmd}" ))?; } } @@ -118,7 +114,7 @@ fn parse_cmd(cmd: &str) -> anyhow::Result { .unwrap(); let caps = re .captures(cmd) - .context(format!("failed to parse START_REPLICATION command {}", cmd))?; + .context(format!("failed to parse START_REPLICATION command {cmd}"))?; let start_lsn = Lsn::from_str(&caps[1]).context("parse start LSN from START_REPLICATION command")?; let term = if let Some(m) = caps.get(2) { diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 886cac869d..aa19b6d283 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -64,10 +64,10 @@ impl TermHistory { for i in 0..n_entries { let term = bytes .get_u64_f() - .with_context(|| format!("TermHistory pos {} misses term", i))?; + .with_context(|| format!("TermHistory pos {i} misses term"))?; let lsn = bytes .get_u64_f() - .with_context(|| format!("TermHistory pos {} misses lsn", i))? + .with_context(|| format!("TermHistory pos {i} misses lsn"))? 
.into(); res.push(TermLsn { term, lsn }) } @@ -121,9 +121,7 @@ impl TermHistory { if let Some(sk_th_last) = sk_th.last() { assert!( sk_th_last.lsn <= sk_wal_end, - "safekeeper term history end {:?} LSN is higher than WAL end {:?}", - sk_th_last, - sk_wal_end + "safekeeper term history end {sk_th_last:?} LSN is higher than WAL end {sk_wal_end:?}" ); } @@ -438,11 +436,11 @@ impl ProposerAcceptorMessage { for i in 0..members_len { let id = buf .get_u64_f() - .with_context(|| format!("reading member {} node_id", i))?; - let host = Self::get_cstr(buf).with_context(|| format!("reading member {} host", i))?; + .with_context(|| format!("reading member {i} node_id"))?; + let host = Self::get_cstr(buf).with_context(|| format!("reading member {i} host"))?; let pg_port = buf .get_u16_f() - .with_context(|| format!("reading member {} port", i))?; + .with_context(|| format!("reading member {i} port"))?; let sk = SafekeeperId { id: NodeId(id), host, @@ -463,12 +461,12 @@ impl ProposerAcceptorMessage { for i in 0..new_members_len { let id = buf .get_u64_f() - .with_context(|| format!("reading new member {} node_id", i))?; - let host = Self::get_cstr(buf) - .with_context(|| format!("reading new member {} host", i))?; + .with_context(|| format!("reading new member {i} node_id"))?; + let host = + Self::get_cstr(buf).with_context(|| format!("reading new member {i} host"))?; let pg_port = buf .get_u16_f() - .with_context(|| format!("reading new member {} port", i))?; + .with_context(|| format!("reading new member {i} port"))?; let sk = SafekeeperId { id: NodeId(id), host, @@ -1508,7 +1506,7 @@ mod tests { let mut vote_resp = sk.process_msg(&vote_request).await; match vote_resp.unwrap() { Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given), - r => panic!("unexpected response: {:?}", r), + r => panic!("unexpected response: {r:?}"), } // reboot... 
@@ -1523,7 +1521,7 @@ mod tests { vote_resp = sk.process_msg(&vote_request).await; match vote_resp.unwrap() { Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(!resp.vote_given), - r => panic!("unexpected response: {:?}", r), + r => panic!("unexpected response: {r:?}"), } } diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index e817dbf6f9..47b65a579a 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -342,7 +342,7 @@ where let bytes_read1 = reader1 .read(&mut buffer1[..bytes_to_read]) .await - .with_context(|| format!("failed to read from reader1 at offset {}", offset))?; + .with_context(|| format!("failed to read from reader1 at offset {offset}"))?; if bytes_read1 == 0 { anyhow::bail!("unexpected EOF from reader1 at offset {}", offset); } @@ -351,10 +351,7 @@ where .read_exact(&mut buffer2[..bytes_read1]) .await .with_context(|| { - format!( - "failed to read {} bytes from reader2 at offset {}", - bytes_read1, offset - ) + format!("failed to read {bytes_read1} bytes from reader2 at offset {offset}") })?; assert!(bytes_read2 == bytes_read1); diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index 48eda92fed..a68752bfdd 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -108,7 +108,7 @@ impl std::fmt::Debug for ManagerCtlMessage { match self { ManagerCtlMessage::GuardRequest(_) => write!(f, "GuardRequest"), ManagerCtlMessage::TryGuardRequest(_) => write!(f, "TryGuardRequest"), - ManagerCtlMessage::GuardDrop(id) => write!(f, "GuardDrop({:?})", id), + ManagerCtlMessage::GuardDrop(id) => write!(f, "GuardDrop({id:?})"), ManagerCtlMessage::BackupPartialReset(_) => write!(f, "BackupPartialReset"), } } diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 6e41ada1b3..a81a7298a9 100644 --- a/safekeeper/src/timelines_global_map.rs +++ 
b/safekeeper/src/timelines_global_map.rs @@ -147,7 +147,7 @@ impl GlobalTimelines { }; let mut tenant_count = 0; for tenants_dir_entry in std::fs::read_dir(&tenants_dir) - .with_context(|| format!("failed to list tenants dir {}", tenants_dir))? + .with_context(|| format!("failed to list tenants dir {tenants_dir}"))? { match &tenants_dir_entry { Ok(tenants_dir_entry) => { @@ -188,7 +188,7 @@ impl GlobalTimelines { let timelines_dir = get_tenant_dir(&conf, &tenant_id); for timelines_dir_entry in std::fs::read_dir(&timelines_dir) - .with_context(|| format!("failed to list timelines dir {}", timelines_dir))? + .with_context(|| format!("failed to list timelines dir {timelines_dir}"))? { match &timelines_dir_entry { Ok(timeline_dir_entry) => { diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index fe0f1b3607..cdf68262dd 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -364,8 +364,7 @@ impl PartialBackup { // there should always be zero or one uploaded segment assert!( new_segments.is_empty(), - "too many uploaded segments: {:?}", - new_segments + "too many uploaded segments: {new_segments:?}" ); } diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 8ba3e7cc47..e68c9f3a99 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -841,7 +841,7 @@ pub(crate) async fn open_wal_file( // If that failed, try it without the .partial extension. 
let pf = tokio::fs::File::open(&wal_file_path) .await - .with_context(|| format!("failed to open WAL file {:#}", wal_file_path)) + .with_context(|| format!("failed to open WAL file {wal_file_path:#}")) .map_err(|e| { warn!("{}", e); e diff --git a/safekeeper/tests/walproposer_sim/log.rs b/safekeeper/tests/walproposer_sim/log.rs index e2ba3282ca..cecbc859e6 100644 --- a/safekeeper/tests/walproposer_sim/log.rs +++ b/safekeeper/tests/walproposer_sim/log.rs @@ -33,7 +33,7 @@ impl FormatTime for SimClock { if let Some(clock) = clock.as_ref() { let now = clock.now(); - write!(w, "[{}]", now) + write!(w, "[{now}]") } else { write!(w, "[?]") } diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index 5fb29683f2..1fdf8e4949 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -257,7 +257,7 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { let estr = e.to_string(); if !estr.contains("finished processing START_REPLICATION") { warn!("conn {:?} error: {:?}", connection_id, e); - panic!("unexpected error at safekeeper: {:#}", e); + panic!("unexpected error at safekeeper: {e:#}"); } conns.remove(&connection_id); break; diff --git a/safekeeper/tests/walproposer_sim/simulation.rs b/safekeeper/tests/walproposer_sim/simulation.rs index 70fecfbe22..edd3bf2d9e 100644 --- a/safekeeper/tests/walproposer_sim/simulation.rs +++ b/safekeeper/tests/walproposer_sim/simulation.rs @@ -217,7 +217,7 @@ impl TestConfig { ]; let server_ids = [servers[0].id, servers[1].id, servers[2].id]; - let safekeepers_addrs = server_ids.map(|id| format!("node:{}", id)).to_vec(); + let safekeepers_addrs = server_ids.map(|id| format!("node:{id}")).to_vec(); let ttid = TenantTimelineId::generate(); diff --git a/safekeeper/tests/walproposer_sim/walproposer_api.rs b/safekeeper/tests/walproposer_sim/walproposer_api.rs index c2604c4bdc..29b361db7e 100644 --- 
a/safekeeper/tests/walproposer_sim/walproposer_api.rs +++ b/safekeeper/tests/walproposer_sim/walproposer_api.rs @@ -523,7 +523,7 @@ impl ApiImpl for SimulationApi { // Voting bug when safekeeper disconnects after voting executor::exit(1, msg.to_owned()); } - panic!("unknown FATAL error from walproposer: {}", msg); + panic!("unknown FATAL error from walproposer: {msg}"); } } @@ -544,10 +544,7 @@ impl ApiImpl for SimulationApi { } } - let msg = format!( - "prop_elected;{};{};{};{}", - prop_lsn, prop_term, prev_lsn, prev_term - ); + let msg = format!("prop_elected;{prop_lsn};{prop_term};{prev_lsn};{prev_term}"); debug!(msg); self.os.log_event(msg); diff --git a/storage_broker/benches/rps.rs b/storage_broker/benches/rps.rs index 9953ccfa91..5f3e594687 100644 --- a/storage_broker/benches/rps.rs +++ b/storage_broker/benches/rps.rs @@ -161,7 +161,7 @@ async fn publish(client: Option, n_keys: u64) { } }; let response = client.publish_safekeeper_info(Request::new(outbound)).await; - println!("pub response is {:?}", response); + println!("pub response is {response:?}"); } #[tokio::main] diff --git a/storage_broker/build.rs b/storage_broker/build.rs index 08dadeacd5..77c441dddd 100644 --- a/storage_broker/build.rs +++ b/storage_broker/build.rs @@ -6,6 +6,6 @@ fn main() -> Result<(), Box> { // the build then. Anyway, per cargo docs build script shouldn't output to // anywhere but $OUT_DIR. 
tonic_build::compile_protos("proto/broker.proto") - .unwrap_or_else(|e| panic!("failed to compile protos {:?}", e)); + .unwrap_or_else(|e| panic!("failed to compile protos {e:?}")); Ok(()) } diff --git a/storage_broker/src/lib.rs b/storage_broker/src/lib.rs index 149656a191..7d8b57380f 100644 --- a/storage_broker/src/lib.rs +++ b/storage_broker/src/lib.rs @@ -86,13 +86,9 @@ impl BrokerClientChannel { #[allow(clippy::result_large_err, reason = "TODO")] pub fn parse_proto_ttid(proto_ttid: &ProtoTenantTimelineId) -> Result { let tenant_id = TenantId::from_slice(&proto_ttid.tenant_id) - .map_err(|e| Status::new(Code::InvalidArgument, format!("malformed tenant_id: {}", e)))?; - let timeline_id = TimelineId::from_slice(&proto_ttid.timeline_id).map_err(|e| { - Status::new( - Code::InvalidArgument, - format!("malformed timeline_id: {}", e), - ) - })?; + .map_err(|e| Status::new(Code::InvalidArgument, format!("malformed tenant_id: {e}")))?; + let timeline_id = TimelineId::from_slice(&proto_ttid.timeline_id) + .map_err(|e| Status::new(Code::InvalidArgument, format!("malformed timeline_id: {e}")))?; Ok(TenantTimelineId { tenant_id, timeline_id, diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index a4482a4dac..0b5569b3d6 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -370,7 +370,7 @@ impl ComputeHook { let authorization_header = config .control_plane_jwt_token .clone() - .map(|jwt| format!("Bearer {}", jwt)); + .map(|jwt| format!("Bearer {jwt}")); let mut client = reqwest::ClientBuilder::new().timeout(NOTIFY_REQUEST_TIMEOUT); for cert in &config.ssl_ca_certs { diff --git a/storage_controller/src/drain_utils.rs b/storage_controller/src/drain_utils.rs index bd4b8ba38f..0dae7b8147 100644 --- a/storage_controller/src/drain_utils.rs +++ b/storage_controller/src/drain_utils.rs @@ -62,7 +62,7 @@ pub(crate) fn validate_node_state( nodes: Arc>, ) -> Result<(), OperationError> { let node = 
nodes.get(node_id).ok_or(OperationError::NodeStateChanged( - format!("node {} was removed", node_id).into(), + format!("node {node_id} was removed").into(), ))?; let current_policy = node.get_scheduling(); @@ -70,7 +70,7 @@ pub(crate) fn validate_node_state( // TODO(vlad): maybe cancel pending reconciles before erroring out. need to think // about it return Err(OperationError::NodeStateChanged( - format!("node {} changed state to {:?}", node_id, current_policy).into(), + format!("node {node_id} changed state to {current_policy:?}").into(), )); } @@ -145,7 +145,7 @@ impl TenantShardDrain { if !nodes.contains_key(&destination) { return Err(OperationError::NodeStateChanged( - format!("node {} was removed", destination).into(), + format!("node {destination} was removed").into(), )); } diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 346595aa11..a7e86b5224 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -721,9 +721,9 @@ async fn handle_tenant_timeline_passthrough( // Callers will always pass an unsharded tenant ID. Before proxying, we must // rewrite this to a shard-aware shard zero ID. 
- let path = format!("{}", path); + let path = format!("{path}"); let tenant_str = tenant_or_shard_id.tenant_id.to_string(); - let tenant_shard_str = format!("{}", tenant_shard_id); + let tenant_shard_str = format!("{tenant_shard_id}"); let path = path.replace(&tenant_str, &tenant_shard_str); let latency = &METRICS_REGISTRY @@ -1539,7 +1539,7 @@ async fn handle_ready(req: Request) -> Result, ApiError> { impl From for ApiError { fn from(value: ReconcileError) -> Self { - ApiError::Conflict(format!("Reconciliation error: {}", value)) + ApiError::Conflict(format!("Reconciliation error: {value}")) } } diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 4300dd32a9..2948e9019f 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -500,15 +500,13 @@ impl Persistence { if let Some(np) = node_to_delete { let lc = NodeLifecycle::from_str(&np.lifecycle).map_err(|e| { DatabaseError::Logical(format!( - "Node {} has invalid lifecycle: {}", - del_node_id, e + "Node {del_node_id} has invalid lifecycle: {e}" )) })?; if lc != NodeLifecycle::Deleted { return Err(DatabaseError::Logical(format!( - "Node {} was not soft deleted before, cannot hard delete it", - del_node_id + "Node {del_node_id} was not soft deleted before, cannot hard delete it" ))); } @@ -642,8 +640,7 @@ impl Persistence { .await?; if deleted_node > 0 { return Err(DatabaseError::Logical(format!( - "Node {} is marked as deleted, re-attach is not allowed", - input_node_id + "Node {input_node_id} is marked as deleted, re-attach is not allowed" ))); } @@ -1003,7 +1000,7 @@ impl Persistence { .execute(conn).await?; if u8::try_from(updated) .map_err(|_| DatabaseError::Logical( - format!("Overflow existing shard count {} while splitting", updated)) + format!("Overflow existing shard count {updated} while splitting")) )? 
!= old_shard_count.count() { // Perhaps a deletion or another split raced with this attempt to split, mutating // the parent shards that we intend to split. In this case the split request should fail. @@ -1343,8 +1340,7 @@ impl Persistence { if inserted_updated != 1 { return Err(DatabaseError::Logical(format!( - "unexpected number of rows ({})", - inserted_updated + "unexpected number of rows ({inserted_updated})" ))); } @@ -1406,8 +1402,7 @@ impl Persistence { 0 => Ok(false), 1 => Ok(true), _ => Err(DatabaseError::Logical(format!( - "unexpected number of rows ({})", - inserted_updated + "unexpected number of rows ({inserted_updated})" ))), } }) @@ -1476,8 +1471,7 @@ impl Persistence { 0 => Ok(()), 1 => Ok(()), _ => Err(DatabaseError::Logical(format!( - "unexpected number of rows ({})", - updated + "unexpected number of rows ({updated})" ))), } }) @@ -1570,8 +1564,7 @@ impl Persistence { 0 => Ok(false), 1 => Ok(true), _ => Err(DatabaseError::Logical(format!( - "unexpected number of rows ({})", - inserted_updated + "unexpected number of rows ({inserted_updated})" ))), } }) diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index b3656c33d4..b86b4dfab1 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -23,7 +23,7 @@ pub enum ScheduleError { impl From for ApiError { fn from(value: ScheduleError) -> Self { - ApiError::Conflict(format!("Scheduling error: {}", value)) + ApiError::Conflict(format!("Scheduling error: {value}")) } } @@ -903,7 +903,7 @@ impl Scheduler { /// rigorously updating them on every change. 
pub(crate) fn update_metrics(&self) { for (node_id, node) in &self.nodes { - let node_id_str = format!("{}", node_id); + let node_id_str = format!("{node_id}"); let label_group = NodeLabelGroup { az: &node.az.0, node_id: &node_id_str, @@ -1326,7 +1326,7 @@ mod tests { .map(|(node_id, node)| (node_id, node.home_shard_count)) .collect::>(); node_home_counts.sort_by_key(|i| i.0); - eprintln!("Selected {}, vs nodes {:?}", preferred_az, node_home_counts); + eprintln!("Selected {preferred_az}, vs nodes {node_home_counts:?}"); let tenant_shard_id = TenantShardId { tenant_id: TenantId::generate(), diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 0eb87ffbe3..8424c27cf8 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -260,7 +260,7 @@ fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError { // Presume errors receiving body are connectivity/availability issues except for decoding errors let src_str = err.source().map(|e| e.to_string()).unwrap_or_default(); ApiError::ResourceUnavailable( - format!("{node} error receiving error body: {err} {}", src_str).into(), + format!("{node} error receiving error body: {err} {src_str}").into(), ) } mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, msg) => { @@ -671,7 +671,7 @@ impl std::fmt::Display for StopReconciliationsReason { Self::ShuttingDown => "Shutting down", Self::SteppingDown => "Stepping down", }; - write!(writer, "{}", s) + write!(writer, "{s}") } } @@ -5278,7 +5278,7 @@ impl Service { shard_params, result .iter() - .map(|s| format!("{:?}", s)) + .map(|s| format!("{s:?}")) .collect::>() .join(",") ); @@ -6201,7 +6201,7 @@ impl Service { }, ) .await - .map_err(|e| ApiError::Conflict(format!("Failed to split {}: {}", parent_id, e)))?; + .map_err(|e| ApiError::Conflict(format!("Failed to split {parent_id}: {e}")))?; fail::fail_point!("shard-split-post-remote", |_| Err(ApiError::Conflict( "failpoint".to_string() @@ -6218,7 +6218,7 @@ 
impl Service { response .new_shards .iter() - .map(|s| format!("{:?}", s)) + .map(|s| format!("{s:?}")) .collect::>() .join(",") ); @@ -7117,8 +7117,7 @@ impl Service { Ok(()) } else { Err(ApiError::Conflict(format!( - "Node {} is in use, consider using tombstone API first", - node_id + "Node {node_id} is in use, consider using tombstone API first" ))) } } @@ -7668,7 +7667,7 @@ impl Service { if let Some(ongoing) = ongoing_op { return Err(ApiError::PreconditionFailed( - format!("Background operation already ongoing for node: {}", ongoing).into(), + format!("Background operation already ongoing for node: {ongoing}").into(), )); } @@ -7799,7 +7798,7 @@ impl Service { if let Some(ongoing) = ongoing_op { return Err(ApiError::PreconditionFailed( - format!("Background operation already ongoing for node: {}", ongoing).into(), + format!("Background operation already ongoing for node: {ongoing}").into(), )); } @@ -8870,7 +8869,7 @@ impl Service { let nodes = self.inner.read().unwrap().nodes.clone(); let node = nodes.get(secondary).ok_or(mgmt_api::Error::ApiError( StatusCode::NOT_FOUND, - format!("Node with id {} not found", secondary), + format!("Node with id {secondary} not found"), ))?; match node @@ -8949,8 +8948,7 @@ impl Service { Err(err) => { return Err(OperationError::FinalizeError( format!( - "Failed to finalise drain cancel of {} by setting scheduling policy to Active: {}", - node_id, err + "Failed to finalise drain cancel of {node_id} by setting scheduling policy to Active: {err}" ) .into(), )); @@ -9054,8 +9052,7 @@ impl Service { Err(err) => { return Err(OperationError::FinalizeError( format!( - "Failed to finalise drain cancel of {} by setting scheduling policy to Active: {}", - node_id, err + "Failed to finalise drain cancel of {node_id} by setting scheduling policy to Active: {err}" ) .into(), )); @@ -9265,8 +9262,7 @@ impl Service { Err(err) => { return Err(OperationError::FinalizeError( format!( - "Failed to finalise drain cancel of {} by setting 
scheduling policy to Active: {}", - node_id, err + "Failed to finalise drain cancel of {node_id} by setting scheduling policy to Active: {err}" ) .into(), )); @@ -9348,8 +9344,7 @@ impl Service { Err(err) => { return Err(OperationError::FinalizeError( format!( - "Failed to finalise drain cancel of {} by setting scheduling policy to Active: {}", - node_id, err + "Failed to finalise drain cancel of {node_id} by setting scheduling policy to Active: {err}" ) .into(), )); diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 789327bfaf..359921ecbf 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -3008,21 +3008,18 @@ pub(crate) mod tests { if attachments_in_wrong_az > 0 { violations.push(format!( - "{} attachments scheduled to the incorrect AZ", - attachments_in_wrong_az + "{attachments_in_wrong_az} attachments scheduled to the incorrect AZ" )); } if secondaries_in_wrong_az > 0 { violations.push(format!( - "{} secondaries scheduled to the incorrect AZ", - secondaries_in_wrong_az + "{secondaries_in_wrong_az} secondaries scheduled to the incorrect AZ" )); } eprintln!( - "attachments_in_wrong_az={} secondaries_in_wrong_az={}", - attachments_in_wrong_az, secondaries_in_wrong_az + "attachments_in_wrong_az={attachments_in_wrong_az} secondaries_in_wrong_az={secondaries_in_wrong_az}" ); for (node_id, stats) in &node_stats { diff --git a/storage_controller/src/timeline_import.rs b/storage_controller/src/timeline_import.rs index eb50819d02..e88bce4c82 100644 --- a/storage_controller/src/timeline_import.rs +++ b/storage_controller/src/timeline_import.rs @@ -195,7 +195,7 @@ impl UpcallClient { let authorization_header = config .control_plane_jwt_token .clone() - .map(|jwt| format!("Bearer {}", jwt)); + .map(|jwt| format!("Bearer {jwt}")); let client = reqwest::ClientBuilder::new() .timeout(IMPORT_COMPLETE_REQUEST_TIMEOUT) diff --git a/storage_scrubber/src/checks.rs 
b/storage_scrubber/src/checks.rs index 865f0908f9..774418f237 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -146,7 +146,7 @@ pub(crate) async fn branch_cleanup_and_check_errors( for (layer, metadata) in index_part.layer_metadata { if metadata.file_size == 0 { result.errors.push(format!( - "index_part.json contains a layer {} that has 0 size in its layer metadata", layer, + "index_part.json contains a layer {layer} that has 0 size in its layer metadata", )) } diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index 25a157f108..d3ed5a8357 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -123,7 +123,7 @@ impl S3Target { pub fn with_sub_segment(&self, new_segment: &str) -> Self { let mut new_self = self.clone(); if new_self.prefix_in_bucket.is_empty() { - new_self.prefix_in_bucket = format!("/{}/", new_segment); + new_self.prefix_in_bucket = format!("/{new_segment}/"); } else { if new_self.prefix_in_bucket.ends_with('/') { new_self.prefix_in_bucket.pop(); diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs index f10d758097..cf0a3d19e9 100644 --- a/storage_scrubber/src/scan_safekeeper_metadata.rs +++ b/storage_scrubber/src/scan_safekeeper_metadata.rs @@ -265,7 +265,7 @@ async fn load_timelines_from_db( // so spawn it off to run on its own. 
tokio::spawn(async move { if let Err(e) = connection.await { - eprintln!("connection error: {}", e); + eprintln!("connection error: {e}"); } }); @@ -274,7 +274,7 @@ async fn load_timelines_from_db( "and tenant_id in ({})", tenant_ids .iter() - .map(|t| format!("'{}'", t)) + .map(|t| format!("'{t}'")) .collect::>() .join(", ") ) From 4dd9ca7b04d75155a14f6ad51948353f85a1f5fc Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 24 Jun 2025 15:15:36 +0100 Subject: [PATCH 08/50] [proxy]: authenticate to compute after connect_to_compute (#12335) ## Problem PGLB will do the connect_to_compute logic, neonkeeper will do the session establishment logic. We should split it. ## Summary of changes Moves postgres authentication to compute to a separate routine that happens after connect_to_compute. --- proxy/src/compute/mod.rs | 181 +++++++++++++++++----------- proxy/src/console_redirect_proxy.rs | 13 +- proxy/src/control_plane/mod.rs | 8 +- proxy/src/proxy/connect_compute.rs | 15 +-- proxy/src/proxy/mod.rs | 22 ++-- proxy/src/proxy/retry.rs | 2 - 6 files changed, 140 insertions(+), 101 deletions(-) diff --git a/proxy/src/compute/mod.rs b/proxy/src/compute/mod.rs index 5dd264b35e..f6c58c7459 100644 --- a/proxy/src/compute/mod.rs +++ b/proxy/src/compute/mod.rs @@ -33,12 +33,51 @@ use crate::types::Host; pub const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node"; #[derive(Debug, Error)] -pub(crate) enum ConnectionError { +pub(crate) enum PostgresError { /// This error doesn't seem to reveal any secrets; for instance, /// `postgres_client::error::Kind` doesn't contain ip addresses and such. #[error("{COULD_NOT_CONNECT}: {0}")] Postgres(#[from] postgres_client::Error), +} +impl UserFacingError for PostgresError { + fn to_string_client(&self) -> String { + match self { + // This helps us drop irrelevant library-specific prefixes. + // TODO: propagate severity level and other parameters. 
+ PostgresError::Postgres(err) => match err.as_db_error() { + Some(err) => { + let msg = err.message(); + + if msg.starts_with("unsupported startup parameter: ") + || msg.starts_with("unsupported startup parameter in options: ") + { + format!( + "{msg}. Please use unpooled connection or remove this parameter from the startup package. More details: https://neon.tech/docs/connect/connection-errors#unsupported-startup-parameter" + ) + } else { + msg.to_owned() + } + } + None => err.to_string(), + }, + } + } +} + +impl ReportableError for PostgresError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + PostgresError::Postgres(e) if e.as_db_error().is_some() => { + crate::error::ErrorKind::Postgres + } + PostgresError::Postgres(_) => crate::error::ErrorKind::Compute, + } + } +} + +#[derive(Debug, Error)] +pub(crate) enum ConnectionError { #[error("{COULD_NOT_CONNECT}: {0}")] TlsError(#[from] TlsError), @@ -52,22 +91,6 @@ pub(crate) enum ConnectionError { impl UserFacingError for ConnectionError { fn to_string_client(&self) -> String { match self { - // This helps us drop irrelevant library-specific prefixes. - // TODO: propagate severity level and other parameters. - ConnectionError::Postgres(err) => match err.as_db_error() { - Some(err) => { - let msg = err.message(); - - if msg.starts_with("unsupported startup parameter: ") - || msg.starts_with("unsupported startup parameter in options: ") - { - format!("{msg}. Please use unpooled connection or remove this parameter from the startup package. More details: https://neon.tech/docs/connect/connection-errors#unsupported-startup-parameter") - } else { - msg.to_owned() - } - } - None => err.to_string(), - }, ConnectionError::WakeComputeError(err) => err.to_string_client(), ConnectionError::TooManyConnectionAttempts(_) => { "Failed to acquire permit to connect to the database. 
Too many database connection attempts are currently ongoing.".to_owned() @@ -80,10 +103,6 @@ impl UserFacingError for ConnectionError { impl ReportableError for ConnectionError { fn get_error_kind(&self) -> crate::error::ErrorKind { match self { - ConnectionError::Postgres(e) if e.as_db_error().is_some() => { - crate::error::ErrorKind::Postgres - } - ConnectionError::Postgres(_) => crate::error::ErrorKind::Compute, ConnectionError::TlsError(_) => crate::error::ErrorKind::Compute, ConnectionError::WakeComputeError(e) => e.get_error_kind(), ConnectionError::TooManyConnectionAttempts(e) => e.get_error_kind(), @@ -206,6 +225,54 @@ impl AuthInfo { } } } + + pub async fn authenticate( + &self, + ctx: &RequestContext, + compute: &mut ComputeConnection, + user_info: ComputeUserInfo, + ) -> Result { + // client config with stubbed connect info. + // TODO(conrad): should we rewrite this to bypass tokio-postgres2 entirely, + // utilising pqproto.rs. + let mut tmp_config = postgres_client::Config::new(String::new(), 0); + // We have already established SSL if necessary. + tmp_config.ssl_mode(SslMode::Disable); + let tmp_config = self.enrich(tmp_config); + + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); + let connection = tmp_config.connect_raw(&mut compute.stream, NoTls).await?; + drop(pause); + + let RawConnection { + stream: _, + parameters, + delayed_notice, + process_id, + secret_key, + } = connection; + + tracing::Span::current().record("pid", tracing::field::display(process_id)); + + // NB: CancelToken is supposed to hold socket_addr, but we use connect_raw. + // Yet another reason to rework the connection establishing code. 
+ let cancel_closure = CancelClosure::new( + compute.socket_addr, + RawCancelToken { + ssl_mode: compute.ssl_mode, + process_id, + secret_key, + }, + compute.hostname.to_string(), + user_info, + ); + + Ok(PostgresSettings { + params: parameters, + cancel_closure, + delayed_notice, + }) + } } impl ConnectInfo { @@ -268,51 +335,42 @@ impl ConnectInfo { pub type RustlsStream = >::Stream; pub type MaybeRustlsStream = MaybeTlsStream; -pub(crate) struct PostgresConnection { - /// Socket connected to a compute node. - pub(crate) stream: MaybeTlsStream, +// TODO(conrad): we don't need to parse these. +// These are just immediately forwarded back to the client. +// We could instead stream them out instead of reading them into memory. +pub struct PostgresSettings { /// PostgreSQL connection parameters. - pub(crate) params: std::collections::HashMap, + pub params: std::collections::HashMap, /// Query cancellation token. - pub(crate) cancel_closure: CancelClosure, - /// Labels for proxy's metrics. - pub(crate) aux: MetricsAuxInfo, + pub cancel_closure: CancelClosure, /// Notices received from compute after authenticating - pub(crate) delayed_notice: Vec, + pub delayed_notice: Vec, +} - pub(crate) guage: NumDbConnectionsGuard<'static>, +pub struct ComputeConnection { + /// Socket connected to a compute node. + pub stream: MaybeTlsStream, + /// Labels for proxy's metrics. + pub aux: MetricsAuxInfo, + pub hostname: Host, + pub ssl_mode: SslMode, + pub socket_addr: SocketAddr, + pub guage: NumDbConnectionsGuard<'static>, } impl ConnectInfo { /// Connect to a corresponding compute node. - pub(crate) async fn connect( + pub async fn connect( &self, ctx: &RequestContext, - aux: MetricsAuxInfo, - auth: &AuthInfo, + aux: &MetricsAuxInfo, config: &ComputeConfig, - user_info: ComputeUserInfo, - ) -> Result { - let mut tmp_config = auth.enrich(self.to_postgres_client_config()); - // we setup SSL early in `ConnectInfo::connect_raw`. 
- tmp_config.ssl_mode(SslMode::Disable); - + ) -> Result { let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let (socket_addr, stream) = self.connect_raw(config).await?; - let connection = tmp_config.connect_raw(stream, NoTls).await?; drop(pause); - let RawConnection { - stream, - parameters, - delayed_notice, - process_id, - secret_key, - } = connection; - - tracing::Span::current().record("pid", tracing::field::display(process_id)); tracing::Span::current().record("compute_id", tracing::field::display(&aux.compute_id)); - let MaybeTlsStream::Raw(stream) = stream.into_inner(); // TODO: lots of useful info but maybe we can move it elsewhere (eg traces?) info!( @@ -324,25 +382,12 @@ impl ConnectInfo { ctx.get_testodrome_id().unwrap_or_default(), ); - // NB: CancelToken is supposed to hold socket_addr, but we use connect_raw. - // Yet another reason to rework the connection establishing code. - let cancel_closure = CancelClosure::new( - socket_addr, - RawCancelToken { - ssl_mode: self.ssl_mode, - process_id, - secret_key, - }, - self.host.to_string(), - user_info, - ); - - let connection = PostgresConnection { + let connection = ComputeConnection { stream, - params: parameters, - delayed_notice, - cancel_closure, - aux, + socket_addr, + hostname: self.host.clone(), + ssl_mode: self.ssl_mode, + aux: aux.clone(), guage: Metrics::get().proxy.db_connections.guard(ctx.protocol()), }; diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 89adfc9049..113a11beab 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -218,11 +218,9 @@ pub(crate) async fn handle_client( }; auth_info.set_startup_params(¶ms, true); - let node = connect_to_compute( + let mut node = connect_to_compute( ctx, &TcpMechanism { - user_info, - auth: auth_info, locks: &config.connect_compute_locks, }, &node_info, @@ -232,9 +230,14 @@ pub(crate) async fn handle_client( .or_else(|e| async { 
Err(stream.throw_error(e, Some(ctx)).await) }) .await?; + let pg_settings = auth_info + .authenticate(ctx, &mut node, user_info) + .or_else(|e| async { Err(stream.throw_error(e, Some(ctx)).await) }) + .await?; + let session = cancellation_handler.get_key(); - prepare_client_connection(&node, *session.key(), &mut stream); + prepare_client_connection(&pg_settings, *session.key(), &mut stream); let stream = stream.flush_and_into_inner().await?; let session_id = ctx.session_id(); @@ -244,7 +247,7 @@ pub(crate) async fn handle_client( .maintain_cancel_key( session_id, cancel, - &node.cancel_closure, + &pg_settings.cancel_closure, &config.connect_to_compute, ) .await; diff --git a/proxy/src/control_plane/mod.rs b/proxy/src/control_plane/mod.rs index ed83e98bfe..a8c59dad0c 100644 --- a/proxy/src/control_plane/mod.rs +++ b/proxy/src/control_plane/mod.rs @@ -76,13 +76,9 @@ impl NodeInfo { pub(crate) async fn connect( &self, ctx: &RequestContext, - auth: &compute::AuthInfo, config: &ComputeConfig, - user_info: ComputeUserInfo, - ) -> Result { - self.conn_info - .connect(ctx, self.aux.clone(), auth, config, user_info) - .await + ) -> Result { + self.conn_info.connect(ctx, &self.aux, config).await } } diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 92ed84f50f..aa675a439e 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -2,8 +2,7 @@ use async_trait::async_trait; use tokio::time; use tracing::{debug, info, warn}; -use crate::auth::backend::ComputeUserInfo; -use crate::compute::{self, AuthInfo, COULD_NOT_CONNECT, PostgresConnection}; +use crate::compute::{self, COULD_NOT_CONNECT, ComputeConnection}; use crate::config::{ComputeConfig, RetryConfig}; use crate::context::RequestContext; use crate::control_plane::errors::WakeComputeError; @@ -50,15 +49,13 @@ pub(crate) trait ConnectMechanism { } pub(crate) struct TcpMechanism { - pub(crate) auth: AuthInfo, /// connect_to_compute concurrency lock 
pub(crate) locks: &'static ApiLocks, - pub(crate) user_info: ComputeUserInfo, } #[async_trait] impl ConnectMechanism for TcpMechanism { - type Connection = PostgresConnection; + type Connection = ComputeConnection; type ConnectError = compute::ConnectionError; type Error = compute::ConnectionError; @@ -71,13 +68,9 @@ impl ConnectMechanism for TcpMechanism { ctx: &RequestContext, node_info: &control_plane::CachedNodeInfo, config: &ComputeConfig, - ) -> Result { + ) -> Result { let permit = self.locks.get_permit(&node_info.conn_info.host).await?; - permit.release_result( - node_info - .connect(ctx, &self.auth, config, self.user_info.clone()) - .await, - ) + permit.release_result(node_info.connect(ctx, config).await) } } diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 7da1b8d8fa..6947e07488 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -357,24 +357,28 @@ pub(crate) async fn handle_client( let res = connect_to_compute( ctx, &TcpMechanism { - user_info: creds.info.clone(), - auth: auth_info, locks: &config.connect_compute_locks, }, - &auth::Backend::ControlPlane(cplane, creds.info), + &auth::Backend::ControlPlane(cplane, creds.info.clone()), config.wake_compute_retry_config, &config.connect_to_compute, ) .await; - let node = match res { + let mut node = match res { Ok(node) => node, Err(e) => Err(stream.throw_error(e, Some(ctx)).await)?, }; + let pg_settings = auth_info.authenticate(ctx, &mut node, creds.info).await; + let pg_settings = match pg_settings { + Ok(pg_settings) => pg_settings, + Err(e) => Err(stream.throw_error(e, Some(ctx)).await)?, + }; + let session = cancellation_handler.get_key(); - prepare_client_connection(&node, *session.key(), &mut stream); + prepare_client_connection(&pg_settings, *session.key(), &mut stream); let stream = stream.flush_and_into_inner().await?; let session_id = ctx.session_id(); @@ -384,7 +388,7 @@ pub(crate) async fn handle_client( .maintain_cancel_key( session_id, cancel, - 
&node.cancel_closure, + &pg_settings.cancel_closure, &config.connect_to_compute, ) .await; @@ -413,19 +417,19 @@ pub(crate) async fn handle_client( /// Finish client connection initialization: confirm auth success, send params, etc. pub(crate) fn prepare_client_connection( - node: &compute::PostgresConnection, + settings: &compute::PostgresSettings, cancel_key_data: CancelKeyData, stream: &mut PqStream, ) { // Forward all deferred notices to the client. - for notice in &node.delayed_notice { + for notice in &settings.delayed_notice { stream.write_raw(notice.as_bytes().len(), b'N', |buf| { buf.extend_from_slice(notice.as_bytes()); }); } // Forward all postgres connection params to the client. - for (name, value) in &node.params { + for (name, value) in &settings.params { stream.write_message(BeMessage::ParameterStatus { name: name.as_bytes(), value: value.as_bytes(), diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs index 0f19944afa..e9eca95724 100644 --- a/proxy/src/proxy/retry.rs +++ b/proxy/src/proxy/retry.rs @@ -99,7 +99,6 @@ impl ShouldRetryWakeCompute for postgres_client::Error { impl CouldRetry for compute::ConnectionError { fn could_retry(&self) -> bool { match self { - compute::ConnectionError::Postgres(err) => err.could_retry(), compute::ConnectionError::TlsError(err) => err.could_retry(), compute::ConnectionError::WakeComputeError(err) => err.could_retry(), compute::ConnectionError::TooManyConnectionAttempts(_) => false, @@ -109,7 +108,6 @@ impl CouldRetry for compute::ConnectionError { impl ShouldRetryWakeCompute for compute::ConnectionError { fn should_retry_wake_compute(&self) -> bool { match self { - compute::ConnectionError::Postgres(err) => err.should_retry_wake_compute(), // the cache entry was not checked for validity compute::ConnectionError::TooManyConnectionAttempts(_) => false, _ => true, From 158d84ea30c65bcffdd6f9ae73aefedc9be912b3 Mon Sep 17 00:00:00 2001 From: Dmitry Savelev Date: Tue, 24 Jun 2025 17:36:36 +0200 Subject: 
[PATCH 09/50] Switch the billing metrics storage format to ndjson. (#12338) ## Problem The billing team wants to change the billing events pipeline and use a common events format in S3 buckets across different event producers. ## Summary of changes Change the events storage format for billing events from JSON to NDJSON. Resolves: https://github.com/neondatabase/cloud/issues/29994 --- proxy/src/usage_metrics.rs | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index 115b958c54..c82c4865a7 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -399,7 +399,7 @@ async fn collect_metrics_iteration( fn create_remote_path_prefix(now: DateTime) -> String { format!( - "year={year:04}/month={month:02}/day={day:02}/{hour:02}:{minute:02}:{second:02}Z", + "year={year:04}/month={month:02}/day={day:02}/hour={hour:02}/{hour:02}:{minute:02}:{second:02}Z", year = now.year(), month = now.month(), day = now.day(), @@ -461,7 +461,7 @@ async fn upload_backup_events( real_now.second().into(), real_now.nanosecond(), )); - let path = format!("{path_prefix}_{id}.json.gz"); + let path = format!("{path_prefix}_{id}.ndjson.gz"); let remote_path = match RemotePath::from_string(&path) { Ok(remote_path) => remote_path, Err(e) => { @@ -471,9 +471,12 @@ async fn upload_backup_events( // TODO: This is async compression from Vec to Vec. Rewrite as byte stream. // Use sync compression in blocking threadpool. 
- let data = serde_json::to_vec(chunk).context("serialize metrics")?; let mut encoder = GzipEncoder::new(Vec::new()); - encoder.write_all(&data).await.context("compress metrics")?; + for event in chunk.events.iter() { + let data = serde_json::to_vec(event).context("serialize metrics")?; + encoder.write_all(&data).await.context("compress metrics")?; + encoder.write_all(b"\n").await.context("compress metrics")?; + } encoder.shutdown().await.context("compress metrics")?; let compressed_data: Bytes = encoder.get_ref().clone().into(); backoff::retry( @@ -499,7 +502,7 @@ async fn upload_backup_events( #[cfg(test)] mod tests { use std::fs; - use std::io::BufReader; + use std::io::{BufRead, BufReader}; use std::sync::{Arc, Mutex}; use anyhow::Error; @@ -673,11 +676,22 @@ mod tests { { let path = local_fs_path.join(&path_prefix).to_string(); if entry.path().to_str().unwrap().starts_with(&path) { - let chunk = serde_json::from_reader(flate2::bufread::GzDecoder::new( - BufReader::new(fs::File::open(entry.into_path()).unwrap()), - )) - .unwrap(); - stored_chunks.push(chunk); + let file = fs::File::open(entry.into_path()).unwrap(); + let decoder = flate2::bufread::GzDecoder::new(BufReader::new(file)); + let reader = BufReader::new(decoder); + + let mut events: Vec> = Vec::new(); + for line in reader.lines() { + let line = line.unwrap(); + let event: Event = serde_json::from_str(&line).unwrap(); + events.push(event); + } + + let report = Report { + events: Cow::Owned(events), + }; + + stored_chunks.push(report); } } storage_test_dir.close().ok(); From 6c6de6382ad22655e237847cee36b32ee0029063 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Tue, 24 Jun 2025 19:25:31 +0200 Subject: [PATCH 10/50] Use enum-typed PG versions (#12317) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This makes it possible for the compiler to validate that a match block matched all PostgreSQL versions we support. 
## Problem We did not have a complete picture about which places we had to test against PG versions, and what format these versions were: The full PG version ID format (Major/minor/bugfix `MMmmbb`) as transfered in protocol messages, or only the Major release version (`MM`). This meant type confusion was rampant. With this change, it becomes easier to develop new version-dependent features, by making type and niche confusion impossible. ## Summary of changes Every use of `pg_version` is now typed as either `PgVersionId` (u32, valued in decimal `MMmmbb`) or PgMajorVersion (an enum, with a value for every major version we support, serialized and stored like a u32 with the value of that major version) --------- Co-authored-by: Arpad Müller --- Cargo.lock | 29 +++ Cargo.toml | 3 + compute_tools/Cargo.toml | 1 + compute_tools/src/bin/fast_import.rs | 10 +- compute_tools/src/extension_server.rs | 56 ++--- control_plane/src/bin/neon_local.rs | 14 +- control_plane/src/endpoint.rs | 9 +- control_plane/src/local_env.rs | 15 +- control_plane/src/pageserver.rs | 3 +- control_plane/src/storage_controller.rs | 18 +- libs/pageserver_api/Cargo.toml | 1 + libs/pageserver_api/src/models.rs | 7 +- libs/postgres_ffi/Cargo.toml | 1 + libs/postgres_ffi/benches/waldecoder.rs | 3 +- libs/postgres_ffi/src/lib.rs | 41 ++-- libs/postgres_ffi/src/pg_constants_v14.rs | 4 + libs/postgres_ffi/src/pg_constants_v15.rs | 4 + libs/postgres_ffi/src/pg_constants_v16.rs | 4 + libs/postgres_ffi/src/pg_constants_v17.rs | 4 + libs/postgres_ffi/src/walrecord.rs | 23 +- libs/postgres_ffi/src/xlog_utils.rs | 4 +- .../wal_craft/src/bin/wal_craft.rs | 3 +- libs/postgres_ffi/wal_craft/src/lib.rs | 12 +- .../wal_craft/src/xlog_utils_test.rs | 2 +- libs/postgres_initdb/Cargo.toml | 1 + libs/postgres_initdb/src/lib.rs | 11 +- libs/postgres_versioninfo/Cargo.toml | 12 + libs/postgres_versioninfo/src/lib.rs | 175 ++++++++++++++ libs/safekeeper_api/Cargo.toml | 1 + libs/safekeeper_api/src/lib.rs | 4 +- 
libs/safekeeper_api/src/models.rs | 4 +- .../benches/bench_interpret_wal.rs | 4 +- libs/wal_decoder/src/decoder.rs | 214 +++++++++--------- libs/wal_decoder/src/serialized_batch.rs | 12 +- pageserver/benches/bench_walredo.rs | 11 +- pageserver/client/Cargo.toml | 1 + pageserver/client/src/mgmt_api.rs | 5 +- pageserver/src/basebackup.rs | 15 +- pageserver/src/config.rs | 15 +- pageserver/src/http/routes.rs | 3 +- pageserver/src/lib.rs | 3 +- pageserver/src/pgdatadir_mapping.rs | 10 +- pageserver/src/tenant.rs | 25 +- pageserver/src/tenant/metadata.rs | 15 +- .../tenant/remote_timeline_client/index.rs | 16 +- .../src/tenant/storage_layer/delta_layer.rs | 12 +- .../src/tenant/storage_layer/layer/tests.rs | 45 +++- pageserver/src/tenant/timeline.rs | 18 +- .../import_pgdata/importbucket_client.rs | 15 +- pageserver/src/walingest.rs | 22 +- pageserver/src/walredo.rs | 18 +- pageserver/src/walredo/process.rs | 6 +- safekeeper/Cargo.toml | 1 + safekeeper/src/control_file_upgrade.rs | 28 +-- safekeeper/src/safekeeper.rs | 14 +- safekeeper/src/send_interpreted_wal.rs | 18 +- safekeeper/src/send_wal.rs | 6 +- safekeeper/src/state.rs | 5 +- safekeeper/src/wal_storage.rs | 16 +- .../tests/walproposer_sim/safekeeper_disk.rs | 10 +- .../src/service/safekeeper_service.rs | 5 +- test_runner/regress/test_branching.py | 2 +- 62 files changed, 683 insertions(+), 386 deletions(-) create mode 100644 libs/postgres_versioninfo/Cargo.toml create mode 100644 libs/postgres_versioninfo/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 8cc51350ef..51724da061 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1318,6 +1318,7 @@ dependencies = [ "p256 0.13.2", "postgres", "postgres_initdb", + "postgres_versioninfo", "regex", "remote_storage", "reqwest", @@ -4406,6 +4407,7 @@ dependencies = [ "once_cell", "postgres_backend", "postgres_ffi_types", + "postgres_versioninfo", "rand 0.8.5", "remote_storage", "reqwest", @@ -4429,6 +4431,7 @@ dependencies = [ "futures", "http-utils", "pageserver_api", + 
"postgres_versioninfo", "reqwest", "serde", "thiserror 1.0.69", @@ -4897,6 +4900,7 @@ dependencies = [ "once_cell", "postgres", "postgres_ffi_types", + "postgres_versioninfo", "pprof", "regex", "serde", @@ -4919,11 +4923,23 @@ version = "0.1.0" dependencies = [ "anyhow", "camino", + "postgres_versioninfo", "thiserror 1.0.69", "tokio", "workspace_hack", ] +[[package]] +name = "postgres_versioninfo" +version = "0.1.0" +dependencies = [ + "anyhow", + "serde", + "serde_repr", + "thiserror 1.0.69", + "workspace_hack", +] + [[package]] name = "posthog_client_lite" version = "0.1.0" @@ -6115,6 +6131,7 @@ dependencies = [ "postgres-protocol", "postgres_backend", "postgres_ffi", + "postgres_versioninfo", "pprof", "pq_proto", "rand 0.8.5", @@ -6159,6 +6176,7 @@ dependencies = [ "const_format", "pageserver_api", "postgres_ffi", + "postgres_versioninfo", "pq_proto", "serde", "serde_json", @@ -6481,6 +6499,17 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "serde_repr" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.100", +] + [[package]] name = "serde_spanned" version = "0.6.6" diff --git a/Cargo.toml b/Cargo.toml index 2a6acc132e..857bc5d5d9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,6 +23,7 @@ members = [ "libs/pageserver_api", "libs/postgres_ffi", "libs/postgres_ffi_types", + "libs/postgres_versioninfo", "libs/safekeeper_api", "libs/desim", "libs/neon-shmem", @@ -174,6 +175,7 @@ serde_json = "1" serde_path_to_error = "0.1" serde_with = { version = "3", features = [ "base64" ] } serde_assert = "0.5.0" +serde_repr = "0.1.20" sha2 = "0.10.2" signal-hook = "0.3" smallvec = "1.11" @@ -261,6 +263,7 @@ postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" } postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" } postgres_ffi = { version = "0.1", 
path = "./libs/postgres_ffi/" } postgres_ffi_types = { version = "0.1", path = "./libs/postgres_ffi_types/" } +postgres_versioninfo = { version = "0.1", path = "./libs/postgres_versioninfo/" } postgres_initdb = { path = "./libs/postgres_initdb" } posthog_client_lite = { version = "0.1", path = "./libs/posthog_client_lite" } pq_proto = { version = "0.1", path = "./libs/pq_proto/" } diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index f9da3ba700..a5879c4b7c 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -64,6 +64,7 @@ uuid.workspace = true walkdir.workspace = true x509-cert.workspace = true +postgres_versioninfo.workspace = true postgres_initdb.workspace = true compute_api.workspace = true utils.workspace = true diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index 682525f6df..0eca9aba53 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -29,7 +29,7 @@ use anyhow::{Context, bail}; use aws_config::BehaviorVersion; use camino::{Utf8Path, Utf8PathBuf}; use clap::{Parser, Subcommand}; -use compute_tools::extension_server::{PostgresMajorVersion, get_pg_version}; +use compute_tools::extension_server::get_pg_version; use nix::unistd::Pid; use std::ops::Not; use tracing::{Instrument, error, info, info_span, warn}; @@ -179,12 +179,8 @@ impl PostgresProcess { .await .context("create pgdata directory")?; - let pg_version = match get_pg_version(self.pgbin.as_ref()) { - PostgresMajorVersion::V14 => 14, - PostgresMajorVersion::V15 => 15, - PostgresMajorVersion::V16 => 16, - PostgresMajorVersion::V17 => 17, - }; + let pg_version = get_pg_version(self.pgbin.as_ref()); + postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs { superuser: initdb_user, locale: DEFAULT_LOCALE, // XXX: this shouldn't be hard-coded, diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index d8d5de34a5..47931d5f72 100644 --- 
a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -74,9 +74,11 @@ More specifically, here is an example ext_index.json use std::path::Path; use std::str; +use crate::metrics::{REMOTE_EXT_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS}; use anyhow::{Context, Result, bail}; use bytes::Bytes; use compute_api::spec::RemoteExtSpec; +use postgres_versioninfo::PgMajorVersion; use regex::Regex; use remote_storage::*; use reqwest::StatusCode; @@ -86,8 +88,6 @@ use tracing::log::warn; use url::Url; use zstd::stream::read::Decoder; -use crate::metrics::{REMOTE_EXT_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS}; - fn get_pg_config(argument: &str, pgbin: &str) -> String { // gives the result of `pg_config [argument]` // where argument is a flag like `--version` or `--sharedir` @@ -106,7 +106,7 @@ fn get_pg_config(argument: &str, pgbin: &str) -> String { .to_string() } -pub fn get_pg_version(pgbin: &str) -> PostgresMajorVersion { +pub fn get_pg_version(pgbin: &str) -> PgMajorVersion { // pg_config --version returns a (platform specific) human readable string // such as "PostgreSQL 15.4". We parse this to v14/v15/v16 etc. let human_version = get_pg_config("--version", pgbin); @@ -114,25 +114,11 @@ pub fn get_pg_version(pgbin: &str) -> PostgresMajorVersion { } pub fn get_pg_version_string(pgbin: &str) -> String { - match get_pg_version(pgbin) { - PostgresMajorVersion::V14 => "v14", - PostgresMajorVersion::V15 => "v15", - PostgresMajorVersion::V16 => "v16", - PostgresMajorVersion::V17 => "v17", - } - .to_owned() + get_pg_version(pgbin).v_str() } -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub enum PostgresMajorVersion { - V14, - V15, - V16, - V17, -} - -fn parse_pg_version(human_version: &str) -> PostgresMajorVersion { - use PostgresMajorVersion::*; +fn parse_pg_version(human_version: &str) -> PgMajorVersion { + use PgMajorVersion::*; // Normal releases have version strings like "PostgreSQL 15.4". 
But there // are also pre-release versions like "PostgreSQL 17devel" or "PostgreSQL // 16beta2" or "PostgreSQL 17rc1". And with the --with-extra-version @@ -143,10 +129,10 @@ fn parse_pg_version(human_version: &str) -> PostgresMajorVersion { .captures(human_version) { Some(captures) if captures.len() == 2 => match &captures["major"] { - "14" => return V14, - "15" => return V15, - "16" => return V16, - "17" => return V17, + "14" => return PG14, + "15" => return PG15, + "16" => return PG16, + "17" => return PG17, _ => {} }, _ => {} @@ -343,25 +329,25 @@ mod tests { #[test] fn test_parse_pg_version() { - use super::PostgresMajorVersion::*; - assert_eq!(parse_pg_version("PostgreSQL 15.4"), V15); - assert_eq!(parse_pg_version("PostgreSQL 15.14"), V15); + use postgres_versioninfo::PgMajorVersion::*; + assert_eq!(parse_pg_version("PostgreSQL 15.4"), PG15); + assert_eq!(parse_pg_version("PostgreSQL 15.14"), PG15); assert_eq!( parse_pg_version("PostgreSQL 15.4 (Ubuntu 15.4-0ubuntu0.23.04.1)"), - V15 + PG15 ); - assert_eq!(parse_pg_version("PostgreSQL 14.15"), V14); - assert_eq!(parse_pg_version("PostgreSQL 14.0"), V14); + assert_eq!(parse_pg_version("PostgreSQL 14.15"), PG14); + assert_eq!(parse_pg_version("PostgreSQL 14.0"), PG14); assert_eq!( parse_pg_version("PostgreSQL 14.9 (Debian 14.9-1.pgdg120+1"), - V14 + PG14 ); - assert_eq!(parse_pg_version("PostgreSQL 16devel"), V16); - assert_eq!(parse_pg_version("PostgreSQL 16beta1"), V16); - assert_eq!(parse_pg_version("PostgreSQL 16rc2"), V16); - assert_eq!(parse_pg_version("PostgreSQL 16extra"), V16); + assert_eq!(parse_pg_version("PostgreSQL 16devel"), PG16); + assert_eq!(parse_pg_version("PostgreSQL 16beta1"), PG16); + assert_eq!(parse_pg_version("PostgreSQL 16rc2"), PG16); + assert_eq!(parse_pg_version("PostgreSQL 16extra"), PG16); } #[test] diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index b2dd1a7077..c818d07fef 100644 --- a/control_plane/src/bin/neon_local.rs +++ 
b/control_plane/src/bin/neon_local.rs @@ -48,7 +48,7 @@ use postgres_connection::parse_host_port; use safekeeper_api::membership::{SafekeeperGeneration, SafekeeperId}; use safekeeper_api::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT, - DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, + DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, PgMajorVersion, PgVersionId, }; use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR; use tokio::task::JoinSet; @@ -64,7 +64,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1); const DEFAULT_BRANCH_NAME: &str = "main"; project_git_version!(GIT_VERSION); -const DEFAULT_PG_VERSION: u32 = 17; +const DEFAULT_PG_VERSION: PgMajorVersion = PgMajorVersion::PG17; const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/"; @@ -169,7 +169,7 @@ struct TenantCreateCmdArgs { #[arg(default_value_t = DEFAULT_PG_VERSION)] #[clap(long, help = "Postgres version to use for the initial timeline")] - pg_version: u32, + pg_version: PgMajorVersion, #[clap( long, @@ -292,7 +292,7 @@ struct TimelineCreateCmdArgs { #[arg(default_value_t = DEFAULT_PG_VERSION)] #[clap(long, help = "Postgres version")] - pg_version: u32, + pg_version: PgMajorVersion, } #[derive(clap::Args)] @@ -324,7 +324,7 @@ struct TimelineImportCmdArgs { #[arg(default_value_t = DEFAULT_PG_VERSION)] #[clap(long, help = "Postgres version of the backup being imported")] - pg_version: u32, + pg_version: PgMajorVersion, } #[derive(clap::Subcommand)] @@ -603,7 +603,7 @@ struct EndpointCreateCmdArgs { #[arg(default_value_t = DEFAULT_PG_VERSION)] #[clap(long, help = "Postgres version")] - pg_version: u32, + pg_version: PgMajorVersion, /// Use gRPC to communicate with Pageservers, by generating grpc:// connstrings. 
/// @@ -1295,7 +1295,7 @@ async fn handle_timeline(cmd: &TimelineCmd, env: &mut local_env::LocalEnv) -> Re }, new_members: None, }; - let pg_version = args.pg_version * 10000; + let pg_version = PgVersionId::from(args.pg_version); let req = safekeeper_api::models::TimelineCreateRequest { tenant_id, timeline_id, diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index dab53b0f27..e3faa082db 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -67,6 +67,7 @@ use nix::sys::signal::{Signal, kill}; use pageserver_api::shard::ShardStripeSize; use pem::Pem; use reqwest::header::CONTENT_TYPE; +use safekeeper_api::PgMajorVersion; use safekeeper_api::membership::SafekeeperGeneration; use serde::{Deserialize, Serialize}; use sha2::{Digest, Sha256}; @@ -89,7 +90,7 @@ pub struct EndpointConf { pg_port: u16, external_http_port: u16, internal_http_port: u16, - pg_version: u32, + pg_version: PgMajorVersion, grpc: bool, skip_pg_catalog_updates: bool, reconfigure_concurrency: usize, @@ -192,7 +193,7 @@ impl ComputeControlPlane { pg_port: Option, external_http_port: Option, internal_http_port: Option, - pg_version: u32, + pg_version: PgMajorVersion, mode: ComputeMode, grpc: bool, skip_pg_catalog_updates: bool, @@ -312,7 +313,7 @@ pub struct Endpoint { pub internal_http_address: SocketAddr, // postgres major version in the format: 14, 15, etc. - pg_version: u32, + pg_version: PgMajorVersion, // These are not part of the endpoint as such, but the environment // the endpoint runs in. 
@@ -557,7 +558,7 @@ impl Endpoint { conf.append("hot_standby", "on"); // prefetching of blocks referenced in WAL doesn't make sense for us // Neon hot standby ignores pages that are not in the shared_buffers - if self.pg_version >= 15 { + if self.pg_version >= PgMajorVersion::PG15 { conf.append("recovery_prefetch", "off"); } } diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 34465b4d5d..370921a85c 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -15,6 +15,7 @@ use clap::ValueEnum; use pem::Pem; use postgres_backend::AuthType; use reqwest::{Certificate, Url}; +use safekeeper_api::PgMajorVersion; use serde::{Deserialize, Serialize}; use utils::auth::encode_from_key_file; use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; @@ -424,25 +425,21 @@ impl LocalEnv { self.pg_distrib_dir.clone() } - pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result { + pub fn pg_distrib_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result { let path = self.pg_distrib_dir.clone(); - #[allow(clippy::manual_range_patterns)] - match pg_version { - 14 | 15 | 16 | 17 => Ok(path.join(format!("v{pg_version}"))), - _ => bail!("Unsupported postgres version: {}", pg_version), - } + Ok(path.join(pg_version.v_str())) } - pub fn pg_dir(&self, pg_version: u32, dir_name: &str) -> anyhow::Result { + pub fn pg_dir(&self, pg_version: PgMajorVersion, dir_name: &str) -> anyhow::Result { Ok(self.pg_distrib_dir(pg_version)?.join(dir_name)) } - pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result { + pub fn pg_bin_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result { self.pg_dir(pg_version, "bin") } - pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result { + pub fn pg_lib_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result { self.pg_dir(pg_version, "lib") } diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 7fa00a6730..3f66960edd 100644 --- 
a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -22,6 +22,7 @@ use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; use postgres_backend::AuthType; use postgres_connection::{PgConnectionConfig, parse_host_port}; +use safekeeper_api::PgMajorVersion; use utils::auth::{Claims, Scope}; use utils::id::{NodeId, TenantId, TimelineId}; use utils::lsn::Lsn; @@ -607,7 +608,7 @@ impl PageServerNode { timeline_id: TimelineId, base: (Lsn, PathBuf), pg_wal: Option<(Lsn, PathBuf)>, - pg_version: u32, + pg_version: PgMajorVersion, ) -> anyhow::Result<()> { // Init base reader let (start_lsn, base_tarfile_path) = base; diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index f2ac5bb2dd..218b17d88d 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -6,6 +6,8 @@ use std::str::FromStr; use std::sync::OnceLock; use std::time::{Duration, Instant}; +use crate::background_process; +use crate::local_env::{LocalEnv, NeonStorageControllerConf}; use camino::{Utf8Path, Utf8PathBuf}; use hyper0::Uri; use nix::unistd::Pid; @@ -22,6 +24,7 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt; use pem::Pem; use postgres_backend::AuthType; use reqwest::{Method, Response}; +use safekeeper_api::PgMajorVersion; use serde::de::DeserializeOwned; use serde::{Deserialize, Serialize}; use tokio::process::Command; @@ -31,9 +34,6 @@ use utils::auth::{Claims, Scope, encode_from_key_file}; use utils::id::{NodeId, TenantId}; use whoami::username; -use crate::background_process; -use crate::local_env::{LocalEnv, NeonStorageControllerConf}; - pub struct StorageController { env: LocalEnv, private_key: Option, @@ -48,7 +48,7 @@ pub struct StorageController { const COMMAND: &str = "storage_controller"; -const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16; +const STORAGE_CONTROLLER_POSTGRES_VERSION: PgMajorVersion = PgMajorVersion::PG16; const DB_NAME: &str = 
"storage_controller"; @@ -184,9 +184,15 @@ impl StorageController { /// to other versions if that one isn't found. Some automated tests create circumstances /// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`. async fn get_pg_dir(&self, dir_name: &str) -> anyhow::Result { - let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 16, 15, 14]; + const PREFER_VERSIONS: [PgMajorVersion; 5] = [ + STORAGE_CONTROLLER_POSTGRES_VERSION, + PgMajorVersion::PG16, + PgMajorVersion::PG15, + PgMajorVersion::PG14, + PgMajorVersion::PG17, + ]; - for v in prefer_versions { + for v in PREFER_VERSIONS { let path = Utf8PathBuf::from_path_buf(self.env.pg_dir(v, dir_name)?).unwrap(); if tokio::fs::try_exists(&path).await? { return Ok(path); diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 5a9a74b93d..a34e065788 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -18,6 +18,7 @@ bytes.workspace = true byteorder.workspace = true utils.workspace = true postgres_ffi_types.workspace = true +postgres_versioninfo.workspace = true enum-map.workspace = true strum.workspace = true strum_macros.workspace = true diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index ee6725efbe..82a3ac0eb4 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -11,6 +11,7 @@ use std::time::{Duration, SystemTime}; #[cfg(feature = "testing")] use camino::Utf8PathBuf; +use postgres_versioninfo::PgMajorVersion; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use serde_with::serde_as; pub use utilization::PageserverUtilization; @@ -398,7 +399,7 @@ pub enum TimelineCreateRequestMode { // inherits the ancestor's pg_version. Earlier code wasn't // using a flattened enum, so, it was an accepted field, and // we continue to accept it by having it here. 
- pg_version: Option, + pg_version: Option, #[serde(default, skip_serializing_if = "std::ops::Not::not")] read_only: bool, }, @@ -410,7 +411,7 @@ pub enum TimelineCreateRequestMode { Bootstrap { #[serde(default)] existing_initdb_timeline_id: Option, - pg_version: Option, + pg_version: Option, }, } @@ -1573,7 +1574,7 @@ pub struct TimelineInfo { pub last_received_msg_lsn: Option, /// the timestamp (in microseconds) of the last received message pub last_received_msg_ts: Option, - pub pg_version: u32, + pub pg_version: PgMajorVersion, pub state: TimelineState, diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml index 67adfdd3c3..d4fec6cbe9 100644 --- a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -19,6 +19,7 @@ serde.workspace = true postgres_ffi_types.workspace = true utils.workspace = true tracing.workspace = true +postgres_versioninfo.workspace = true [dev-dependencies] env_logger.workspace = true diff --git a/libs/postgres_ffi/benches/waldecoder.rs b/libs/postgres_ffi/benches/waldecoder.rs index 2e1d62e452..b2a884c7db 100644 --- a/libs/postgres_ffi/benches/waldecoder.rs +++ b/libs/postgres_ffi/benches/waldecoder.rs @@ -4,6 +4,7 @@ use criterion::{Bencher, Criterion, criterion_group, criterion_main}; use postgres_ffi::v17::wal_generator::LogicalMessageGenerator; use postgres_ffi::v17::waldecoder_handler::WalStreamDecoderHandler; use postgres_ffi::waldecoder::WalStreamDecoder; +use postgres_versioninfo::PgMajorVersion; use pprof::criterion::{Output, PProfProfiler}; use utils::lsn::Lsn; @@ -32,7 +33,7 @@ fn bench_complete_record(c: &mut Criterion) { let value_size = LogicalMessageGenerator::make_value_size(size, PREFIX); let value = vec![1; value_size]; - let mut decoder = WalStreamDecoder::new(Lsn(0), 170000); + let mut decoder = WalStreamDecoder::new(Lsn(0), PgMajorVersion::PG17); let msg = LogicalMessageGenerator::new(PREFIX, &value) .next() .unwrap() diff --git a/libs/postgres_ffi/src/lib.rs 
b/libs/postgres_ffi/src/lib.rs index 05d8de4c7a..9297ac46c9 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -14,6 +14,8 @@ use bytes::Bytes; use utils::bin_ser::SerializeError; use utils::lsn::Lsn; +pub use postgres_versioninfo::PgMajorVersion; + macro_rules! postgres_ffi { ($version:ident) => { #[path = "."] @@ -91,21 +93,22 @@ macro_rules! dispatch_pgversion { $version => $code, default = $invalid_pgver_handling, pgversions = [ - 14 : v14, - 15 : v15, - 16 : v16, - 17 : v17, + $crate::PgMajorVersion::PG14 => v14, + $crate::PgMajorVersion::PG15 => v15, + $crate::PgMajorVersion::PG16 => v16, + $crate::PgMajorVersion::PG17 => v17, ] ) }; ($pgversion:expr => $code:expr, default = $default:expr, - pgversions = [$($sv:literal : $vsv:ident),+ $(,)?]) => { - match ($pgversion) { + pgversions = [$($sv:pat => $vsv:ident),+ $(,)?]) => { + match ($pgversion.clone().into()) { $($sv => { use $crate::$vsv as pgv; $code },)+ + #[allow(unreachable_patterns)] _ => { $default } @@ -179,9 +182,9 @@ macro_rules! enum_pgversion { $($variant ( $crate::$md::$t )),+ } impl self::$name { - pub fn pg_version(&self) -> u32 { + pub fn pg_version(&self) -> PgMajorVersion { enum_pgversion_dispatch!(self, $name, _ign, { - pgv::bindings::PG_MAJORVERSION_NUM + pgv::bindings::MY_PGVERSION }) } } @@ -195,15 +198,15 @@ macro_rules! enum_pgversion { }; {name = $name:ident, path = $p:ident, - typ = $t:ident, + $(typ = $t:ident,)? 
pgversions = [$($variant:ident : $md:ident),+ $(,)?]} => { pub enum $name { - $($variant ($crate::$md::$p::$t)),+ + $($variant $(($crate::$md::$p::$t))?),+ } impl $name { - pub fn pg_version(&self) -> u32 { + pub fn pg_version(&self) -> PgMajorVersion { enum_pgversion_dispatch!(self, $name, _ign, { - pgv::bindings::PG_MAJORVERSION_NUM + pgv::bindings::MY_PGVERSION }) } } @@ -249,22 +252,21 @@ pub use v14::xlog_utils::{ try_from_pg_timestamp, }; -pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> bool { +pub fn bkpimage_is_compressed(bimg_info: u8, version: PgMajorVersion) -> bool { dispatch_pgversion!(version, pgv::bindings::bkpimg_is_compressed(bimg_info)) } pub fn generate_wal_segment( segno: u64, system_id: u64, - pg_version: u32, + pg_version: PgMajorVersion, lsn: Lsn, ) -> Result { assert_eq!(segno, lsn.segment_number(WAL_SEGMENT_SIZE)); dispatch_pgversion!( pg_version, - pgv::xlog_utils::generate_wal_segment(segno, system_id, lsn), - Err(SerializeError::BadInput) + pgv::xlog_utils::generate_wal_segment(segno, system_id, lsn) ) } @@ -272,7 +274,7 @@ pub fn generate_pg_control( pg_control_bytes: &[u8], checkpoint_bytes: &[u8], lsn: Lsn, - pg_version: u32, + pg_version: PgMajorVersion, ) -> anyhow::Result<(Bytes, u64, bool)> { dispatch_pgversion!( pg_version, @@ -352,6 +354,7 @@ pub fn fsm_logical_to_physical(addr: BlockNumber) -> BlockNumber { pub mod waldecoder { use std::num::NonZeroU32; + use crate::PgMajorVersion; use bytes::{Buf, Bytes, BytesMut}; use thiserror::Error; use utils::lsn::Lsn; @@ -369,7 +372,7 @@ pub mod waldecoder { pub struct WalStreamDecoder { pub lsn: Lsn, - pub pg_version: u32, + pub pg_version: PgMajorVersion, pub inputbuf: BytesMut, pub state: State, } @@ -382,7 +385,7 @@ pub mod waldecoder { } impl WalStreamDecoder { - pub fn new(lsn: Lsn, pg_version: u32) -> WalStreamDecoder { + pub fn new(lsn: Lsn, pg_version: PgMajorVersion) -> WalStreamDecoder { WalStreamDecoder { lsn, pg_version, diff --git 
a/libs/postgres_ffi/src/pg_constants_v14.rs b/libs/postgres_ffi/src/pg_constants_v14.rs index fe01a5df7c..fd393995db 100644 --- a/libs/postgres_ffi/src/pg_constants_v14.rs +++ b/libs/postgres_ffi/src/pg_constants_v14.rs @@ -1,3 +1,7 @@ +use crate::PgMajorVersion; + +pub const MY_PGVERSION: PgMajorVersion = PgMajorVersion::PG14; + pub const XLOG_DBASE_CREATE: u8 = 0x00; pub const XLOG_DBASE_DROP: u8 = 0x10; diff --git a/libs/postgres_ffi/src/pg_constants_v15.rs b/libs/postgres_ffi/src/pg_constants_v15.rs index 3cd1b7aec5..6c1e2c13de 100644 --- a/libs/postgres_ffi/src/pg_constants_v15.rs +++ b/libs/postgres_ffi/src/pg_constants_v15.rs @@ -1,3 +1,7 @@ +use crate::PgMajorVersion; + +pub const MY_PGVERSION: PgMajorVersion = PgMajorVersion::PG15; + pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8; pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00; diff --git a/libs/postgres_ffi/src/pg_constants_v16.rs b/libs/postgres_ffi/src/pg_constants_v16.rs index 31bd5b68fd..d84db502f3 100644 --- a/libs/postgres_ffi/src/pg_constants_v16.rs +++ b/libs/postgres_ffi/src/pg_constants_v16.rs @@ -1,3 +1,7 @@ +use crate::PgMajorVersion; + +pub const MY_PGVERSION: PgMajorVersion = PgMajorVersion::PG16; + pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8; pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00; diff --git a/libs/postgres_ffi/src/pg_constants_v17.rs b/libs/postgres_ffi/src/pg_constants_v17.rs index 2132938680..14d4b3d42f 100644 --- a/libs/postgres_ffi/src/pg_constants_v17.rs +++ b/libs/postgres_ffi/src/pg_constants_v17.rs @@ -1,3 +1,7 @@ +use crate::PgMajorVersion; + +pub const MY_PGVERSION: PgMajorVersion = PgMajorVersion::PG17; + pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8; pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00; diff --git a/libs/postgres_ffi/src/walrecord.rs b/libs/postgres_ffi/src/walrecord.rs index c0ae88363e..d593123dc0 100644 --- a/libs/postgres_ffi/src/walrecord.rs +++ b/libs/postgres_ffi/src/walrecord.rs @@ -9,8 +9,8 @@ use 
utils::bin_ser::DeserializeError; use utils::lsn::Lsn; use crate::{ - BLCKSZ, BlockNumber, MultiXactId, MultiXactOffset, MultiXactStatus, Oid, RepOriginId, - TimestampTz, TransactionId, XLOG_SIZE_OF_XLOG_RECORD, XLogRecord, pg_constants, + BLCKSZ, BlockNumber, MultiXactId, MultiXactOffset, MultiXactStatus, Oid, PgMajorVersion, + RepOriginId, TimestampTz, TransactionId, XLOG_SIZE_OF_XLOG_RECORD, XLogRecord, pg_constants, }; #[repr(C)] @@ -199,20 +199,17 @@ impl DecodedWALRecord { /// Check if this WAL record represents a legacy "copy" database creation, which populates new relations /// by reading other existing relations' data blocks. This is more complex to apply than new-style database /// creations which simply include all the desired blocks in the WAL, so we need a helper function to detect this case. - pub fn is_dbase_create_copy(&self, pg_version: u32) -> bool { + pub fn is_dbase_create_copy(&self, pg_version: PgMajorVersion) -> bool { if self.xl_rmid == pg_constants::RM_DBASE_ID { let info = self.xl_info & pg_constants::XLR_RMGR_INFO_MASK; match pg_version { - 14 => { + PgMajorVersion::PG14 => { // Postgres 14 database creations are always the legacy kind info == crate::v14::bindings::XLOG_DBASE_CREATE } - 15 => info == crate::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY, - 16 => info == crate::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY, - 17 => info == crate::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY, - _ => { - panic!("Unsupported postgres version {pg_version}") - } + PgMajorVersion::PG15 => info == crate::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY, + PgMajorVersion::PG16 => info == crate::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY, + PgMajorVersion::PG17 => info == crate::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY, } } else { false @@ -248,7 +245,7 @@ impl DecodedWALRecord { pub fn decode_wal_record( record: Bytes, decoded: &mut DecodedWALRecord, - pg_version: u32, + pg_version: PgMajorVersion, ) -> anyhow::Result<()> { let mut rnode_spcnode: u32 = 0; let 
mut rnode_dbnode: u32 = 0; @@ -1106,9 +1103,9 @@ pub struct XlClogTruncate { } impl XlClogTruncate { - pub fn decode(buf: &mut Bytes, pg_version: u32) -> XlClogTruncate { + pub fn decode(buf: &mut Bytes, pg_version: PgMajorVersion) -> XlClogTruncate { XlClogTruncate { - pageno: if pg_version < 17 { + pageno: if pg_version < PgMajorVersion::PG17 { buf.get_u32_le() } else { buf.get_u64_le() as u32 diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 14fb1f2a1f..f7b6296053 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -11,9 +11,9 @@ use super::super::waldecoder::WalStreamDecoder; use super::bindings::{ CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, FullTransactionId, TimeLineID, TimestampTz, XLogLongPageHeaderData, XLogPageHeaderData, XLogRecPtr, XLogRecord, XLogSegNo, XLOG_PAGE_MAGIC, + MY_PGVERSION }; use super::wal_generator::LogicalMessageGenerator; -use super::PG_MAJORVERSION; use crate::pg_constants; use crate::PG_TLI; use crate::{uint32, uint64, Oid}; @@ -233,7 +233,7 @@ pub fn find_end_of_wal( let mut result = start_lsn; let mut curr_lsn = start_lsn; let mut buf = [0u8; XLOG_BLCKSZ]; - let pg_version = PG_MAJORVERSION[1..3].parse::().unwrap(); + let pg_version = MY_PGVERSION; debug!("find_end_of_wal PG_VERSION: {}", pg_version); let mut decoder = WalStreamDecoder::new(start_lsn, pg_version); diff --git a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs index 6151ce34ac..44bc4dfa95 100644 --- a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs +++ b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs @@ -4,6 +4,7 @@ use std::str::FromStr; use anyhow::*; use clap::{Arg, ArgMatches, Command, value_parser}; use postgres::Client; +use postgres_ffi::PgMajorVersion; use wal_craft::*; fn main() -> Result<()> { @@ -48,7 +49,7 @@ fn main() -> Result<()> { Some(("with-initdb", arg_matches)) => { let cfg = Conf { pg_version: 
*arg_matches - .get_one::("pg-version") + .get_one::("pg-version") .context("'pg-version' is required")?, pg_distrib_dir: arg_matches .get_one::("pg-distrib-dir") diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs index ca9530faef..ef9e854297 100644 --- a/libs/postgres_ffi/wal_craft/src/lib.rs +++ b/libs/postgres_ffi/wal_craft/src/lib.rs @@ -9,8 +9,8 @@ use log::*; use postgres::Client; use postgres::types::PgLsn; use postgres_ffi::{ - WAL_SEGMENT_SIZE, XLOG_BLCKSZ, XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, - XLOG_SIZE_OF_XLOG_SHORT_PHD, + PgMajorVersion, WAL_SEGMENT_SIZE, XLOG_BLCKSZ, XLOG_SIZE_OF_XLOG_LONG_PHD, + XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, }; macro_rules! xlog_utils_test { @@ -29,7 +29,7 @@ macro_rules! xlog_utils_test { postgres_ffi::for_all_postgres_versions! { xlog_utils_test } pub struct Conf { - pub pg_version: u32, + pub pg_version: PgMajorVersion, pub pg_distrib_dir: PathBuf, pub datadir: PathBuf, } @@ -52,11 +52,7 @@ impl Conf { pub fn pg_distrib_dir(&self) -> anyhow::Result { let path = self.pg_distrib_dir.clone(); - #[allow(clippy::manual_range_patterns)] - match self.pg_version { - 14 | 15 | 16 | 17 => Ok(path.join(format!("v{}", self.pg_version))), - _ => bail!("Unsupported postgres version: {}", self.pg_version), - } + Ok(path.join(self.pg_version.v_str())) } fn pg_bin_dir(&self) -> anyhow::Result { diff --git a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs index 94371a35b5..366aa7dbef 100644 --- a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs +++ b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs @@ -24,7 +24,7 @@ fn init_logging() { fn test_end_of_wal(test_name: &str) { use crate::*; - let pg_version = PG_MAJORVERSION[1..3].parse::().unwrap(); + let pg_version = MY_PGVERSION; // Craft some WAL let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) diff --git a/libs/postgres_initdb/Cargo.toml 
b/libs/postgres_initdb/Cargo.toml index 1605279bce..5b3b0cd936 100644 --- a/libs/postgres_initdb/Cargo.toml +++ b/libs/postgres_initdb/Cargo.toml @@ -9,4 +9,5 @@ anyhow.workspace = true tokio.workspace = true camino.workspace = true thiserror.workspace = true +postgres_versioninfo.workspace = true workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/postgres_initdb/src/lib.rs b/libs/postgres_initdb/src/lib.rs index 4b4a597f73..a0c6ebef81 100644 --- a/libs/postgres_initdb/src/lib.rs +++ b/libs/postgres_initdb/src/lib.rs @@ -7,12 +7,13 @@ use std::fmt; use camino::Utf8Path; +use postgres_versioninfo::PgMajorVersion; pub struct RunInitdbArgs<'a> { pub superuser: &'a str, pub locale: &'a str, pub initdb_bin: &'a Utf8Path, - pub pg_version: u32, + pub pg_version: PgMajorVersion, pub library_search_path: &'a Utf8Path, pub pgdata: &'a Utf8Path, } @@ -79,12 +80,16 @@ pub async fn do_run_initdb(args: RunInitdbArgs<'_>) -> Result<(), Error> { .stderr(std::process::Stdio::piped()); // Before version 14, only the libc provide was available. - if pg_version > 14 { + if pg_version > PgMajorVersion::PG14 { // Version 17 brought with it a builtin locale provider which only provides // C and C.UTF-8. While being safer for collation purposes since it is // guaranteed to be consistent throughout a major release, it is also more // performant. 
- let locale_provider = if pg_version >= 17 { "builtin" } else { "libc" }; + let locale_provider = if pg_version >= PgMajorVersion::PG17 { + "builtin" + } else { + "libc" + }; initdb_command.args(["--locale-provider", locale_provider]); } diff --git a/libs/postgres_versioninfo/Cargo.toml b/libs/postgres_versioninfo/Cargo.toml new file mode 100644 index 0000000000..cc59f9698d --- /dev/null +++ b/libs/postgres_versioninfo/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "postgres_versioninfo" +version = "0.1.0" +edition = "2024" +license.workspace = true + +[dependencies] +anyhow.workspace = true +thiserror.workspace = true +serde.workspace = true +serde_repr.workspace = true +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/postgres_versioninfo/src/lib.rs b/libs/postgres_versioninfo/src/lib.rs new file mode 100644 index 0000000000..286507b654 --- /dev/null +++ b/libs/postgres_versioninfo/src/lib.rs @@ -0,0 +1,175 @@ +use serde::{Deserialize, Deserializer, Serialize, Serializer}; +use serde_repr::{Deserialize_repr, Serialize_repr}; +use std::fmt::{Display, Formatter}; +use std::str::FromStr; + +/// An enum with one variant for each major version of PostgreSQL that we support. +/// +#[derive(Debug, Clone, Copy, Ord, PartialOrd, Eq, PartialEq, Deserialize_repr, Serialize_repr)] +#[repr(u32)] +pub enum PgMajorVersion { + PG14 = 14, + PG15 = 15, + PG16 = 16, + PG17 = 17, + // !!! 
When you add a new PgMajorVersion, don't forget to update PgMajorVersion::ALL +} + +/// A full PostgreSQL version ID, in MMmmbb numerical format (Major/minor/bugfix) +#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] +#[repr(transparent)] +pub struct PgVersionId(u32); + +impl PgVersionId { + pub const UNKNOWN: PgVersionId = PgVersionId(0); + + pub fn from_full_pg_version(version: u32) -> PgVersionId { + match version { + 0 => PgVersionId(version), // unknown version + 140000..180000 => PgVersionId(version), + _ => panic!("Invalid full PostgreSQL version ID {version}"), + } + } +} + +impl Display for PgVersionId { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + u32::fmt(&self.0, f) + } +} + +impl Serialize for PgVersionId { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + u32::serialize(&self.0, serializer) + } +} + +impl<'de> Deserialize<'de> for PgVersionId { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + u32::deserialize(deserializer).map(PgVersionId) + } + + fn deserialize_in_place(deserializer: D, place: &mut Self) -> Result<(), D::Error> + where + D: Deserializer<'de>, + { + u32::deserialize_in_place(deserializer, &mut place.0) + } +} + +impl PgMajorVersion { + /// Get the numerical representation of the represented Major Version + pub const fn major_version_num(&self) -> u32 { + match self { + PgMajorVersion::PG14 => 14, + PgMajorVersion::PG15 => 15, + PgMajorVersion::PG16 => 16, + PgMajorVersion::PG17 => 17, + } + } + + /// Get the contents of this version's PG_VERSION file. + /// + /// The PG_VERSION file is used to determine the PostgreSQL version that currently + /// owns the data in a PostgreSQL data directory. 
+ pub fn versionfile_string(&self) -> &'static str { + match self { + PgMajorVersion::PG14 => "14", + PgMajorVersion::PG15 => "15", + PgMajorVersion::PG16 => "16\x0A", + PgMajorVersion::PG17 => "17\x0A", + } + } + + /// Get the v{version} string of this major PostgreSQL version. + /// + /// Because this was hand-coded in various places, this was moved into a shared + /// implementation. + pub fn v_str(&self) -> String { + match self { + PgMajorVersion::PG14 => "v14", + PgMajorVersion::PG15 => "v15", + PgMajorVersion::PG16 => "v16", + PgMajorVersion::PG17 => "v17", + } + .to_string() + } + + /// All currently supported major versions of PostgreSQL. + pub const ALL: &'static [PgMajorVersion] = &[ + PgMajorVersion::PG14, + PgMajorVersion::PG15, + PgMajorVersion::PG16, + PgMajorVersion::PG17, + ]; +} + +impl Display for PgMajorVersion { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.write_str(match self { + PgMajorVersion::PG14 => "PgMajorVersion::PG14", + PgMajorVersion::PG15 => "PgMajorVersion::PG15", + PgMajorVersion::PG16 => "PgMajorVersion::PG16", + PgMajorVersion::PG17 => "PgMajorVersion::PG17", + }) + } +} + +#[derive(Debug, thiserror::Error)] +#[allow(dead_code)] +pub struct InvalidPgVersion(u32); + +impl Display for InvalidPgVersion { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "InvalidPgVersion({})", self.0) + } +} + +impl TryFrom for PgMajorVersion { + type Error = InvalidPgVersion; + + fn try_from(value: PgVersionId) -> Result { + Ok(match value.0 / 10000 { + 14 => PgMajorVersion::PG14, + 15 => PgMajorVersion::PG15, + 16 => PgMajorVersion::PG16, + 17 => PgMajorVersion::PG17, + _ => return Err(InvalidPgVersion(value.0)), + }) + } +} + +impl From for PgVersionId { + fn from(value: PgMajorVersion) -> Self { + PgVersionId((value as u32) * 10000) + } +} + +#[derive(Debug, PartialEq, Eq, thiserror::Error)] +pub struct PgMajorVersionParseError(String); + +impl Display for PgMajorVersionParseError { + fn fmt(&self, f: 
&mut Formatter<'_>) -> std::fmt::Result { + write!(f, "PgMajorVersionParseError({})", self.0) + } +} + +impl FromStr for PgMajorVersion { + type Err = PgMajorVersionParseError; + + fn from_str(s: &str) -> Result { + Ok(match s { + "14" => PgMajorVersion::PG14, + "15" => PgMajorVersion::PG15, + "16" => PgMajorVersion::PG16, + "17" => PgMajorVersion::PG17, + _ => return Err(PgMajorVersionParseError(s.to_string())), + }) + } +} diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml index d9d080e8fe..928e583b0b 100644 --- a/libs/safekeeper_api/Cargo.toml +++ b/libs/safekeeper_api/Cargo.toml @@ -10,6 +10,7 @@ const_format.workspace = true serde.workspace = true serde_json.workspace = true postgres_ffi.workspace = true +postgres_versioninfo.workspace = true pq_proto.workspace = true tokio.workspace = true utils.workspace = true diff --git a/libs/safekeeper_api/src/lib.rs b/libs/safekeeper_api/src/lib.rs index fa86523ad7..ba0bfee971 100644 --- a/libs/safekeeper_api/src/lib.rs +++ b/libs/safekeeper_api/src/lib.rs @@ -8,6 +8,8 @@ pub mod membership; /// Public API types pub mod models; +pub use postgres_versioninfo::{PgMajorVersion, PgVersionId}; + /// Consensus logical timestamp. Note: it is a part of sk control file. pub type Term = u64; /// With this term timeline is created initially. 
It @@ -20,7 +22,7 @@ pub const INITIAL_TERM: Term = 0; #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct ServerInfo { /// Postgres server version - pub pg_version: u32, + pub pg_version: PgVersionId, pub system_id: SystemId, pub wal_seg_size: u32, } diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index fd05f6fda3..5c1ee41f7b 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -4,6 +4,7 @@ use std::net::SocketAddr; use pageserver_api::shard::ShardIdentity; use postgres_ffi::TimestampTz; +use postgres_versioninfo::PgVersionId; use serde::{Deserialize, Serialize}; use tokio::time::Instant; use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; @@ -23,8 +24,7 @@ pub struct TimelineCreateRequest { pub tenant_id: TenantId, pub timeline_id: TimelineId, pub mconf: Configuration, - /// In the PG_VERSION_NUM macro format, like 140017. - pub pg_version: u32, + pub pg_version: PgVersionId, pub system_id: Option, // By default WAL_SEGMENT_SIZE pub wal_seg_size: Option, diff --git a/libs/wal_decoder/benches/bench_interpret_wal.rs b/libs/wal_decoder/benches/bench_interpret_wal.rs index ff860a92e2..e3956eca05 100644 --- a/libs/wal_decoder/benches/bench_interpret_wal.rs +++ b/libs/wal_decoder/benches/bench_interpret_wal.rs @@ -10,7 +10,7 @@ use futures::StreamExt; use futures::stream::FuturesUnordered; use pageserver_api::shard::{ShardIdentity, ShardStripeSize}; use postgres_ffi::waldecoder::WalStreamDecoder; -use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE}; +use postgres_ffi::{MAX_SEND_SIZE, PgMajorVersion, WAL_SEGMENT_SIZE}; use pprof::criterion::{Output, PProfProfiler}; use remote_storage::{ DownloadOpts, GenericRemoteStorage, ListingMode, RemoteStorageConfig, RemoteStorageKind, @@ -115,7 +115,7 @@ struct BenchmarkData { #[derive(Deserialize)] struct BenchmarkMetadata { - pg_version: u32, + pg_version: PgMajorVersion, start_lsn: Lsn, } diff --git 
a/libs/wal_decoder/src/decoder.rs b/libs/wal_decoder/src/decoder.rs index 9980a1f369..0843eb35bf 100644 --- a/libs/wal_decoder/src/decoder.rs +++ b/libs/wal_decoder/src/decoder.rs @@ -7,8 +7,8 @@ use bytes::{Buf, Bytes}; use pageserver_api::key::rel_block_to_key; use pageserver_api::reltag::{RelTag, SlruKind}; use pageserver_api::shard::ShardIdentity; -use postgres_ffi::pg_constants; use postgres_ffi::walrecord::*; +use postgres_ffi::{PgMajorVersion, pg_constants}; use postgres_ffi_types::forknum::VISIBILITYMAP_FORKNUM; use utils::lsn::Lsn; @@ -24,7 +24,7 @@ impl InterpretedWalRecord { buf: Bytes, shards: &[ShardIdentity], next_record_lsn: Lsn, - pg_version: u32, + pg_version: PgMajorVersion, ) -> anyhow::Result> { let mut decoded = DecodedWALRecord::default(); decode_wal_record(buf, &mut decoded, pg_version)?; @@ -78,7 +78,7 @@ impl MetadataRecord { decoded: &DecodedWALRecord, shard_records: &mut HashMap, next_record_lsn: Lsn, - pg_version: u32, + pg_version: PgMajorVersion, ) -> anyhow::Result<()> { // Note: this doesn't actually copy the bytes since // the [`Bytes`] type implements it via a level of indirection. @@ -193,7 +193,7 @@ impl MetadataRecord { fn decode_heapam_record( buf: &mut Bytes, decoded: &DecodedWALRecord, - pg_version: u32, + pg_version: PgMajorVersion, ) -> anyhow::Result> { // Handle VM bit updates that are implicitly part of heap records. 
@@ -205,7 +205,7 @@ impl MetadataRecord { let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS; match pg_version { - 14 => { + PgMajorVersion::PG14 => { if decoded.xl_rmid == pg_constants::RM_HEAP_ID { let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; @@ -272,7 +272,7 @@ impl MetadataRecord { anyhow::bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); } } - 15 => { + PgMajorVersion::PG15 => { if decoded.xl_rmid == pg_constants::RM_HEAP_ID { let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; @@ -339,7 +339,7 @@ impl MetadataRecord { anyhow::bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); } } - 16 => { + PgMajorVersion::PG16 => { if decoded.xl_rmid == pg_constants::RM_HEAP_ID { let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; @@ -406,7 +406,7 @@ impl MetadataRecord { anyhow::bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); } } - 17 => { + PgMajorVersion::PG17 => { if decoded.xl_rmid == pg_constants::RM_HEAP_ID { let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; @@ -473,7 +473,6 @@ impl MetadataRecord { anyhow::bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); } } - _ => {} } if new_heap_blkno.is_some() || old_heap_blkno.is_some() { @@ -500,7 +499,7 @@ impl MetadataRecord { fn decode_neonmgr_record( buf: &mut Bytes, decoded: &DecodedWALRecord, - pg_version: u32, + pg_version: PgMajorVersion, ) -> anyhow::Result> { // Handle VM bit updates that are implicitly part of heap records. 
@@ -514,7 +513,7 @@ impl MetadataRecord { assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID); match pg_version { - 16 | 17 => { + PgMajorVersion::PG16 | PgMajorVersion::PG17 => { let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; match info { @@ -574,7 +573,7 @@ impl MetadataRecord { info => anyhow::bail!("Unknown WAL record type for Neon RMGR: {}", info), } } - _ => anyhow::bail!( + PgMajorVersion::PG15 | PgMajorVersion::PG14 => anyhow::bail!( "Neon RMGR has no known compatibility with PostgreSQL version {}", pg_version ), @@ -629,116 +628,121 @@ impl MetadataRecord { fn decode_dbase_record( buf: &mut Bytes, decoded: &DecodedWALRecord, - pg_version: u32, + pg_version: PgMajorVersion, ) -> anyhow::Result> { // TODO: Refactor this to avoid the duplication between postgres versions. let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; tracing::debug!(%info, %pg_version, "handle RM_DBASE_ID"); - if pg_version == 14 { - if info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE { - let createdb = XlCreateDatabase::decode(buf); - tracing::debug!("XLOG_DBASE_CREATE v14"); + match pg_version { + PgMajorVersion::PG14 => { + if info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE { + let createdb = XlCreateDatabase::decode(buf); + tracing::debug!("XLOG_DBASE_CREATE v14"); - let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate { - db_id: createdb.db_id, - tablespace_id: createdb.tablespace_id, - src_db_id: createdb.src_db_id, - src_tablespace_id: createdb.src_tablespace_id, - })); + let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate { + db_id: createdb.db_id, + tablespace_id: createdb.tablespace_id, + src_db_id: createdb.src_db_id, + src_tablespace_id: createdb.src_tablespace_id, + })); - return Ok(Some(record)); - } else if info == postgres_ffi::v14::bindings::XLOG_DBASE_DROP { - let dropdb = XlDropDatabase::decode(buf); + return Ok(Some(record)); + } else if info == postgres_ffi::v14::bindings::XLOG_DBASE_DROP { + let 
dropdb = XlDropDatabase::decode(buf); - let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop { - db_id: dropdb.db_id, - tablespace_ids: dropdb.tablespace_ids, - })); + let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop { + db_id: dropdb.db_id, + tablespace_ids: dropdb.tablespace_ids, + })); - return Ok(Some(record)); + return Ok(Some(record)); + } } - } else if pg_version == 15 { - if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG { - tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); - } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY { - // The XLOG record was renamed between v14 and v15, - // but the record format is the same. - // So we can reuse XlCreateDatabase here. - tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY"); + PgMajorVersion::PG15 => { + if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG { + tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); + } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY { + // The XLOG record was renamed between v14 and v15, + // but the record format is the same. + // So we can reuse XlCreateDatabase here. 
+ tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY"); - let createdb = XlCreateDatabase::decode(buf); - let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate { - db_id: createdb.db_id, - tablespace_id: createdb.tablespace_id, - src_db_id: createdb.src_db_id, - src_tablespace_id: createdb.src_tablespace_id, - })); + let createdb = XlCreateDatabase::decode(buf); + let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate { + db_id: createdb.db_id, + tablespace_id: createdb.tablespace_id, + src_db_id: createdb.src_db_id, + src_tablespace_id: createdb.src_tablespace_id, + })); - return Ok(Some(record)); - } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_DROP { - let dropdb = XlDropDatabase::decode(buf); - let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop { - db_id: dropdb.db_id, - tablespace_ids: dropdb.tablespace_ids, - })); + return Ok(Some(record)); + } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_DROP { + let dropdb = XlDropDatabase::decode(buf); + let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop { + db_id: dropdb.db_id, + tablespace_ids: dropdb.tablespace_ids, + })); - return Ok(Some(record)); + return Ok(Some(record)); + } } - } else if pg_version == 16 { - if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG { - tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); - } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY { - // The XLOG record was renamed between v14 and v15, - // but the record format is the same. - // So we can reuse XlCreateDatabase here. - tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY"); + PgMajorVersion::PG16 => { + if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG { + tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); + } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY { + // The XLOG record was renamed between v14 and v15, + // but the record format is the same. 
+ // So we can reuse XlCreateDatabase here. + tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY"); - let createdb = XlCreateDatabase::decode(buf); - let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate { - db_id: createdb.db_id, - tablespace_id: createdb.tablespace_id, - src_db_id: createdb.src_db_id, - src_tablespace_id: createdb.src_tablespace_id, - })); + let createdb = XlCreateDatabase::decode(buf); + let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate { + db_id: createdb.db_id, + tablespace_id: createdb.tablespace_id, + src_db_id: createdb.src_db_id, + src_tablespace_id: createdb.src_tablespace_id, + })); - return Ok(Some(record)); - } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_DROP { - let dropdb = XlDropDatabase::decode(buf); - let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop { - db_id: dropdb.db_id, - tablespace_ids: dropdb.tablespace_ids, - })); + return Ok(Some(record)); + } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_DROP { + let dropdb = XlDropDatabase::decode(buf); + let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop { + db_id: dropdb.db_id, + tablespace_ids: dropdb.tablespace_ids, + })); - return Ok(Some(record)); + return Ok(Some(record)); + } } - } else if pg_version == 17 { - if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_WAL_LOG { - tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); - } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY { - // The XLOG record was renamed between v14 and v15, - // but the record format is the same. - // So we can reuse XlCreateDatabase here. 
- tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY"); + PgMajorVersion::PG17 => { + if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_WAL_LOG { + tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); + } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY { + // The XLOG record was renamed between v14 and v15, + // but the record format is the same. + // So we can reuse XlCreateDatabase here. + tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY"); - let createdb = XlCreateDatabase::decode(buf); - let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate { - db_id: createdb.db_id, - tablespace_id: createdb.tablespace_id, - src_db_id: createdb.src_db_id, - src_tablespace_id: createdb.src_tablespace_id, - })); + let createdb = XlCreateDatabase::decode(buf); + let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate { + db_id: createdb.db_id, + tablespace_id: createdb.tablespace_id, + src_db_id: createdb.src_db_id, + src_tablespace_id: createdb.src_tablespace_id, + })); - return Ok(Some(record)); - } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_DROP { - let dropdb = XlDropDatabase::decode(buf); - let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop { - db_id: dropdb.db_id, - tablespace_ids: dropdb.tablespace_ids, - })); + return Ok(Some(record)); + } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_DROP { + let dropdb = XlDropDatabase::decode(buf); + let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop { + db_id: dropdb.db_id, + tablespace_ids: dropdb.tablespace_ids, + })); - return Ok(Some(record)); + return Ok(Some(record)); + } } } @@ -748,12 +752,12 @@ impl MetadataRecord { fn decode_clog_record( buf: &mut Bytes, decoded: &DecodedWALRecord, - pg_version: u32, + pg_version: PgMajorVersion, ) -> anyhow::Result> { let info = decoded.xl_info & !pg_constants::XLR_INFO_MASK; if info == pg_constants::CLOG_ZEROPAGE { - let pageno = if pg_version < 17 { + let pageno = if pg_version < 
PgMajorVersion::PG17 { buf.get_u32_le() } else { buf.get_u64_le() as u32 @@ -765,7 +769,7 @@ impl MetadataRecord { ClogZeroPage { segno, rpageno }, )))) } else { - assert!(info == pg_constants::CLOG_TRUNCATE); + assert_eq!(info, pg_constants::CLOG_TRUNCATE); let xlrec = XlClogTruncate::decode(buf, pg_version); Ok(Some(MetadataRecord::Clog(ClogRecord::Truncate( @@ -838,14 +842,14 @@ impl MetadataRecord { fn decode_multixact_record( buf: &mut Bytes, decoded: &DecodedWALRecord, - pg_version: u32, + pg_version: PgMajorVersion, ) -> anyhow::Result> { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE || info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE { - let pageno = if pg_version < 17 { + let pageno = if pg_version < PgMajorVersion::PG17 { buf.get_u32_le() } else { buf.get_u64_le() as u32 diff --git a/libs/wal_decoder/src/serialized_batch.rs b/libs/wal_decoder/src/serialized_batch.rs index 4123f7d0ac..ab38ff3d73 100644 --- a/libs/wal_decoder/src/serialized_batch.rs +++ b/libs/wal_decoder/src/serialized_batch.rs @@ -13,7 +13,7 @@ use pageserver_api::keyspace::KeySpace; use pageserver_api::reltag::RelTag; use pageserver_api::shard::ShardIdentity; use postgres_ffi::walrecord::{DecodedBkpBlock, DecodedWALRecord}; -use postgres_ffi::{BLCKSZ, page_is_new, page_set_lsn, pg_constants}; +use postgres_ffi::{BLCKSZ, PgMajorVersion, page_is_new, page_set_lsn, pg_constants}; use serde::{Deserialize, Serialize}; use utils::bin_ser::BeSer; use utils::lsn::Lsn; @@ -139,7 +139,7 @@ impl SerializedValueBatch { decoded: DecodedWALRecord, shard_records: &mut HashMap, next_record_lsn: Lsn, - pg_version: u32, + pg_version: PgMajorVersion, ) -> anyhow::Result<()> { // First determine how big the buffers need to be and allocate it up-front. // This duplicates some of the work below, but it's empirically much faster. 
@@ -267,7 +267,7 @@ impl SerializedValueBatch { fn estimate_buffer_size( decoded: &DecodedWALRecord, shard: &ShardIdentity, - pg_version: u32, + pg_version: PgMajorVersion, ) -> usize { let mut estimate: usize = 0; @@ -303,7 +303,11 @@ impl SerializedValueBatch { estimate } - fn block_is_image(decoded: &DecodedWALRecord, blk: &DecodedBkpBlock, pg_version: u32) -> bool { + fn block_is_image( + decoded: &DecodedWALRecord, + blk: &DecodedBkpBlock, + pg_version: PgMajorVersion, + ) -> bool { blk.apply_image && blk.has_image && decoded.xl_rmid == pg_constants::RM_XLOG_ID diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index 36d0d9c974..efb970f705 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -68,6 +68,7 @@ use pageserver::config::PageServerConf; use pageserver::walredo::{PostgresRedoManager, RedoAttemptType}; use pageserver_api::key::Key; use pageserver_api::shard::TenantShardId; +use postgres_ffi::{BLCKSZ, PgMajorVersion}; use tokio::sync::Barrier; use tokio::task::JoinSet; use utils::id::TenantId; @@ -94,7 +95,7 @@ fn bench(c: &mut Criterion) { // // benchmark the protocol implementation // - let pg_version = 14; + let pg_version = PgMajorVersion::PG14; bench_group!( "ping", Arc::new(move |mgr: Arc| async move { @@ -107,7 +108,7 @@ fn bench(c: &mut Criterion) { let make_redo_work = |req: &'static Request| { Arc::new(move |mgr: Arc| async move { let page = req.execute(&mgr).await.unwrap(); - assert_eq!(page.remaining(), 8192); + assert_eq!(page.remaining(), BLCKSZ as usize); }) }; bench_group!("short", { @@ -208,7 +209,7 @@ struct Request { lsn: Lsn, base_img: Option<(Lsn, Bytes)>, records: Vec<(Lsn, NeonWalRecord)>, - pg_version: u32, + pg_version: PgMajorVersion, } impl Request { @@ -267,7 +268,7 @@ impl Request { pg_record(false, b"\xbc\0\0\0\0\0\0\0h?m\x01\0\0\0\0p\n\0\09\x08\xa3\xea\0 
\x8c\0\x7f\x06\0\0\xd22\0\0\xeb\x04\0\0\0\0\0\0\xff\x02\0@\0\0another_table\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x98\x08\0\0\x02@\0\0\0\0\0\0\n\0\0\0\x02\0\0\0\0@\0\0\0\0\0\0\x05\0\0\0\0@zD\x05\0\0\0\0\0\0\0\0\0pr\x01\0\0\0\0\0\0\0\0\x01d\0\0\0\0\0\0\x04\0\0\x01\0\0\0\x02\0"), ), ], - pg_version: 14, + pg_version: PgMajorVersion::PG14, } } @@ -516,7 +517,7 @@ impl Request { (lsn!("0/16B8000"), pg_record(false, b"C\0\0\0\0\x04\0\0p\x7fk\x01\0\0\0\0\0\n\0\0\\\xc4:?\0 \x12\0\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\xff\x03\x01\0\0\x08\x01\0\0\0\x18\0\xe1\0\0\0\0\0\0\0\xe2\0\0")), (lsn!("0/16CBD68"), pg_record(false, b"@ \0\0\0\0\0\0\xc0|l\x01\0\0\0\0@\t\0\0\xdf\xb0\x1a`\0\x12\0\0\0 \0\0\x04\x7f\x06\0\0\xd22\0\0\0@\0\0\0\0\0\0\x01\x80\0\0\0\0\0\0\xff\x05\0\0\0\0\0\0\0\0\0\0\0\0\x18\0\0 \0 \x04 \0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x04\0\0\x01")), ], - pg_version: 14, + pg_version: PgMajorVersion::PG14, } } } diff --git a/pageserver/client/Cargo.toml b/pageserver/client/Cargo.toml index 970a437a42..47e2a6ddae 100644 --- a/pageserver/client/Cargo.toml +++ b/pageserver/client/Cargo.toml @@ -18,6 +18,7 @@ workspace_hack = { version = "0.1", path = "../../workspace_hack" } tokio-postgres.workspace = true tokio-stream.workspace = true tokio.workspace = true +postgres_versioninfo.workspace = true futures.workspace = true tokio-util.workspace = true anyhow.workspace = true diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 8b091684eb..3919a6e788 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -7,6 +7,7 @@ use 
detach_ancestor::AncestorDetached; use http_utils::error::HttpErrorBody; use pageserver_api::models::*; use pageserver_api::shard::TenantShardId; +use postgres_versioninfo::PgMajorVersion; pub use reqwest::Body as ReqwestBody; use reqwest::{IntoUrl, Method, StatusCode, Url}; use utils::id::{TenantId, TimelineId}; @@ -745,9 +746,11 @@ impl Client { timeline_id: TimelineId, base_lsn: Lsn, end_lsn: Lsn, - pg_version: u32, + pg_version: PgMajorVersion, basebackup_tarball: ReqwestBody, ) -> Result<()> { + let pg_version = pg_version.major_version_num(); + let uri = format!( "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_basebackup?base_lsn={base_lsn}&end_lsn={end_lsn}&pg_version={pg_version}", self.mgmt_api_endpoint, diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index c4aaff58a1..115f0d9ebc 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -20,7 +20,8 @@ use pageserver_api::key::{Key, rel_block_to_key}; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants::{PG_HBA, PGDATA_SPECIAL_FILES}; use postgres_ffi::{ - BLCKSZ, PG_TLI, RELSEG_SIZE, WAL_SEGMENT_SIZE, XLogFileName, dispatch_pgversion, pg_constants, + BLCKSZ, PG_TLI, PgMajorVersion, RELSEG_SIZE, WAL_SEGMENT_SIZE, XLogFileName, + dispatch_pgversion, pg_constants, }; use postgres_ffi_types::constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; use postgres_ffi_types::forknum::{INIT_FORKNUM, MAIN_FORKNUM}; @@ -619,10 +620,7 @@ where }; if spcnode == GLOBALTABLESPACE_OID { - let pg_version_str = match self.timeline.pg_version { - 14 | 15 => self.timeline.pg_version.to_string(), - ver => format!("{ver}\x0A"), - }; + let pg_version_str = self.timeline.pg_version.versionfile_string(); let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?; self.ar .append(&header, pg_version_str.as_bytes()) @@ -679,10 +677,7 @@ where if let Some(img) = relmap_img { let dst_path = format!("base/{dbnode}/PG_VERSION"); - let 
pg_version_str = match self.timeline.pg_version { - 14 | 15 => self.timeline.pg_version.to_string(), - ver => format!("{ver}\x0A"), - }; + let pg_version_str = self.timeline.pg_version.versionfile_string(); let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?; self.ar .append(&header, pg_version_str.as_bytes()) @@ -713,7 +708,7 @@ where buf.extend_from_slice(&img[..]); let crc = crc32c::crc32c(&img[..]); buf.put_u32_le(crc); - let path = if self.timeline.pg_version < 17 { + let path = if self.timeline.pg_version < PgMajorVersion::PG17 { format!("pg_twophase/{xid:>08X}") } else { format!("pg_twophase/{xid:>016X}") diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 3492a8d966..9952496061 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -11,7 +11,7 @@ use std::num::NonZeroUsize; use std::sync::Arc; use std::time::Duration; -use anyhow::{Context, bail, ensure}; +use anyhow::{Context, ensure}; use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; use pageserver_api::config::{ @@ -22,6 +22,7 @@ use pageserver_api::models::ImageCompressionAlgorithm; use pageserver_api::shard::TenantShardId; use pem::Pem; use postgres_backend::AuthType; +use postgres_ffi::PgMajorVersion; use remote_storage::{RemotePath, RemoteStorageConfig}; use reqwest::Url; use storage_broker::Uri; @@ -338,20 +339,16 @@ impl PageServerConf { // // Postgres distribution paths // - pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result { + pub fn pg_distrib_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result { let path = self.pg_distrib_dir.clone(); - #[allow(clippy::manual_range_patterns)] - match pg_version { - 14 | 15 | 16 | 17 => Ok(path.join(format!("v{pg_version}"))), - _ => bail!("Unsupported postgres version: {}", pg_version), - } + Ok(path.join(pg_version.v_str())) } - pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result { + pub fn pg_bin_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result { 
Ok(self.pg_distrib_dir(pg_version)?.join("bin")) } - pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result { + pub fn pg_lib_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result { Ok(self.pg_distrib_dir(pg_version)?.join("lib")) } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 349bc6dba6..3755cbda6a 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -41,6 +41,7 @@ use pageserver_api::models::{ TopTenantShardItem, TopTenantShardsRequest, TopTenantShardsResponse, }; use pageserver_api::shard::{ShardCount, TenantShardId}; +use postgres_ffi::PgMajorVersion; use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError}; use scopeguard::defer; use serde_json::json; @@ -3385,7 +3386,7 @@ async fn put_tenant_timeline_import_basebackup( let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let base_lsn: Lsn = must_parse_query_param(&request, "base_lsn")?; let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?; - let pg_version: u32 = must_parse_query_param(&request, "pg_version")?; + let pg_version: PgMajorVersion = must_parse_query_param(&request, "pg_version")?; check_permission(&request, Some(tenant_id))?; diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index ae7cbf1d6b..0dd3c465e0 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -38,6 +38,7 @@ pub mod walredo; use camino::Utf8Path; use deletion_queue::DeletionQueue; +use postgres_ffi::PgMajorVersion; use tenant::mgr::{BackgroundPurges, TenantManager}; use tenant::secondary; use tracing::{info, info_span}; @@ -51,7 +52,7 @@ use tracing::{info, info_span}; /// backwards-compatible changes to the metadata format. 
pub const STORAGE_FORMAT_VERSION: u16 = 3; -pub const DEFAULT_PG_VERSION: u32 = 17; +pub const DEFAULT_PG_VERSION: PgMajorVersion = PgMajorVersion::PG17; // Magic constants used to identify different kinds of files pub const IMAGE_FILE_MAGIC: u16 = 0x5A60; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 180a5b76e8..09a7a8a651 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -25,7 +25,7 @@ use pageserver_api::keyspace::{KeySpaceRandomAccum, SparseKeySpace}; use pageserver_api::models::RelSizeMigration; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use pageserver_api::shard::ShardIdentity; -use postgres_ffi::{BLCKSZ, TimestampTz, TransactionId}; +use postgres_ffi::{BLCKSZ, PgMajorVersion, TimestampTz, TransactionId}; use postgres_ffi_types::forknum::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi_types::{Oid, RepOriginId}; use serde::{Deserialize, Serialize}; @@ -1081,7 +1081,7 @@ impl Timeline { // fetch directory entry let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?; - if self.pg_version >= 17 { + if self.pg_version >= PgMajorVersion::PG17 { Ok(TwoPhaseDirectoryV17::des(&buf)?.xids) } else { Ok(TwoPhaseDirectory::des(&buf)? 
@@ -1613,7 +1613,7 @@ impl DatadirModification<'_> { .push((DirectoryKind::Db, MetricsUpdate::Set(0))); self.put(DBDIR_KEY, Value::Image(buf.into())); - let buf = if self.tline.pg_version >= 17 { + let buf = if self.tline.pg_version >= PgMajorVersion::PG17 { TwoPhaseDirectoryV17::ser(&TwoPhaseDirectoryV17 { xids: HashSet::new(), }) @@ -1967,7 +1967,7 @@ impl DatadirModification<'_> { ) -> Result<(), WalIngestError> { // Add it to the directory entry let dirbuf = self.get(TWOPHASEDIR_KEY, ctx).await?; - let newdirbuf = if self.tline.pg_version >= 17 { + let newdirbuf = if self.tline.pg_version >= PgMajorVersion::PG17 { let mut dir = TwoPhaseDirectoryV17::des(&dirbuf)?; if !dir.xids.insert(xid) { Err(WalIngestErrorKind::FileAlreadyExists(xid))?; @@ -2383,7 +2383,7 @@ impl DatadirModification<'_> { ) -> Result<(), WalIngestError> { // Remove it from the directory entry let buf = self.get(TWOPHASEDIR_KEY, ctx).await?; - let newdirbuf = if self.tline.pg_version >= 17 { + let newdirbuf = if self.tline.pg_version >= PgMajorVersion::PG17 { let mut dir = TwoPhaseDirectoryV17::des(&buf)?; if !dir.xids.remove(&xid) { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index a48bf15246..c71655ce17 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -38,6 +38,7 @@ use pageserver_api::models::{ WalRedoManagerStatus, }; use pageserver_api::shard::{ShardIdentity, ShardStripeSize, TenantShardId}; +use postgres_ffi::PgMajorVersion; use remote_storage::{DownloadError, GenericRemoteStorage, TimeoutOrCancel}; use remote_timeline_client::index::GcCompactionState; use remote_timeline_client::manifest::{ @@ -497,7 +498,7 @@ impl WalRedoManager { lsn: Lsn, base_img: Option<(Lsn, bytes::Bytes)>, records: Vec<(Lsn, wal_decoder::models::record::NeonWalRecord)>, - pg_version: u32, + pg_version: PgMajorVersion, redo_attempt_type: RedoAttemptType, ) -> Result { match self { @@ -933,7 +934,7 @@ pub(crate) enum CreateTimelineParams { pub(crate) struct 
CreateTimelineParamsBootstrap { pub(crate) new_timeline_id: TimelineId, pub(crate) existing_initdb_timeline_id: Option, - pub(crate) pg_version: u32, + pub(crate) pg_version: PgMajorVersion, } /// NB: See comment on [`CreateTimelineIdempotency::Branch`] for why there's no `pg_version` here. @@ -971,7 +972,7 @@ pub(crate) enum CreateTimelineIdempotency { /// NB: special treatment, see comment in [`Self`]. FailWithConflict, Bootstrap { - pg_version: u32, + pg_version: PgMajorVersion, }, /// NB: branches always have the same `pg_version` as their ancestor. /// While [`pageserver_api::models::TimelineCreateRequestMode::Branch::pg_version`] @@ -2541,7 +2542,7 @@ impl TenantShard { self: &Arc, new_timeline_id: TimelineId, initdb_lsn: Lsn, - pg_version: u32, + pg_version: PgMajorVersion, ctx: &RequestContext, ) -> anyhow::Result<(UninitializedTimeline, RequestContext)> { anyhow::ensure!( @@ -2593,7 +2594,7 @@ impl TenantShard { self: &Arc, new_timeline_id: TimelineId, initdb_lsn: Lsn, - pg_version: u32, + pg_version: PgMajorVersion, ctx: &RequestContext, ) -> anyhow::Result> { let (uninit_tl, ctx) = self @@ -2632,7 +2633,7 @@ impl TenantShard { self: &Arc, new_timeline_id: TimelineId, initdb_lsn: Lsn, - pg_version: u32, + pg_version: PgMajorVersion, ctx: &RequestContext, in_memory_layer_desc: Vec, delta_layer_desc: Vec, @@ -2898,7 +2899,7 @@ impl TenantShard { Lsn(0), initdb_lsn, initdb_lsn, - 15, + PgMajorVersion::PG15, ); this.prepare_new_timeline( new_timeline_id, @@ -5090,7 +5091,7 @@ impl TenantShard { pub(crate) async fn bootstrap_timeline_test( self: &Arc, timeline_id: TimelineId, - pg_version: u32, + pg_version: PgMajorVersion, load_existing_initdb: Option, ctx: &RequestContext, ) -> anyhow::Result> { @@ -5232,7 +5233,7 @@ impl TenantShard { async fn bootstrap_timeline( self: &Arc, timeline_id: TimelineId, - pg_version: u32, + pg_version: PgMajorVersion, load_existing_initdb: Option, ctx: &RequestContext, ) -> Result { @@ -5770,7 +5771,7 @@ impl TenantShard { 
async fn run_initdb( conf: &'static PageServerConf, initdb_target_dir: &Utf8Path, - pg_version: u32, + pg_version: PgMajorVersion, cancel: &CancellationToken, ) -> Result<(), InitdbError> { let initdb_bin_path = conf @@ -6051,7 +6052,7 @@ pub(crate) mod harness { lsn: Lsn, base_img: Option<(Lsn, Bytes)>, records: Vec<(Lsn, NeonWalRecord)>, - _pg_version: u32, + _pg_version: PgMajorVersion, _redo_attempt_type: RedoAttemptType, ) -> Result { let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1)); @@ -6223,7 +6224,7 @@ mod tests { async fn randomize_timeline( tenant: &Arc, new_timeline_id: TimelineId, - pg_version: u32, + pg_version: PgMajorVersion, spec: TestTimelineSpecification, random: &mut rand::rngs::StdRng, ctx: &RequestContext, diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 5081d7f5a4..2f407de951 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -18,6 +18,7 @@ //! [`IndexPart`]: super::remote_timeline_client::index::IndexPart use anyhow::ensure; +use postgres_ffi::PgMajorVersion; use serde::{Deserialize, Serialize}; use utils::bin_ser::{BeSer, SerializeError}; use utils::id::TimelineId; @@ -136,7 +137,7 @@ struct TimelineMetadataBodyV2 { latest_gc_cutoff_lsn: Lsn, initdb_lsn: Lsn, - pg_version: u32, + pg_version: PgMajorVersion, } #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -167,7 +168,7 @@ impl TimelineMetadata { ancestor_lsn: Lsn, latest_gc_cutoff_lsn: Lsn, initdb_lsn: Lsn, - pg_version: u32, + pg_version: PgMajorVersion, ) -> Self { Self { hdr: TimelineMetadataHeader { @@ -215,7 +216,7 @@ impl TimelineMetadata { ancestor_lsn: body.ancestor_lsn, latest_gc_cutoff_lsn: body.latest_gc_cutoff_lsn, initdb_lsn: body.initdb_lsn, - pg_version: 14, // All timelines created before this version had pg_version 14 + pg_version: PgMajorVersion::PG14, // All timelines created before this version had pg_version 14 }; hdr.format_version = 
METADATA_FORMAT_VERSION; @@ -317,7 +318,7 @@ impl TimelineMetadata { self.body.initdb_lsn } - pub fn pg_version(&self) -> u32 { + pub fn pg_version(&self) -> PgMajorVersion { self.body.pg_version } @@ -331,7 +332,7 @@ impl TimelineMetadata { Lsn::from_hex("00000000").unwrap(), Lsn::from_hex("00000000").unwrap(), Lsn::from_hex("00000000").unwrap(), - 0, + PgMajorVersion::PG14, ); let bytes = instance.to_bytes().unwrap(); Self::from_bytes(&bytes).unwrap() @@ -545,7 +546,7 @@ mod tests { Lsn(0), Lsn(0), Lsn(0), - 14, // All timelines created before this version had pg_version 14 + PgMajorVersion::PG14, // All timelines created before this version had pg_version 14 ); assert_eq!( @@ -565,7 +566,7 @@ mod tests { Lsn(0), // Updating this version to 17 will cause the test to fail at the // next assert_eq!(). - 16, + PgMajorVersion::PG16, ); let expected_bytes = vec![ /* TimelineMetadataHeader */ diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index a5cd8989aa..6060c42cbb 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -427,8 +427,8 @@ impl GcBlocking { #[cfg(test)] mod tests { + use postgres_ffi::PgMajorVersion; use std::str::FromStr; - use utils::id::TimelineId; use super::*; @@ -831,7 +831,7 @@ mod tests { Lsn::INVALID, Lsn::from_str("0/1696070").unwrap(), Lsn::from_str("0/1696070").unwrap(), - 14, + PgMajorVersion::PG14, ).with_recalculated_checksum().unwrap(), deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), archived_at: None, @@ -893,7 +893,7 @@ mod tests { Lsn::INVALID, Lsn::from_str("0/1696070").unwrap(), Lsn::from_str("0/1696070").unwrap(), - 14, + PgMajorVersion::PG14, ).with_recalculated_checksum().unwrap(), deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), archived_at: Some(parse_naive_datetime("2023-04-29T09:00:00.123000000")), @@ -957,7 +957,7 @@ mod tests { 
Lsn::INVALID, Lsn::from_str("0/1696070").unwrap(), Lsn::from_str("0/1696070").unwrap(), - 14, + PgMajorVersion::PG14, ).with_recalculated_checksum().unwrap(), deleted_at: None, lineage: Default::default(), @@ -1033,7 +1033,7 @@ mod tests { Lsn::INVALID, Lsn::from_str("0/1696070").unwrap(), Lsn::from_str("0/1696070").unwrap(), - 14, + PgMajorVersion::PG14, ).with_recalculated_checksum().unwrap(), deleted_at: None, lineage: Default::default(), @@ -1114,7 +1114,7 @@ mod tests { Lsn::INVALID, Lsn::from_str("0/1696070").unwrap(), Lsn::from_str("0/1696070").unwrap(), - 14, + PgMajorVersion::PG14, ).with_recalculated_checksum().unwrap(), deleted_at: None, lineage: Default::default(), @@ -1199,7 +1199,7 @@ mod tests { Lsn::INVALID, Lsn::from_str("0/1696070").unwrap(), Lsn::from_str("0/1696070").unwrap(), - 14, + PgMajorVersion::PG14, ).with_recalculated_checksum().unwrap(), deleted_at: None, lineage: Default::default(), @@ -1287,7 +1287,7 @@ mod tests { Lsn::INVALID, Lsn::from_str("0/1696070").unwrap(), Lsn::from_str("0/1696070").unwrap(), - 14, + PgMajorVersion::PG14, ).with_recalculated_checksum().unwrap(), deleted_at: None, lineage: Default::default(), diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 0f31318f0c..c2f76c859c 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -1622,11 +1622,6 @@ impl DeltaLayerIterator<'_> { pub(crate) mod test { use std::collections::BTreeMap; - use bytes::Bytes; - use itertools::MinMaxResult; - use rand::prelude::{SeedableRng, SliceRandom, StdRng}; - use rand::{Rng, RngCore}; - use super::*; use crate::DEFAULT_PG_VERSION; use crate::context::DownloadBehavior; @@ -1636,6 +1631,11 @@ pub(crate) mod test { use crate::tenant::storage_layer::{Layer, ResidentLayer}; use crate::tenant::timeline::layer_manager::LayerManagerLockHolder; use crate::tenant::{TenantShard, Timeline}; + use bytes::Bytes; + 
use itertools::MinMaxResult; + use postgres_ffi::PgMajorVersion; + use rand::prelude::{SeedableRng, SliceRandom, StdRng}; + use rand::{Rng, RngCore}; /// Construct an index for a fictional delta layer and and then /// traverse in order to plan vectored reads for a query. Finally, @@ -1995,7 +1995,7 @@ pub(crate) mod test { let (tenant, ctx) = h.load().await; let ctx = &ctx; let timeline = tenant - .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, ctx) + .create_test_timeline(TimelineId::generate(), Lsn(0x10), PgMajorVersion::PG14, ctx) .await .unwrap(); let ctx = &ctx.with_scope_timeline(&timeline); diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 313c133fa2..9bdce163c9 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -1,6 +1,7 @@ use std::time::UNIX_EPOCH; use pageserver_api::key::{CONTROLFILE_KEY, Key}; +use postgres_ffi::PgMajorVersion; use tokio::task::JoinSet; use utils::completion::{self, Completion}; use utils::id::TimelineId; @@ -45,7 +46,7 @@ async fn smoke_test() { .create_test_timeline_with_layers( TimelineId::generate(), Lsn(0x10), - 14, + PgMajorVersion::PG14, &ctx, Default::default(), // in-memory layers Default::default(), @@ -256,7 +257,12 @@ async fn evict_and_wait_on_wanted_deleted() { let (tenant, ctx) = h.load().await; let timeline = tenant - .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .create_test_timeline( + TimelineId::generate(), + Lsn(0x10), + PgMajorVersion::PG14, + &ctx, + ) .await .unwrap(); @@ -341,7 +347,12 @@ fn read_wins_pending_eviction() { let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); let timeline = tenant - .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .create_test_timeline( + TimelineId::generate(), + Lsn(0x10), + PgMajorVersion::PG14, + &ctx, + ) .await .unwrap(); let ctx = 
ctx.with_scope_timeline(&timeline); @@ -474,7 +485,12 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) { let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); let timeline = tenant - .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .create_test_timeline( + TimelineId::generate(), + Lsn(0x10), + PgMajorVersion::PG14, + &ctx, + ) .await .unwrap(); let ctx = ctx.with_scope_timeline(&timeline); @@ -644,7 +660,12 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() { let (tenant, ctx) = h.load().await; let timeline = tenant - .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .create_test_timeline( + TimelineId::generate(), + Lsn(0x10), + PgMajorVersion::PG14, + &ctx, + ) .await .unwrap(); let ctx = ctx.with_scope_timeline(&timeline); @@ -730,7 +751,12 @@ async fn evict_and_wait_does_not_wait_for_download() { let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1)); let timeline = tenant - .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .create_test_timeline( + TimelineId::generate(), + Lsn(0x10), + PgMajorVersion::PG14, + &ctx, + ) .await .unwrap(); let ctx = ctx.with_scope_timeline(&timeline); @@ -836,7 +862,12 @@ async fn eviction_cancellation_on_drop() { let (tenant, ctx) = h.load().await; let timeline = tenant - .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .create_test_timeline( + TimelineId::generate(), + Lsn(0x10), + PgMajorVersion::PG14, + &ctx, + ) .await .unwrap(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 81f2646e5a..4ca005bfd4 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -58,7 +58,7 @@ use pageserver_api::reltag::{BlockNumber, RelTag}; use pageserver_api::shard::{ShardIdentity, ShardIndex, ShardNumber, TenantShardId}; use postgres_connection::PgConnectionConfig; use 
postgres_ffi::v14::xlog_utils; -use postgres_ffi::{WAL_SEGMENT_SIZE, to_pg_timestamp}; +use postgres_ffi::{PgMajorVersion, WAL_SEGMENT_SIZE, to_pg_timestamp}; use rand::Rng; use remote_storage::DownloadError; use serde_with::serde_as; @@ -225,7 +225,7 @@ pub struct Timeline { /// to shards, and is constant through the lifetime of this Timeline. shard_identity: ShardIdentity, - pub pg_version: u32, + pub pg_version: PgMajorVersion, /// The tuple has two elements. /// 1. `LayerFileManager` keeps track of the various physical representations of the layer files (inmem, local, remote). @@ -2913,7 +2913,7 @@ impl Timeline { shard_identity: ShardIdentity, walredo_mgr: Option>, resources: TimelineResources, - pg_version: u32, + pg_version: PgMajorVersion, state: TimelineState, attach_wal_lag_cooldown: Arc>, create_idempotency: crate::tenant::CreateTimelineIdempotency, @@ -7593,6 +7593,7 @@ mod tests { use std::sync::Arc; use pageserver_api::key::Key; + use postgres_ffi::PgMajorVersion; use std::iter::Iterator; use tracing::Instrument; use utils::id::TimelineId; @@ -7667,7 +7668,7 @@ mod tests { .create_test_timeline_with_layers( TimelineId::generate(), Lsn(0x10), - 14, + PgMajorVersion::PG14, &ctx, Vec::new(), // in-memory layers delta_layers, @@ -7803,7 +7804,7 @@ mod tests { .create_test_timeline_with_layers( TimelineId::generate(), Lsn(0x10), - 14, + PgMajorVersion::PG14, &ctx, Vec::new(), // in-memory layers delta_layers, @@ -7863,7 +7864,12 @@ mod tests { let (tenant, ctx) = harness.load().await; let timeline = tenant - .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .create_test_timeline( + TimelineId::generate(), + Lsn(0x10), + PgMajorVersion::PG14, + &ctx, + ) .await .unwrap(); diff --git a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs index bf2d9875c1..98c44313f1 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs +++ 
b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs @@ -3,7 +3,7 @@ use std::sync::Arc; use anyhow::Context; use bytes::Bytes; -use postgres_ffi::ControlFileData; +use postgres_ffi::{ControlFileData, PgMajorVersion}; use remote_storage::{ Download, DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, Listing, ListingObject, RemotePath, RemoteStorageConfig, @@ -264,7 +264,7 @@ impl ControlFile { pub(crate) fn base_lsn(&self) -> Lsn { Lsn(self.control_file_data.checkPoint).align() } - pub(crate) fn pg_version(&self) -> u32 { + pub(crate) fn pg_version(&self) -> PgMajorVersion { self.try_pg_version() .expect("prepare() checks that try_pg_version doesn't error") } @@ -274,13 +274,14 @@ impl ControlFile { pub(crate) fn control_file_buf(&self) -> &Bytes { &self.control_file_buf } - fn try_pg_version(&self) -> anyhow::Result { + + fn try_pg_version(&self) -> anyhow::Result { Ok(match self.control_file_data.catalog_version_no { // thesea are from catversion.h - 202107181 => 14, - 202209061 => 15, - 202307071 => 16, - 202406281 => 17, + 202107181 => PgMajorVersion::PG14, + 202209061 => PgMajorVersion::PG15, + 202307071 => PgMajorVersion::PG16, + 202406281 => PgMajorVersion::PG17, catversion => { anyhow::bail!("unrecognized catalog version {catversion}") } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index c452f48e40..a597aedee3 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -32,8 +32,8 @@ use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use pageserver_api::shard::ShardIdentity; use postgres_ffi::walrecord::*; use postgres_ffi::{ - TimestampTz, TransactionId, dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, - fsm_logical_to_physical, pg_constants, + PgMajorVersion, TimestampTz, TransactionId, dispatch_pgversion, enum_pgversion, + enum_pgversion_dispatch, fsm_logical_to_physical, pg_constants, }; use postgres_ffi_types::forknum::{FSM_FORKNUM, INIT_FORKNUM, 
MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; use tracing::*; @@ -781,7 +781,7 @@ impl WalIngest { ) -> Result<(), WalIngestError> { let (xact_common, is_commit, is_prepared) = match record { XactRecord::Prepare(XactPrepare { xl_xid, data }) => { - let xid: u64 = if modification.tline.pg_version >= 17 { + let xid: u64 = if modification.tline.pg_version >= PgMajorVersion::PG17 { self.adjust_to_full_transaction_id(xl_xid)? } else { xl_xid as u64 @@ -886,7 +886,7 @@ impl WalIngest { xl_xid, parsed.xid, lsn, ); - let xid: u64 = if modification.tline.pg_version >= 17 { + let xid: u64 = if modification.tline.pg_version >= PgMajorVersion::PG17 { self.adjust_to_full_transaction_id(parsed.xid)? } else { parsed.xid as u64 @@ -1241,7 +1241,7 @@ impl WalIngest { if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN { - let oldest_active_xid = if pg_version >= 17 { + let oldest_active_xid = if pg_version >= PgMajorVersion::PG17 { let mut oldest_active_full_xid = cp.nextXid.value; for xid in modification.tline.list_twophase_files(lsn, ctx).await? 
{ if xid < oldest_active_full_xid { @@ -1475,10 +1475,11 @@ impl WalIngest { const fn rate_limiter( &self, - pg_version: u32, + pg_version: PgMajorVersion, ) -> Option<&Lazy>> { - const MIN_PG_VERSION: u32 = 14; - const MAX_PG_VERSION: u32 = 17; + const MIN_PG_VERSION: u32 = PgMajorVersion::PG14.major_version_num(); + const MAX_PG_VERSION: u32 = PgMajorVersion::PG17.major_version_num(); + let pg_version = pg_version.major_version_num(); if pg_version < MIN_PG_VERSION || pg_version > MAX_PG_VERSION { return None; @@ -1603,6 +1604,7 @@ async fn get_relsize( #[cfg(test)] mod tests { use anyhow::Result; + use postgres_ffi::PgMajorVersion; use postgres_ffi::RELSEG_SIZE; use super::*; @@ -1625,7 +1627,7 @@ mod tests { #[tokio::test] async fn test_zeroed_checkpoint_decodes_correctly() -> Result<(), anyhow::Error> { - for i in 14..=16 { + for i in PgMajorVersion::ALL { dispatch_pgversion!(i, { pgv::CheckPoint::decode(&pgv::ZERO_CHECKPOINT)?; }); @@ -2335,7 +2337,7 @@ mod tests { // 5. Grep sk logs for "restart decoder" to get startpoint // 6. Run just the decoder from this test to get the endpoint. // It's the last LSN the decoder will output. 
- let pg_version = 15; // The test data was generated by pg15 + let pg_version = PgMajorVersion::PG15; // The test data was generated by pg15 let path = "test_data/sk_wal_segment_from_pgbench"; let wal_segment_path = format!("{path}/000000010000000000000001.zst"); let source_initdb_path = format!("{path}/{INITDB_PATH}"); diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 1498f3c83d..b17b5a15f9 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -33,6 +33,7 @@ use bytes::{Bytes, BytesMut}; use pageserver_api::key::Key; use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus}; use pageserver_api::shard::TenantShardId; +use postgres_ffi::PgMajorVersion; use tracing::*; use utils::lsn::Lsn; use utils::sync::gate::GateError; @@ -165,7 +166,7 @@ impl PostgresRedoManager { lsn: Lsn, base_img: Option<(Lsn, Bytes)>, records: Vec<(Lsn, NeonWalRecord)>, - pg_version: u32, + pg_version: PgMajorVersion, redo_attempt_type: RedoAttemptType, ) -> Result { if records.is_empty() { @@ -232,7 +233,7 @@ impl PostgresRedoManager { /// # Cancel-Safety /// /// This method is cancellation-safe. 
- pub async fn ping(&self, pg_version: u32) -> Result<(), Error> { + pub async fn ping(&self, pg_version: PgMajorVersion) -> Result<(), Error> { self.do_with_walredo_process(pg_version, |proc| async move { proc.ping(Duration::from_secs(1)) .await @@ -342,7 +343,7 @@ impl PostgresRedoManager { O, >( &self, - pg_version: u32, + pg_version: PgMajorVersion, closure: F, ) -> Result { let proc: Arc = match self.redo_process.get_or_init_detached().await { @@ -442,7 +443,7 @@ impl PostgresRedoManager { base_img_lsn: Lsn, records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, - pg_version: u32, + pg_version: PgMajorVersion, max_retry_attempts: u32, ) -> Result { *(self.last_redo_at.lock().unwrap()) = Some(Instant::now()); @@ -572,6 +573,7 @@ mod tests { use bytes::Bytes; use pageserver_api::key::Key; use pageserver_api::shard::TenantShardId; + use postgres_ffi::PgMajorVersion; use tracing::Instrument; use utils::id::TenantId; use utils::lsn::Lsn; @@ -586,7 +588,7 @@ mod tests { let h = RedoHarness::new().unwrap(); h.manager - .ping(14) + .ping(PgMajorVersion::PG14) .instrument(h.span()) .await .expect("ping should work"); @@ -612,7 +614,7 @@ mod tests { Lsn::from_str("0/16E2408").unwrap(), None, short_records(), - 14, + PgMajorVersion::PG14, RedoAttemptType::ReadPage, ) .instrument(h.span()) @@ -641,7 +643,7 @@ mod tests { Lsn::from_str("0/16E2408").unwrap(), None, short_records(), - 14, + PgMajorVersion::PG14, RedoAttemptType::ReadPage, ) .instrument(h.span()) @@ -663,7 +665,7 @@ mod tests { Lsn::INVALID, None, short_records(), - 16, /* 16 currently produces stderr output on startup, which adds a nice extra edge */ + PgMajorVersion::PG16, /* 16 currently produces stderr output on startup, which adds a nice extra edge */ RedoAttemptType::ReadPage, ) .instrument(h.span()) diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs index 3dec0593bf..c8b0846480 100644 --- a/pageserver/src/walredo/process.rs +++ b/pageserver/src/walredo/process.rs 
@@ -12,7 +12,7 @@ use anyhow::Context; use bytes::Bytes; use pageserver_api::reltag::RelTag; use pageserver_api::shard::TenantShardId; -use postgres_ffi::BLCKSZ; +use postgres_ffi::{BLCKSZ, PgMajorVersion}; use tokio::io::{AsyncReadExt, AsyncWriteExt}; use tracing::{Instrument, debug, error, instrument}; use utils::lsn::Lsn; @@ -54,11 +54,11 @@ impl WalRedoProcess { // // Start postgres binary in special WAL redo mode. // - #[instrument(skip_all,fields(pg_version=pg_version))] + #[instrument(skip_all,fields(pg_version=pg_version.major_version_num()))] pub(crate) fn launch( conf: &'static PageServerConf, tenant_shard_id: TenantShardId, - pg_version: u32, + pg_version: PgMajorVersion, ) -> anyhow::Result { crate::span::debug_assert_current_span_has_tenant_id(); diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 0a8cc415be..6955028c73 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -58,6 +58,7 @@ metrics.workspace = true pem.workspace = true postgres_backend.workspace = true postgres_ffi.workspace = true +postgres_versioninfo.workspace = true pq_proto.workspace = true remote_storage.workspace = true safekeeper_api.workspace = true diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index 1ad9e62f9b..555cbe457b 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -2,6 +2,7 @@ use std::vec; use anyhow::{Result, bail}; +use postgres_versioninfo::PgVersionId; use pq_proto::SystemId; use safekeeper_api::membership::{Configuration, INVALID_GENERATION}; use safekeeper_api::{ServerInfo, Term}; @@ -46,7 +47,7 @@ struct SafeKeeperStateV1 { #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct ServerInfoV2 { /// Postgres server version - pub pg_version: u32, + pub pg_version: PgVersionId, pub system_id: SystemId, pub tenant_id: TenantId, pub timeline_id: TimelineId, @@ -75,7 +76,7 @@ pub struct SafeKeeperStateV2 { #[derive(Debug, Clone, 
PartialEq, Eq, Serialize, Deserialize)] pub struct ServerInfoV3 { /// Postgres server version - pub pg_version: u32, + pub pg_version: PgVersionId, pub system_id: SystemId, #[serde(with = "hex")] pub tenant_id: TenantId, @@ -444,13 +445,13 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result TimelinePersisten mod tests { use std::str::FromStr; + use postgres_versioninfo::PgMajorVersion; use utils::Hex; use utils::id::NodeId; @@ -563,7 +565,7 @@ mod tests { epoch: 43, }, server: ServerInfoV2 { - pg_version: 14, + pg_version: PgVersionId::from(PgMajorVersion::PG14), system_id: 0x1234567887654321, tenant_id, timeline_id, @@ -586,8 +588,8 @@ mod tests { 0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // epoch 0x2b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - // pg_version - 0x0e, 0x00, 0x00, 0x00, + // pg_version = 140000 + 0xE0, 0x22, 0x02, 0x00, // system_id 0x21, 0x43, 0x65, 0x87, 0x78, 0x56, 0x34, 0x12, // tenant_id @@ -626,7 +628,7 @@ mod tests { }]), }, server: ServerInfoV2 { - pg_version: 14, + pg_version: PgVersionId::from(PgMajorVersion::PG14), system_id: 0x1234567887654321, tenant_id, timeline_id, @@ -646,7 +648,7 @@ mod tests { let expected = [ 0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x21, 0x43, 0x65, 0x87, 0x78, 0x56, + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x22, 0x02, 0x00, 0x21, 0x43, 0x65, 0x87, 0x78, 0x56, 0x34, 0x12, 0xcf, 0x04, 0x80, 0x92, 0x97, 0x07, 0xee, 0x75, 0x37, 0x23, 0x37, 0xef, 0xaa, 0x5e, 0xcf, 0x96, 0x11, 0x2d, 0xed, 0x66, 0x42, 0x2a, 0xa5, 0xe9, 0x53, 0xe5, 0x44, 0x0f, 0xa5, 0x42, 0x7a, 0xc4, 0x78, 0x56, 0x34, 0x12, 0xc4, 0x7a, 0x42, 0xa5, @@ -675,7 +677,7 @@ mod tests { }]), }, server: ServerInfoV3 { - pg_version: 14, + pg_version: PgVersionId::from(PgMajorVersion::PG14), system_id: 0x1234567887654321, tenant_id, timeline_id, @@ -695,7 +697,7 @@ mod tests { 
let expected = [ 0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x21, 0x43, 0x65, 0x87, 0x78, 0x56, + 0x00, 0x00, 0x00, 0x00, 0xE0, 0x22, 0x02, 0x00, 0x21, 0x43, 0x65, 0x87, 0x78, 0x56, 0x34, 0x12, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x63, 0x66, 0x30, 0x34, 0x38, 0x30, 0x39, 0x32, 0x39, 0x37, 0x30, 0x37, 0x65, 0x65, 0x37, 0x35, 0x33, 0x37, 0x32, 0x33, 0x33, 0x37, 0x65, 0x66, 0x61, 0x61, 0x35, 0x65, 0x63, 0x66, 0x39, 0x36, @@ -731,7 +733,7 @@ mod tests { }]), }, server: ServerInfo { - pg_version: 14, + pg_version: PgVersionId::from(PgMajorVersion::PG14), system_id: 0x1234567887654321, wal_seg_size: 0x12345678, }, @@ -765,7 +767,7 @@ mod tests { 0x30, 0x66, 0x61, 0x35, 0x34, 0x32, 0x37, 0x61, 0x63, 0x34, 0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x0e, 0x00, 0x00, 0x00, 0x21, 0x43, 0x65, 0x87, 0x78, 0x56, 0x34, 0x12, 0x78, 0x56, + 0xE0, 0x22, 0x02, 0x00, 0x21, 0x43, 0x65, 0x87, 0x78, 0x56, 0x34, 0x12, 0x78, 0x56, 0x34, 0x12, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x63, 0x34, 0x37, 0x61, 0x34, 0x32, 0x61, 0x35, 0x30, 0x66, 0x34, 0x34, 0x65, 0x35, 0x35, 0x33, 0x65, 0x39, 0x61, 0x35, 0x32, 0x61, 0x34, 0x32, 0x36, 0x36, 0x65, 0x64, 0x32, 0x64, 0x31, 0x31, diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index aa19b6d283..4d15fc9de3 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -9,6 +9,7 @@ use anyhow::{Context, Result, bail}; use byteorder::{LittleEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use postgres_ffi::{MAX_SEND_SIZE, TimeLineID}; +use postgres_versioninfo::{PgMajorVersion, PgVersionId}; use pq_proto::SystemId; use safekeeper_api::membership::{ 
INVALID_GENERATION, MemberSet, SafekeeperGeneration as Generation, SafekeeperId, @@ -29,7 +30,7 @@ use crate::{control_file, wal_storage}; pub const SK_PROTO_VERSION_2: u32 = 2; pub const SK_PROTO_VERSION_3: u32 = 3; -pub const UNKNOWN_SERVER_VERSION: u32 = 0; +pub const UNKNOWN_SERVER_VERSION: PgVersionId = PgVersionId::UNKNOWN; #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] pub struct TermLsn { @@ -218,7 +219,7 @@ pub struct ProposerGreeting { pub timeline_id: TimelineId, pub mconf: membership::Configuration, /// Postgres server version - pub pg_version: u32, + pub pg_version: PgVersionId, pub system_id: SystemId, pub wal_seg_size: u32, } @@ -229,7 +230,7 @@ pub struct ProposerGreetingV2 { /// proposer-acceptor protocol version pub protocol_version: u32, /// Postgres server version - pub pg_version: u32, + pub pg_version: PgVersionId, pub proposer_id: PgUuid, pub system_id: SystemId, pub timeline_id: TimelineId, @@ -511,7 +512,7 @@ impl ProposerAcceptorMessage { tenant_id, timeline_id, mconf, - pg_version, + pg_version: PgVersionId::from_full_pg_version(pg_version), system_id, wal_seg_size, }; @@ -961,7 +962,8 @@ where * because safekeepers parse WAL headers and the format * may change between versions. */ - if msg.pg_version / 10000 != self.state.server.pg_version / 10000 + if PgMajorVersion::try_from(msg.pg_version)? + != PgMajorVersion::try_from(self.state.server.pg_version)? 
&& self.state.server.pg_version != UNKNOWN_SERVER_VERSION { bail!( @@ -1748,7 +1750,7 @@ mod tests { }]), }, server: ServerInfo { - pg_version: 14, + pg_version: PgVersionId::from_full_pg_version(140000), system_id: 0x1234567887654321, wal_seg_size: 0x12345678, }, diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs index 2b1fd7b854..2192f5eab4 100644 --- a/safekeeper/src/send_interpreted_wal.rs +++ b/safekeeper/src/send_interpreted_wal.rs @@ -8,8 +8,8 @@ use futures::StreamExt; use futures::future::Either; use pageserver_api::shard::ShardIdentity; use postgres_backend::{CopyStreamHandlerEnd, PostgresBackend}; -use postgres_ffi::get_current_timestamp; use postgres_ffi::waldecoder::{WalDecodeError, WalStreamDecoder}; +use postgres_ffi::{PgMajorVersion, get_current_timestamp}; use pq_proto::{BeMessage, InterpretedWalRecordsBody, WalSndKeepAlive}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::sync::mpsc::error::SendError; @@ -78,7 +78,7 @@ pub(crate) struct InterpretedWalReader { shard_senders: HashMap>, shard_notification_rx: Option>, state: Arc>, - pg_version: u32, + pg_version: PgMajorVersion, } /// A handle for [`InterpretedWalReader`] which allows for interacting with it @@ -258,7 +258,7 @@ impl InterpretedWalReader { start_pos: Lsn, tx: tokio::sync::mpsc::Sender, shard: ShardIdentity, - pg_version: u32, + pg_version: PgMajorVersion, appname: &Option, ) -> InterpretedWalReaderHandle { let state = Arc::new(std::sync::RwLock::new(InterpretedWalReaderState::Running { @@ -322,7 +322,7 @@ impl InterpretedWalReader { start_pos: Lsn, tx: tokio::sync::mpsc::Sender, shard: ShardIdentity, - pg_version: u32, + pg_version: PgMajorVersion, shard_notification_rx: Option< tokio::sync::mpsc::UnboundedReceiver, >, @@ -718,7 +718,7 @@ mod tests { use std::time::Duration; use pageserver_api::shard::{ShardIdentity, ShardStripeSize}; - use postgres_ffi::MAX_SEND_SIZE; + use postgres_ffi::{MAX_SEND_SIZE, PgMajorVersion}; use 
tokio::sync::mpsc::error::TryRecvError; use utils::id::{NodeId, TenantTimelineId}; use utils::lsn::Lsn; @@ -734,7 +734,7 @@ mod tests { const SIZE: usize = 8 * 1024; const MSG_COUNT: usize = 200; - const PG_VERSION: u32 = 17; + const PG_VERSION: PgMajorVersion = PgMajorVersion::PG17; const SHARD_COUNT: u8 = 2; let start_lsn = Lsn::from_str("0/149FD18").unwrap(); @@ -876,7 +876,7 @@ mod tests { const SIZE: usize = 8 * 1024; const MSG_COUNT: usize = 200; - const PG_VERSION: u32 = 17; + const PG_VERSION: PgMajorVersion = PgMajorVersion::PG17; const SHARD_COUNT: u8 = 2; let start_lsn = Lsn::from_str("0/149FD18").unwrap(); @@ -1025,7 +1025,7 @@ mod tests { const SIZE: usize = 64 * 1024; const MSG_COUNT: usize = 10; - const PG_VERSION: u32 = 17; + const PG_VERSION: PgMajorVersion = PgMajorVersion::PG17; const SHARD_COUNT: u8 = 2; const WAL_READER_BATCH_SIZE: usize = 8192; @@ -1148,7 +1148,7 @@ mod tests { const SIZE: usize = 8 * 1024; const MSG_COUNT: usize = 10; - const PG_VERSION: u32 = 17; + const PG_VERSION: PgMajorVersion = PgMajorVersion::PG17; let start_lsn = Lsn::from_str("0/149FD18").unwrap(); let env = Env::new(true).unwrap(); diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 05f827494e..177e759db5 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -12,7 +12,7 @@ use futures::FutureExt; use itertools::Itertools; use parking_lot::Mutex; use postgres_backend::{CopyStreamHandlerEnd, PostgresBackend, PostgresBackendReader, QueryError}; -use postgres_ffi::{MAX_SEND_SIZE, TimestampTz, get_current_timestamp}; +use postgres_ffi::{MAX_SEND_SIZE, PgMajorVersion, TimestampTz, get_current_timestamp}; use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody}; use safekeeper_api::Term; use safekeeper_api::models::{ @@ -559,7 +559,9 @@ impl SafekeeperPostgresHandler { format, compression, } => { - let pg_version = tli.tli.get_state().await.1.server.pg_version / 10000; + let pg_version = + 
PgMajorVersion::try_from(tli.tli.get_state().await.1.server.pg_version) + .unwrap(); let end_watch_view = end_watch.view(); let wal_residence_guard = tli.wal_residence_guard().await?; let (tx, rx) = tokio::sync::mpsc::channel::(2); diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index 7533005c35..b6cf73be2e 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -7,6 +7,7 @@ use std::time::SystemTime; use anyhow::{Result, bail}; use postgres_ffi::WAL_SEGMENT_SIZE; +use postgres_versioninfo::{PgMajorVersion, PgVersionId}; use safekeeper_api::membership::Configuration; use safekeeper_api::models::{TimelineMembershipSwitchResponse, TimelineTermBumpResponse}; use safekeeper_api::{INITIAL_TERM, ServerInfo, Term}; @@ -149,8 +150,8 @@ impl TimelinePersistentState { &TenantTimelineId::empty(), Configuration::empty(), ServerInfo { - pg_version: 170000, /* Postgres server version (major * 10000) */ - system_id: 0, /* Postgres system identifier */ + pg_version: PgVersionId::from(PgMajorVersion::PG17), + system_id: 0, /* Postgres system identifier */ wal_seg_size: WAL_SEGMENT_SIZE as u32, }, Lsn::INVALID, diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index e68c9f3a99..da00df2dd7 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -19,6 +19,7 @@ use futures::future::BoxFuture; use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogFromFileName}; use postgres_ffi::waldecoder::WalStreamDecoder; use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo, dispatch_pgversion}; +use postgres_versioninfo::{PgMajorVersion, PgVersionId}; use pq_proto::SystemId; use remote_storage::RemotePath; use std::sync::Arc; @@ -92,7 +93,7 @@ pub struct PhysicalStorage { /// Size of WAL segment in bytes. wal_seg_size: usize, - pg_version: u32, + pg_version: PgVersionId, system_id: u64, /// Written to disk, but possibly still in the cache and not fully persisted. 
@@ -180,7 +181,7 @@ impl PhysicalStorage { let write_lsn = if state.commit_lsn == Lsn(0) { Lsn(0) } else { - let version = state.server.pg_version / 10000; + let version = PgMajorVersion::try_from(state.server.pg_version).unwrap(); dispatch_pgversion!( version, @@ -226,7 +227,10 @@ impl PhysicalStorage { write_record_lsn: write_lsn, flush_lsn, flush_record_lsn: flush_lsn, - decoder: WalStreamDecoder::new(write_lsn, state.server.pg_version / 10000), + decoder: WalStreamDecoder::new( + write_lsn, + PgMajorVersion::try_from(state.server.pg_version).unwrap(), + ), file: None, pending_wal_truncation: true, }) @@ -408,7 +412,7 @@ impl Storage for PhysicalStorage { let segno = init_lsn.segment_number(self.wal_seg_size); let (mut file, _) = self.open_or_create(segno).await?; - let major_pg_version = self.pg_version / 10000; + let major_pg_version = PgMajorVersion::try_from(self.pg_version).unwrap(); let wal_seg = postgres_ffi::generate_wal_segment(segno, self.system_id, major_pg_version, init_lsn)?; file.seek(SeekFrom::Start(0)).await?; @@ -654,7 +658,7 @@ pub struct WalReader { // pos is in the same segment as timeline_start_lsn. timeline_start_lsn: Lsn, // integer version number of PostgreSQL, e.g. 
14; 15; 16 - pg_version: u32, + pg_version: PgMajorVersion, system_id: SystemId, timeline_start_segment: Option, } @@ -697,7 +701,7 @@ impl WalReader { wal_backup, local_start_lsn: state.local_start_lsn, timeline_start_lsn: state.timeline_start_lsn, - pg_version: state.server.pg_version / 10000, + pg_version: PgMajorVersion::try_from(state.server.pg_version).unwrap(), system_id: state.server.system_id, timeline_start_segment: None, }) diff --git a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs index 94a849b5f0..029f8fab0a 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs @@ -7,8 +7,8 @@ use anyhow::Result; use bytes::{Buf, BytesMut}; use futures::future::BoxFuture; use parking_lot::Mutex; -use postgres_ffi::XLogSegNo; use postgres_ffi::waldecoder::WalStreamDecoder; +use postgres_ffi::{PgMajorVersion, XLogSegNo}; use safekeeper::metrics::WalStorageMetrics; use safekeeper::state::TimelinePersistentState; use safekeeper::{control_file, wal_storage}; @@ -142,7 +142,7 @@ impl DiskWALStorage { write_lsn, write_record_lsn: flush_lsn, flush_record_lsn: flush_lsn, - decoder: WalStreamDecoder::new(flush_lsn, 16), + decoder: WalStreamDecoder::new(flush_lsn, PgMajorVersion::PG16), unflushed_bytes: BytesMut::new(), disk, }) @@ -151,7 +151,7 @@ impl DiskWALStorage { fn find_end_of_wal(disk: Arc, start_lsn: Lsn) -> Result { let mut buf = [0; 8192]; let mut pos = start_lsn.0; - let mut decoder = WalStreamDecoder::new(start_lsn, 16); + let mut decoder = WalStreamDecoder::new(start_lsn, PgMajorVersion::PG16); let mut result = start_lsn; loop { disk.wal.lock().read(pos, &mut buf); @@ -204,7 +204,7 @@ impl wal_storage::Storage for DiskWALStorage { self.decoder.available(), startpos, ); - self.decoder = WalStreamDecoder::new(startpos, 16); + self.decoder = WalStreamDecoder::new(startpos, PgMajorVersion::PG16); } self.decoder.feed_bytes(buf); loop { @@ 
-242,7 +242,7 @@ impl wal_storage::Storage for DiskWALStorage { self.write_record_lsn = end_pos; self.flush_record_lsn = end_pos; self.unflushed_bytes.clear(); - self.decoder = WalStreamDecoder::new(end_pos, 16); + self.decoder = WalStreamDecoder::new(end_pos, PgMajorVersion::PG16); Ok(()) } diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index 193a1833a7..fec81fb661 100644 --- a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -18,6 +18,7 @@ use pageserver_api::controller_api::{ SafekeeperDescribeResponse, SkSchedulingPolicy, TimelineImportRequest, }; use pageserver_api::models::{SafekeeperInfo, SafekeepersInfo, TimelineInfo}; +use safekeeper_api::PgVersionId; use safekeeper_api::membership::{MemberSet, SafekeeperGeneration, SafekeeperId}; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; @@ -44,7 +45,7 @@ impl Service { &self, tenant_id: TenantId, timeline_id: TimelineId, - pg_version: u32, + pg_version: PgVersionId, timeline_persistence: &TimelinePersistence, ) -> Result, ApiError> { // If quorum is reached, return if we are outside of a specified timeout @@ -219,7 +220,7 @@ impl Service { read_only: bool, ) -> Result { let timeline_id = timeline_info.timeline_id; - let pg_version = timeline_info.pg_version * 10000; + let pg_version = PgVersionId::from(timeline_info.pg_version); // Initially start_lsn is determined by last_record_lsn in pageserver // response as it does initdb. 
However, later we persist it and in sk // creation calls replace with the value from the timeline row if it diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index fa5c9aa693..920c538069 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -172,7 +172,7 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE env.initial_tenant, env.initial_timeline, MembershipConfiguration(generation=1, members=[sk.safekeeper_id()], new_members=None), - int(env.pg_version), + int(env.pg_version) * 10000, Lsn(0), None, ) From aa7572201097d3b36e09cdeb74b35b7de13344c0 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 24 Jun 2025 12:42:23 -0500 Subject: [PATCH 11/50] Set pgaudit.log=none for monitoring connections (#12137) pgaudit can spam logs due to all the monitoring that we do. Logs from these connections are not necessary for HIPPA compliance, so we can stop logging from those connections. 
Part-of: https://github.com/neondatabase/cloud/issues/29574 Signed-off-by: Tristan Partin --- compute/Makefile | 4 ++-- compute/vm-image-spec-bookworm.yaml | 4 ++-- compute/vm-image-spec-bullseye.yaml | 4 ++-- compute_tools/src/compute.rs | 2 +- test_runner/regress/test_compute_metrics.py | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/compute/Makefile b/compute/Makefile index c53d040887..ef2e55f7b1 100644 --- a/compute/Makefile +++ b/compute/Makefile @@ -22,7 +22,7 @@ sql_exporter.yml: $(jsonnet_files) --output-file etc/$@ \ --tla-str collector_name=neon_collector \ --tla-str collector_file=neon_collector.yml \ - --tla-str 'connection_string=postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter' \ + --tla-str 'connection_string=postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter&pgaudit.log=none' \ etc/sql_exporter.jsonnet sql_exporter_autoscaling.yml: $(jsonnet_files) @@ -30,7 +30,7 @@ sql_exporter_autoscaling.yml: $(jsonnet_files) --output-file etc/$@ \ --tla-str collector_name=neon_collector_autoscaling \ --tla-str collector_file=neon_collector_autoscaling.yml \ - --tla-str 'connection_string=postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling' \ + --tla-str 'connection_string=postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling&pgaudit.log=none' \ etc/sql_exporter.jsonnet .PHONY: clean diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml index 057099994a..267e4c83b5 100644 --- a/compute/vm-image-spec-bookworm.yaml +++ b/compute/vm-image-spec-bookworm.yaml @@ -26,7 +26,7 @@ commands: - name: postgres-exporter user: nobody sysvInitAction: respawn - shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter 
--config.file=/etc/postgres_exporter.yml' + shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter pgaudit.log=none" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml' - name: pgbouncer-exporter user: postgres sysvInitAction: respawn @@ -59,7 +59,7 @@ files: # the rules use ALL as the hostname. Avoid the pointless lookups and the "unable to # resolve host" log messages that they generate. Defaults !fqdn - + # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD), # regardless of hostname (ALL) diff --git a/compute/vm-image-spec-bullseye.yaml b/compute/vm-image-spec-bullseye.yaml index d048e20b2e..2b6e77b656 100644 --- a/compute/vm-image-spec-bullseye.yaml +++ b/compute/vm-image-spec-bullseye.yaml @@ -26,7 +26,7 @@ commands: - name: postgres-exporter user: nobody sysvInitAction: respawn - shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml' + shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter pgaudit.log=none" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml' - name: pgbouncer-exporter user: postgres sysvInitAction: respawn @@ -59,7 +59,7 @@ files: # the rules use ALL as the hostname. Avoid the pointless lookups and the "unable to # resolve host" log messages that they generate. 
Defaults !fqdn - + # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD), # regardless of hostname (ALL) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index f84a5f0841..70b2d28bf2 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -405,7 +405,7 @@ impl ComputeNode { // that can affect `compute_ctl` and prevent it from properly configuring the database schema. // Unset them via connection string options before connecting to the database. // N.B. keep it in sync with `ZENITH_OPTIONS` in `get_maintenance_client()`. - const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0"; + const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0 -c pgaudit.log=none"; let options = match conn_conf.get_options() { // Allow the control plane to override any options set by the // compute diff --git a/test_runner/regress/test_compute_metrics.py b/test_runner/regress/test_compute_metrics.py index c751a3e7cc..d1e61e597c 100644 --- a/test_runner/regress/test_compute_metrics.py +++ b/test_runner/regress/test_compute_metrics.py @@ -418,7 +418,7 @@ def test_sql_exporter_metrics_e2e( pg_user = conn_options["user"] pg_dbname = conn_options["dbname"] pg_application_name = f"sql_exporter{stem_suffix}" - connstr = f"postgresql://{pg_user}@{pg_host}:{pg_port}/{pg_dbname}?sslmode=disable&application_name={pg_application_name}" + connstr = f"postgresql://{pg_user}@{pg_host}:{pg_port}/{pg_dbname}?sslmode=disable&application_name={pg_application_name}&pgaudit.log=none" def escape_go_filepath_match_characters(s: str) -> str: """ From a2d623696c91c1ec832710f6a839eae2c6087d05 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 25 Jun 2025 04:03:02 -0500 Subject: [PATCH 12/50] 
Update pgaudit to latest versions (#12328) These updates contain some bug fixes and are completely backwards compatible with what we currently support in Neon. Link: https://github.com/pgaudit/pgaudit/compare/1.6.2...1.6.3 Link: https://github.com/pgaudit/pgaudit/compare/1.7.0...1.7.1 Link: https://github.com/pgaudit/pgaudit/compare/16.0...16.1 Link: https://github.com/pgaudit/pgaudit/compare/17.0...17.1 Signed-off-by: Tristan Partin Signed-off-by: Tristan Partin --- compute/compute-node.Dockerfile | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 7cd152f614..cf0598eec8 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1568,20 +1568,20 @@ ARG PG_VERSION WORKDIR /ext-src RUN case "${PG_VERSION}" in \ "v14") \ - export PGAUDIT_VERSION=1.6.2 \ - export PGAUDIT_CHECKSUM=1f350d70a0cbf488c0f2b485e3a5c9b11f78ad9e3cbb95ef6904afa1eb3187eb \ + export PGAUDIT_VERSION=1.6.3 \ + export PGAUDIT_CHECKSUM=37a8f5a7cc8d9188e536d15cf0fdc457fcdab2547caedb54442c37f124110919 \ ;; \ "v15") \ - export PGAUDIT_VERSION=1.7.0 \ - export PGAUDIT_CHECKSUM=8f4a73e451c88c567e516e6cba7dc1e23bc91686bb6f1f77f8f3126d428a8bd8 \ + export PGAUDIT_VERSION=1.7.1 \ + export PGAUDIT_CHECKSUM=e9c8e6e092d82b2f901d72555ce0fe7780552f35f8985573796cd7e64b09d4ec \ ;; \ "v16") \ - export PGAUDIT_VERSION=16.0 \ - export PGAUDIT_CHECKSUM=d53ef985f2d0b15ba25c512c4ce967dce07b94fd4422c95bd04c4c1a055fe738 \ + export PGAUDIT_VERSION=16.1 \ + export PGAUDIT_CHECKSUM=3bae908ab70ba0c6f51224009dbcfff1a97bd6104c6273297a64292e1b921fee \ ;; \ "v17") \ - export PGAUDIT_VERSION=17.0 \ - export PGAUDIT_CHECKSUM=7d0d08d030275d525f36cd48b38c6455f1023da863385badff0cec44965bfd8c \ + export PGAUDIT_VERSION=17.1 \ + export PGAUDIT_CHECKSUM=9c5f37504d393486cc75d2ced83f75f5899be64fa85f689d6babb833b4361e6c \ ;; \ *) \ echo "pgaudit is not supported on this PostgreSQL version" && exit 1;; \ From 
7c4c36f5ac236e56f311d412d98b208342c57fa8 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 25 Jun 2025 13:47:56 +0300 Subject: [PATCH 13/50] Remove unnecessary separate installation of libpq (#12287) `make install` compiles and installs libpq. Remove redundant separate step to compile and install it. --- Makefile | 2 -- compute/compute-node.Dockerfile | 3 --- 2 files changed, 5 deletions(-) diff --git a/Makefile b/Makefile index 9824a47255..d39b9b68c8 100644 --- a/Makefile +++ b/Makefile @@ -159,8 +159,6 @@ postgres-%: postgres-configure-% \ postgres-headers-% # to prevent `make install` conflicts with neon's `postgres-headers` +@echo "Compiling PostgreSQL $*" $(MAKE) -C $(BUILD_DIR)/$* MAKELEVEL=0 install - +@echo "Compiling libpq $*" - $(MAKE) -C $(BUILD_DIR)/$*/src/interfaces/libpq install +@echo "Compiling pg_prewarm $*" $(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_prewarm install +@echo "Compiling pg_buffercache $*" diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index cf0598eec8..35ece73030 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -171,9 +171,6 @@ RUN cd postgres && \ eval $CONFIGURE_CMD && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \ - # Install headers - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \ - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \ # Enable some of contrib extensions echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/dblink.control && \ From 1dc01c9bed24846473f04d983d36f7da3f3c04c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 25 Jun 2025 15:40:38 +0200 Subject: [PATCH 14/50] Support cancellations of timelines with hanging ondemand downloads (#12330) In 
`test_layer_download_cancelled_by_config_location`, we simulate hung downloads via the `before-downloading-layer-stream-pausable` failpoint. Then, we cancel a timeline via the `location_config` endpoint. With the new default as of https://github.com/neondatabase/neon/pull/11712, we would be creating the timeline on safekeepers regardless if there have been writes or not, and it turns out the test relied on the timeline not existing on safekeepers, due to a cancellation bug: * as established before, the test makes the read path hang * the timeline cancellation function first cancels the walreceiver, and only then cancels the timeline's token * `WalIngest::new` is requesting a checkpoint, which hits the read path * at cancellation time, we'd be hanging inside the read, not seeing the cancellation of the walreceiver * the test would time out due to the hang This is probably also reproducible in the wild when there is S3 unavailabilies or bottlenecks. So we thought that it's worthwhile to fix the hang issue. The approach chosen in the end involves the `tokio::select` macro. In PR 11712, we originally punted on the test due to the hang and opted it out from the new default, but now we can use the new default. 
Part of https://github.com/neondatabase/neon/issues/12299 --- .../walreceiver/walreceiver_connection.rs | 20 +++++++++++++------ test_runner/regress/test_ondemand_download.py | 6 ------ 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index e91bd5d43a..6d52da1f00 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -275,12 +275,20 @@ pub(super) async fn handle_walreceiver_connection( let copy_stream = replication_client.copy_both_simple(&query).await?; let mut physical_stream = pin!(ReplicationStream::new(copy_stream)); - let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx) - .await - .map_err(|e| match e.kind { - crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled, - _ => WalReceiverError::Other(e.into()), - })?; + let walingest_future = WalIngest::new(timeline.as_ref(), startpoint, &ctx); + let walingest_res = select! { + walingest_res = walingest_future => walingest_res, + _ = cancellation.cancelled() => { + // We are doing reads in WalIngest::new, and those can hang as they come from the network. + // Timeline cancellation hits the walreceiver cancellation token before it hits the timeline global one. 
+ debug!("Connection cancelled"); + return Err(WalReceiverError::Cancelled); + }, + }; + let mut walingest = walingest_res.map_err(|e| match e.kind { + crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled, + _ => WalReceiverError::Other(e.into()), + })?; let (format, compression) = match protocol { PostgresClientProtocol::Interpreted { diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 2b71662669..2590a3fe9d 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -671,12 +671,6 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu """ neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - # On the new mode, the test runs into a cancellation issue, i.e. the walproposer can't shut down - # as it is hang-waiting on the timeline_checkpoint call in WalIngest::new. - neon_env_builder.storage_controller_config = { - "timelines_onto_safekeepers": False, - } - # turn off background tasks so that they don't interfere with the downloads env = neon_env_builder.init_start( initial_tenant_conf={ From 27ca1e21bec2fc90311ec3ac1cad69bced69dd6f Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 25 Jun 2025 14:41:30 +0100 Subject: [PATCH 15/50] [console_redirect_proxy]: fix channel binding (#12238) ## Problem While working more on TLS to compute, I realised that Console Redirect -> pg-sni-router -> compute would break if channel binding was set to prefer. This is because the channel binding data would differ between Console Redirect -> pg-sni-router vs pg-sni-router -> compute. I also noticed that I actually disabled channel binding in #12145, since `connect_raw` would think that the connection didn't support TLS. ## Summary of changes Make sure we specify the channel binding. Make sure that `connect_raw` can see if we have TLS support. 
--- libs/proxy/tokio-postgres2/src/config.rs | 20 ++++++++++++++++--- libs/proxy/tokio-postgres2/src/connect.rs | 4 +++- libs/proxy/tokio-postgres2/src/connect_raw.rs | 12 ++++------- proxy/src/compute/mod.rs | 12 +++++++++-- proxy/src/proxy/tests/mitm.rs | 4 ++-- proxy/src/proxy/tests/mod.rs | 12 +++++------ 6 files changed, 42 insertions(+), 22 deletions(-) diff --git a/libs/proxy/tokio-postgres2/src/config.rs b/libs/proxy/tokio-postgres2/src/config.rs index 243a5bc725..961cbc923e 100644 --- a/libs/proxy/tokio-postgres2/src/config.rs +++ b/libs/proxy/tokio-postgres2/src/config.rs @@ -12,7 +12,9 @@ use tokio::net::TcpStream; use crate::connect::connect; use crate::connect_raw::{RawConnection, connect_raw}; -use crate::tls::{MakeTlsConnect, TlsConnect}; +use crate::connect_tls::connect_tls; +use crate::maybe_tls_stream::MaybeTlsStream; +use crate::tls::{MakeTlsConnect, TlsConnect, TlsStream}; use crate::{Client, Connection, Error}; /// TLS configuration. @@ -238,7 +240,7 @@ impl Config { connect(tls, self).await } - pub async fn connect_raw( + pub async fn tls_and_authenticate( &self, stream: S, tls: T, @@ -247,7 +249,19 @@ impl Config { S: AsyncRead + AsyncWrite + Unpin, T: TlsConnect, { - connect_raw(stream, tls, self).await + let stream = connect_tls(stream, self.ssl_mode, tls).await?; + connect_raw(stream, self).await + } + + pub async fn authenticate( + &self, + stream: MaybeTlsStream, + ) -> Result, Error> + where + S: AsyncRead + AsyncWrite + Unpin, + T: TlsStream + Unpin, + { + connect_raw(stream, self).await } } diff --git a/libs/proxy/tokio-postgres2/src/connect.rs b/libs/proxy/tokio-postgres2/src/connect.rs index f7bc863337..4a07eccf9a 100644 --- a/libs/proxy/tokio-postgres2/src/connect.rs +++ b/libs/proxy/tokio-postgres2/src/connect.rs @@ -9,6 +9,7 @@ use crate::codec::BackendMessage; use crate::config::Host; use crate::connect_raw::connect_raw; use crate::connect_socket::connect_socket; +use crate::connect_tls::connect_tls; use 
crate::tls::{MakeTlsConnect, TlsConnect}; use crate::{Client, Config, Connection, Error, RawConnection}; @@ -44,13 +45,14 @@ where T: TlsConnect, { let socket = connect_socket(host_addr, host, port, config.connect_timeout).await?; + let stream = connect_tls(socket, config.ssl_mode, tls).await?; let RawConnection { stream, parameters, delayed_notice, process_id, secret_key, - } = connect_raw(socket, tls, config).await?; + } = connect_raw(stream, config).await?; let socket_config = SocketConfig { host_addr, diff --git a/libs/proxy/tokio-postgres2/src/connect_raw.rs b/libs/proxy/tokio-postgres2/src/connect_raw.rs index 20dc538cf2..b89a600a2e 100644 --- a/libs/proxy/tokio-postgres2/src/connect_raw.rs +++ b/libs/proxy/tokio-postgres2/src/connect_raw.rs @@ -16,9 +16,8 @@ use tokio_util::codec::Framed; use crate::Error; use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec}; use crate::config::{self, AuthKeys, Config}; -use crate::connect_tls::connect_tls; use crate::maybe_tls_stream::MaybeTlsStream; -use crate::tls::{TlsConnect, TlsStream}; +use crate::tls::TlsStream; pub struct StartupStream { inner: Framed, PostgresCodec>, @@ -87,16 +86,13 @@ pub struct RawConnection { } pub async fn connect_raw( - stream: S, - tls: T, + stream: MaybeTlsStream, config: &Config, -) -> Result, Error> +) -> Result, Error> where S: AsyncRead + AsyncWrite + Unpin, - T: TlsConnect, + T: TlsStream + Unpin, { - let stream = connect_tls(stream, config.ssl_mode, tls).await?; - let mut stream = StartupStream { inner: Framed::new(stream, PostgresCodec), buf: BackendMessages::empty(), diff --git a/proxy/src/compute/mod.rs b/proxy/src/compute/mod.rs index f6c58c7459..7fb88e6a45 100644 --- a/proxy/src/compute/mod.rs +++ b/proxy/src/compute/mod.rs @@ -6,7 +6,7 @@ use std::net::{IpAddr, SocketAddr}; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; -use postgres_client::config::{AuthKeys, SslMode}; +use postgres_client::config::{AuthKeys, ChannelBinding, 
SslMode}; use postgres_client::maybe_tls_stream::MaybeTlsStream; use postgres_client::tls::MakeTlsConnect; use postgres_client::{NoTls, RawCancelToken, RawConnection}; @@ -129,6 +129,8 @@ pub(crate) struct AuthInfo { auth: Option, server_params: StartupMessageParams, + channel_binding: ChannelBinding, + /// Console redirect sets user and database, we shouldn't re-use those from the params. skip_db_user: bool, } @@ -152,6 +154,8 @@ impl AuthInfo { auth: pw.map(|pw| Auth::Password(pw.as_bytes().to_owned())), server_params, skip_db_user: true, + // pg-sni-router is a mitm so this would fail. + channel_binding: ChannelBinding::Disable, } } @@ -165,6 +169,7 @@ impl AuthInfo { }, server_params: StartupMessageParams::default(), skip_db_user: false, + channel_binding: ChannelBinding::Prefer, } } } @@ -187,6 +192,7 @@ impl AuthInfo { Some(Auth::Password(pw)) => config.password(pw), None => &mut config, }; + config.channel_binding(self.channel_binding); for (k, v) in self.server_params.iter() { config.set_param(k, v); } @@ -241,7 +247,9 @@ impl AuthInfo { let tmp_config = self.enrich(tmp_config); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); - let connection = tmp_config.connect_raw(&mut compute.stream, NoTls).await?; + let connection = tmp_config + .tls_and_authenticate(&mut compute.stream, NoTls) + .await?; drop(pause); let RawConnection { diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index c92ee49b8d..67dd0ab522 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -169,7 +169,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> { .dbname("db") .password("password") .ssl_mode(SslMode::Require) - .connect_raw(server, client_config.make_tls_connect()?) + .tls_and_authenticate(server, client_config.make_tls_connect()?) .await?; proxy.await? 
@@ -252,7 +252,7 @@ async fn connect_failure( .dbname("db") .password("password") .ssl_mode(SslMode::Require) - .connect_raw(server, client_config.make_tls_connect()?) + .tls_and_authenticate(server, client_config.make_tls_connect()?) .await .err() .context("client shouldn't be able to connect")?; diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 12de5cbc09..29a269208a 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -199,7 +199,7 @@ async fn handshake_tls_is_enforced_by_proxy() -> anyhow::Result<()> { .user("john_doe") .dbname("earth") .ssl_mode(SslMode::Disable) - .connect_raw(server, NoTls) + .tls_and_authenticate(server, NoTls) .await .err() // -> Option .context("client shouldn't be able to connect")?; @@ -228,7 +228,7 @@ async fn handshake_tls() -> anyhow::Result<()> { .user("john_doe") .dbname("earth") .ssl_mode(SslMode::Require) - .connect_raw(server, client_config.make_tls_connect()?) + .tls_and_authenticate(server, client_config.make_tls_connect()?) .await?; proxy.await? @@ -245,7 +245,7 @@ async fn handshake_raw() -> anyhow::Result<()> { .dbname("earth") .set_param("options", "project=generic-project-name") .ssl_mode(SslMode::Prefer) - .connect_raw(server, NoTls) + .tls_and_authenticate(server, NoTls) .await?; proxy.await? @@ -293,7 +293,7 @@ async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> { .dbname("db") .password(password) .ssl_mode(SslMode::Require) - .connect_raw(server, client_config.make_tls_connect()?) + .tls_and_authenticate(server, client_config.make_tls_connect()?) .await?; proxy.await? @@ -317,7 +317,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> { .dbname("db") .password("password") .ssl_mode(SslMode::Require) - .connect_raw(server, client_config.make_tls_connect()?) + .tls_and_authenticate(server, client_config.make_tls_connect()?) .await?; proxy.await? 
@@ -344,7 +344,7 @@ async fn scram_auth_mock() -> anyhow::Result<()> { .dbname("db") .password(&password) // no password will match the mocked secret .ssl_mode(SslMode::Require) - .connect_raw(server, client_config.make_tls_connect()?) + .tls_and_authenticate(server, client_config.make_tls_connect()?) .await .err() // -> Option .context("client shouldn't be able to connect")?; From 517a3d0d86303ee3084ffd99ffac7042e2eca5c2 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 25 Jun 2025 15:19:20 +0100 Subject: [PATCH 16/50] [proxy]: BatchQueue::call is not cancel safe - make it directly cancellation aware (#12345) ## Problem https://github.com/neondatabase/cloud/issues/30539 If the current leader cancels the `call` function, then it has removed the jobs from the queue, but will never finish sending the responses. Because of this, it is not cancellation safe. ## Summary of changes Document these functions as not cancellation safe. Move cancellation of the queued jobs into the queue itself. ## Alternatives considered 1. We could spawn the task that runs the batch, since that won't get cancelled. * This requires `fn call(self: Arc)` or `fn call(&'static self)`. 2. We could add another scopeguard and return the requests back to the queue. * This requires that requests are always retry safe, and also requires requests to be `Clone`. --- proxy/src/batch.rs | 80 +++++++++++++++++++++++++----------- proxy/src/cancellation.rs | 85 +++++++++++++++++++++++---------------- proxy/src/redis/keys.rs | 3 +- 3 files changed, 109 insertions(+), 59 deletions(-) diff --git a/proxy/src/batch.rs b/proxy/src/batch.rs index 61bdf2b747..33e08797f2 100644 --- a/proxy/src/batch.rs +++ b/proxy/src/batch.rs @@ -6,7 +6,6 @@ use std::collections::BTreeMap; use std::pin::pin; use std::sync::Mutex; -use futures::future::Either; use scopeguard::ScopeGuard; use tokio::sync::oneshot::error::TryRecvError; @@ -49,37 +48,67 @@ impl BatchQueue

{ } } - pub async fn call(&self, req: P::Req) -> P::Res { + /// Perform a single request-response process, this may be batched internally. + /// + /// This function is not cancel safe. + pub async fn call( + &self, + req: P::Req, + cancelled: impl Future, + ) -> Result { let (id, mut rx) = self.inner.lock_propagate_poison().register_job(req); - let guard = scopeguard::guard(id, move |id| { - let mut inner = self.inner.lock_propagate_poison(); - if inner.queue.remove(&id).is_some() { - tracing::debug!("batched task cancelled before completion"); - } - }); + let mut cancelled = pin!(cancelled); let resp = loop { // try become the leader, or try wait for success. - let mut processor = match futures::future::select(rx, pin!(self.processor.lock())).await - { - // we got the resp. - Either::Left((resp, _)) => break resp.ok(), - // we are the leader. - Either::Right((p, rx_)) => { - rx = rx_; - p - } + let mut processor = tokio::select! { + // try become leader. + p = self.processor.lock() => p, + // wait for success. + resp = &mut rx => break resp.ok(), + // wait for cancellation. + cancel = cancelled.as_mut() => { + let mut inner = self.inner.lock_propagate_poison(); + if inner.queue.remove(&id).is_some() { + tracing::warn!("batched task cancelled before completion"); + } + return Err(cancel); + }, }; + tracing::debug!(id, "batch: became leader"); let (reqs, resps) = self.inner.lock_propagate_poison().get_batch(&processor); + // snitch incase the task gets cancelled. + let cancel_safety = scopeguard::guard((), |()| { + if !std::thread::panicking() { + tracing::error!( + id, + "batch: leader cancelled, despite not being cancellation safe" + ); + } + }); + // apply a batch. + // if this is cancelled, jobs will not be completed and will panic. let values = processor.apply(reqs).await; + // good: we didn't get cancelled. 
+ ScopeGuard::into_inner(cancel_safety); + + if values.len() != resps.len() { + tracing::error!( + "batch: invalid response size, expected={}, got={}", + resps.len(), + values.len() + ); + } + // send response values. for (tx, value) in std::iter::zip(resps, values) { - // sender hung up but that's fine. - drop(tx.send(value)); + if tx.send(value).is_err() { + // receiver hung up but that's fine. + } } match rx.try_recv() { @@ -98,10 +127,9 @@ impl BatchQueue

{ } }; - // already removed. - ScopeGuard::into_inner(guard); + tracing::debug!(id, "batch: job completed"); - resp.expect("no response found. batch processer should not panic") + Ok(resp.expect("no response found. batch processer should not panic")) } } @@ -125,6 +153,8 @@ impl BatchQueueInner

{ self.queue.insert(id, BatchJob { req, res: tx }); + tracing::debug!(id, "batch: registered job in the queue"); + (id, rx) } @@ -132,15 +162,19 @@ impl BatchQueueInner

{ let batch_size = p.batch_size(self.queue.len()); let mut reqs = Vec::with_capacity(batch_size); let mut resps = Vec::with_capacity(batch_size); + let mut ids = Vec::with_capacity(batch_size); while reqs.len() < batch_size { - let Some((_, job)) = self.queue.pop_first() else { + let Some((id, job)) = self.queue.pop_first() else { break; }; reqs.push(job.req); resps.push(job.res); + ids.push(id); } + tracing::debug!(ids=?ids, "batch: acquired jobs"); + (reqs, resps) } } diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 036f36c7f6..ffc0cf43f1 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -1,5 +1,6 @@ use std::convert::Infallible; use std::net::{IpAddr, SocketAddr}; +use std::pin::pin; use std::sync::{Arc, OnceLock}; use std::time::Duration; @@ -98,7 +99,6 @@ impl Pipeline { impl CancelKeyOp { fn register(&self, pipe: &mut Pipeline) { - #[allow(clippy::used_underscore_binding)] match self { CancelKeyOp::StoreCancelKey { key, value, expire } => { let key = KeyPrefix::Cancel(*key).build_redis_key(); @@ -224,6 +224,7 @@ impl CancellationHandler { } } + /// This is not cancel safe async fn get_cancel_key( &self, key: CancelKeyData, @@ -240,16 +241,21 @@ impl CancellationHandler { }; const TIMEOUT: Duration = Duration::from_secs(5); - let result = timeout(TIMEOUT, tx.call((guard, op))) - .await - .map_err(|_| { - tracing::warn!("timed out waiting to receive GetCancelData response"); - CancelError::RateLimit - })? - .map_err(|e| { - tracing::warn!("failed to receive GetCancelData response: {e}"); - CancelError::InternalError - })?; + let result = timeout( + TIMEOUT, + tx.call((guard, op), std::future::pending::()), + ) + .await + .map_err(|_| { + tracing::warn!("timed out waiting to receive GetCancelData response"); + CancelError::RateLimit + })? 
+ // cannot be cancelled + .unwrap_or_else(|x| match x {}) + .map_err(|e| { + tracing::warn!("failed to receive GetCancelData response: {e}"); + CancelError::InternalError + })?; let cancel_state_str = String::from_owned_redis_value(result).map_err(|e| { tracing::warn!("failed to receive GetCancelData response: {e}"); @@ -271,6 +277,8 @@ impl CancellationHandler { /// Will fetch IP allowlist internally. /// /// return Result primarily for tests + /// + /// This is not cancel safe pub(crate) async fn cancel_session( &self, key: CancelKeyData, @@ -394,6 +402,8 @@ impl Session { /// Ensure the cancel key is continously refreshed, /// but stop when the channel is dropped. + /// + /// This is not cancel safe pub(crate) async fn maintain_cancel_key( &self, session_id: uuid::Uuid, @@ -401,27 +411,6 @@ impl Session { cancel_closure: &CancelClosure, compute_config: &ComputeConfig, ) { - futures::future::select( - std::pin::pin!(self.maintain_redis_cancel_key(cancel_closure)), - cancel, - ) - .await; - - if let Err(err) = cancel_closure - .try_cancel_query(compute_config) - .boxed() - .await - { - tracing::warn!( - ?session_id, - ?err, - "could not cancel the query in the database" - ); - } - } - - // Ensure the cancel key is continously refreshed. - async fn maintain_redis_cancel_key(&self, cancel_closure: &CancelClosure) -> ! { let Some(tx) = self.cancellation_handler.tx.get() else { tracing::warn!("cancellation handler is not available"); // don't exit, as we only want to exit if cancelled externally. 
@@ -432,6 +421,8 @@ impl Session { .expect("serialising to json string should not fail") .into_boxed_str(); + let mut cancel = pin!(cancel); + loop { let guard = Metrics::get() .proxy @@ -449,9 +440,35 @@ impl Session { "registering cancellation key" ); - if tx.call((guard, op)).await.is_ok() { - tokio::time::sleep(CANCEL_KEY_REFRESH).await; + match tx.call((guard, op), cancel.as_mut()).await { + Ok(Ok(_)) => { + tracing::debug!( + src=%self.key, + dest=?cancel_closure.cancel_token, + "registered cancellation key" + ); + + // wait before continuing. + tokio::time::sleep(CANCEL_KEY_REFRESH).await; + } + // retry immediately. + Ok(Err(error)) => { + tracing::warn!(?error, "error registering cancellation key"); + } + Err(Err(_cancelled)) => break, } } + + if let Err(err) = cancel_closure + .try_cancel_query(compute_config) + .boxed() + .await + { + tracing::warn!( + ?session_id, + ?err, + "could not cancel the query in the database" + ); + } } } diff --git a/proxy/src/redis/keys.rs b/proxy/src/redis/keys.rs index b453e6851c..ffb7bc876b 100644 --- a/proxy/src/redis/keys.rs +++ b/proxy/src/redis/keys.rs @@ -23,9 +23,8 @@ impl KeyPrefix { #[cfg(test)] mod tests { - use crate::pqproto::id_to_cancel_key; - use super::*; + use crate::pqproto::id_to_cancel_key; #[test] fn test_build_redis_key() { From 6c77638ea15be150a128ce3d09823dfaafb966fc Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 25 Jun 2025 10:58:18 -0400 Subject: [PATCH 17/50] feat(storcon): retrieve feature flag and pass to pageservers (#12324) ## Problem part of https://github.com/neondatabase/neon/issues/11813 ## Summary of changes It costs $$$ to directly retrieve the feature flags from the pageserver. Therefore, this patch adds new APIs to retrieve the spec from the storcon and updates it via pageserver. * Storcon retrieves the feature flag and send it to the pageservers. 
* If the feature flag gets updated outside of the normal refresh loop of the pageserver, pageserver won't fetch the flags on its own as long as the last updated time <= refresh_period. Signed-off-by: Alex Chi Z --- Cargo.lock | 2 + control_plane/src/local_env.rs | 4 + control_plane/src/storage_controller.rs | 17 ++- libs/pageserver_api/src/config.rs | 7 +- .../src/background_loop.rs | 61 ++++++--- libs/posthog_client_lite/src/lib.rs | 30 +++-- pageserver/client/src/mgmt_api.rs | 9 ++ pageserver/src/feature_resolver.rs | 7 ++ pageserver/src/http/routes.rs | 17 +++ storage_controller/Cargo.toml | 2 + storage_controller/src/main.rs | 40 ++++++ storage_controller/src/pageserver_client.rs | 9 ++ storage_controller/src/service.rs | 5 + .../src/service/feature_flag.rs | 117 ++++++++++++++++++ 14 files changed, 294 insertions(+), 33 deletions(-) create mode 100644 storage_controller/src/service/feature_flag.rs diff --git a/Cargo.lock b/Cargo.lock index 51724da061..1fee728d9c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6815,6 +6815,7 @@ dependencies = [ "hex", "http-utils", "humantime", + "humantime-serde", "hyper 0.14.30", "itertools 0.10.5", "json-structural-diff", @@ -6825,6 +6826,7 @@ dependencies = [ "pageserver_api", "pageserver_client", "postgres_connection", + "posthog_client_lite", "rand 0.8.5", "regex", "reqwest", diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 370921a85c..16cd2d8c08 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -12,6 +12,7 @@ use std::{env, fs}; use anyhow::{Context, bail}; use clap::ValueEnum; +use pageserver_api::config::PostHogConfig; use pem::Pem; use postgres_backend::AuthType; use reqwest::{Certificate, Url}; @@ -213,6 +214,8 @@ pub struct NeonStorageControllerConf { pub timeline_safekeeper_count: Option, + pub posthog_config: Option, + pub kick_secondary_downloads: Option, } @@ -245,6 +248,7 @@ impl Default for NeonStorageControllerConf { use_https_safekeeper_api: 
false, use_local_compute_notifications: true, timeline_safekeeper_count: None, + posthog_config: None, kick_secondary_downloads: None, } } diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 218b17d88d..dea7ae2ccf 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -642,6 +642,18 @@ impl StorageController { args.push(format!("--timeline-safekeeper-count={sk_cnt}")); } + let mut envs = vec![ + ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ]; + + if let Some(posthog_config) = &self.config.posthog_config { + envs.push(( + "POSTHOG_CONFIG".to_string(), + serde_json::to_string(posthog_config)?, + )); + } + println!("Starting storage controller"); background_process::start_process( @@ -649,10 +661,7 @@ impl StorageController { &instance_dir, &self.env.storage_controller_bin(), args, - vec![ - ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), - ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), - ], + envs, background_process::InitialPidFile::Create(self.pid_file(start_args.instance_id)), &start_args.start_timeout, || async { diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index cfb1190a27..76730c9ee6 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -63,7 +63,8 @@ impl Display for NodeMetadata { } } -/// PostHog integration config. +/// PostHog integration config. This is used in pageserver, storcon, and neon_local. +/// Ensure backward compatibility when adding new fields. #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub struct PostHogConfig { /// PostHog project ID @@ -76,7 +77,9 @@ pub struct PostHogConfig { pub private_api_url: String, /// Public API URL pub public_api_url: String, - /// Refresh interval for the feature flag spec + /// Refresh interval for the feature flag spec. 
+ /// The storcon will push the feature flag spec to the pageserver. If the pageserver does not receive + /// the spec for `refresh_interval`, it will fetch the spec from the PostHog API. #[serde(skip_serializing_if = "Option::is_none")] #[serde(with = "humantime_serde")] pub refresh_interval: Option, diff --git a/libs/posthog_client_lite/src/background_loop.rs b/libs/posthog_client_lite/src/background_loop.rs index dc813ccb4a..08cb0d2264 100644 --- a/libs/posthog_client_lite/src/background_loop.rs +++ b/libs/posthog_client_lite/src/background_loop.rs @@ -1,17 +1,22 @@ //! A background loop that fetches feature flags from PostHog and updates the feature store. -use std::{sync::Arc, time::Duration}; +use std::{ + sync::Arc, + time::{Duration, SystemTime}, +}; use arc_swap::ArcSwap; use tokio_util::sync::CancellationToken; use tracing::{Instrument, info_span}; -use crate::{CaptureEvent, FeatureStore, PostHogClient, PostHogClientConfig}; +use crate::{ + CaptureEvent, FeatureStore, LocalEvaluationResponse, PostHogClient, PostHogClientConfig, +}; /// A background loop that fetches feature flags from PostHog and updates the feature store. pub struct FeatureResolverBackgroundLoop { posthog_client: PostHogClient, - feature_store: ArcSwap, + feature_store: ArcSwap<(SystemTime, Arc)>, cancel: CancellationToken, } @@ -19,11 +24,35 @@ impl FeatureResolverBackgroundLoop { pub fn new(config: PostHogClientConfig, shutdown_pageserver: CancellationToken) -> Self { Self { posthog_client: PostHogClient::new(config), - feature_store: ArcSwap::new(Arc::new(FeatureStore::new())), + feature_store: ArcSwap::new(Arc::new(( + SystemTime::UNIX_EPOCH, + Arc::new(FeatureStore::new()), + ))), cancel: shutdown_pageserver, } } + /// Update the feature store with a new feature flag spec bypassing the normal refresh loop. 
+ pub fn update(&self, spec: String) -> anyhow::Result<()> { + let resp: LocalEvaluationResponse = serde_json::from_str(&spec)?; + self.update_feature_store_nofail(resp, "http_propagate"); + Ok(()) + } + + fn update_feature_store_nofail(&self, resp: LocalEvaluationResponse, source: &'static str) { + let project_id = self.posthog_client.config.project_id.parse::().ok(); + match FeatureStore::new_with_flags(resp.flags, project_id) { + Ok(feature_store) => { + self.feature_store + .store(Arc::new((SystemTime::now(), Arc::new(feature_store)))); + tracing::info!("Feature flag updated from {}", source); + } + Err(e) => { + tracing::warn!("Cannot process feature flag spec from {}: {}", source, e); + } + } + } + pub fn spawn( self: Arc, handle: &tokio::runtime::Handle, @@ -47,6 +76,17 @@ impl FeatureResolverBackgroundLoop { _ = ticker.tick() => {} _ = cancel.cancelled() => break } + { + let last_update = this.feature_store.load().0; + if let Ok(elapsed) = last_update.elapsed() { + if elapsed < refresh_period { + tracing::debug!( + "Skipping feature flag refresh because it's too soon" + ); + continue; + } + } + } let resp = match this .posthog_client .get_feature_flags_local_evaluation() @@ -58,16 +98,7 @@ impl FeatureResolverBackgroundLoop { continue; } }; - let project_id = this.posthog_client.config.project_id.parse::().ok(); - match FeatureStore::new_with_flags(resp.flags, project_id) { - Ok(feature_store) => { - this.feature_store.store(Arc::new(feature_store)); - tracing::info!("Feature flag updated"); - } - Err(e) => { - tracing::warn!("Cannot process feature flag spec: {}", e); - } - } + this.update_feature_store_nofail(resp, "refresh_loop"); } tracing::info!("PostHog feature resolver stopped"); } @@ -92,6 +123,6 @@ impl FeatureResolverBackgroundLoop { } pub fn feature_store(&self) -> Arc { - self.feature_store.load_full() + self.feature_store.load().1.clone() } } diff --git a/libs/posthog_client_lite/src/lib.rs b/libs/posthog_client_lite/src/lib.rs index 
f21047bcfc..d042ee2410 100644 --- a/libs/posthog_client_lite/src/lib.rs +++ b/libs/posthog_client_lite/src/lib.rs @@ -544,17 +544,8 @@ impl PostHogClient { self.config.server_api_key.starts_with("phs_") } - /// Fetch the feature flag specs from the server. - /// - /// This is unfortunately an undocumented API at: - /// - - /// - - /// - /// The handling logic in [`FeatureStore`] mostly follows the Python API implementation. - /// See `_compute_flag_locally` in - pub async fn get_feature_flags_local_evaluation( - &self, - ) -> anyhow::Result { + /// Get the raw JSON spec, same as `get_feature_flags_local_evaluation` but without parsing. + pub async fn get_feature_flags_local_evaluation_raw(&self) -> anyhow::Result { // BASE_URL/api/projects/:project_id/feature_flags/local_evaluation // with bearer token of self.server_api_key // OR @@ -588,7 +579,22 @@ impl PostHogClient { body )); } - Ok(serde_json::from_str(&body)?) + Ok(body) + } + + /// Fetch the feature flag specs from the server. + /// + /// This is unfortunately an undocumented API at: + /// - + /// - + /// + /// The handling logic in [`FeatureStore`] mostly follows the Python API implementation. + /// See `_compute_flag_locally` in + pub async fn get_feature_flags_local_evaluation( + &self, + ) -> Result { + let raw = self.get_feature_flags_local_evaluation_raw().await?; + Ok(serde_json::from_str(&raw)?) } /// Capture an event. This will only be used to report the feature flag usage back to PostHog, though diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 3919a6e788..af4be23b9b 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -844,4 +844,13 @@ impl Client { .await .map_err(Error::ReceiveBody) } + + pub async fn update_feature_flag_spec(&self, spec: String) -> Result<()> { + let uri = format!("{}/v1/feature_flag_spec", self.mgmt_api_endpoint); + self.request(Method::POST, uri, spec) + .await? 
+ .json() + .await + .map_err(Error::ReceiveBody) + } } diff --git a/pageserver/src/feature_resolver.rs b/pageserver/src/feature_resolver.rs index b0a68dfc4d..92a9ef2880 100644 --- a/pageserver/src/feature_resolver.rs +++ b/pageserver/src/feature_resolver.rs @@ -31,6 +31,13 @@ impl FeatureResolver { } } + pub fn update(&self, spec: String) -> anyhow::Result<()> { + if let Some(inner) = &self.inner { + inner.update(spec)?; + } + Ok(()) + } + pub fn spawn( conf: &PageServerConf, shutdown_pageserver: CancellationToken, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 3755cbda6a..aa9bec657c 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -3743,6 +3743,20 @@ async fn force_override_feature_flag_for_testing_delete( json_response(StatusCode::OK, ()) } +async fn update_feature_flag_spec( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + check_permission(&request, None)?; + let body = json_request(&mut request).await?; + let state = get_state(&request); + state + .feature_resolver + .update(body) + .map_err(ApiError::InternalServerError)?; + json_response(StatusCode::OK, ()) +} + /// Common functionality of all the HTTP API handlers. 
/// /// - Adds a tracing span to each request (by `request_span`) @@ -4128,5 +4142,8 @@ pub fn make_router( .delete("/v1/feature_flag/:flag_key", |r| { testing_api_handler("force override feature flag - delete", r, force_override_feature_flag_for_testing_delete) }) + .post("/v1/feature_flag_spec", |r| { + api_handler(r, update_feature_flag_spec) + }) .any(handler_404)) } diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index c41e174d9d..3a0806b3b2 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -27,6 +27,7 @@ governor.workspace = true hex.workspace = true hyper0.workspace = true humantime.workspace = true +humantime-serde.workspace = true itertools.workspace = true json-structural-diff.workspace = true lasso.workspace = true @@ -34,6 +35,7 @@ once_cell.workspace = true pageserver_api.workspace = true pageserver_client.workspace = true postgres_connection.workspace = true +posthog_client_lite.workspace = true rand.workspace = true reqwest = { workspace = true, features = ["stream"] } routerify.workspace = true diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index ff134a4ebc..296a98e620 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -14,11 +14,13 @@ use http_utils::tls_certs::ReloadingCertificateResolver; use hyper0::Uri; use metrics::BuildInfo; use metrics::launch_timestamp::LaunchTimestamp; +use pageserver_api::config::PostHogConfig; use reqwest::Certificate; use storage_controller::http::make_router; use storage_controller::metrics::preinitialize_metrics; use storage_controller::persistence::Persistence; use storage_controller::service::chaos_injector::ChaosInjector; +use storage_controller::service::feature_flag::FeatureFlagService; use storage_controller::service::{ Config, HEARTBEAT_INTERVAL_DEFAULT, LONG_RECONCILE_THRESHOLD_DEFAULT, MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT, @@ -252,6 +254,8 @@ struct Secrets { 
peer_jwt_token: Option, } +const POSTHOG_CONFIG_ENV: &str = "POSTHOG_CONFIG"; + impl Secrets { const DATABASE_URL_ENV: &'static str = "DATABASE_URL"; const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN"; @@ -409,6 +413,18 @@ async fn async_main() -> anyhow::Result<()> { None => Vec::new(), }; + let posthog_config = if let Ok(json) = std::env::var(POSTHOG_CONFIG_ENV) { + let res: Result = serde_json::from_str(&json); + if let Ok(config) = res { + Some(config) + } else { + tracing::warn!("Invalid posthog config: {json}"); + None + } + } else { + None + }; + let config = Config { pageserver_jwt_token: secrets.pageserver_jwt_token, safekeeper_jwt_token: secrets.safekeeper_jwt_token, @@ -455,6 +471,7 @@ async fn async_main() -> anyhow::Result<()> { timelines_onto_safekeepers: args.timelines_onto_safekeepers, use_local_compute_notifications: args.use_local_compute_notifications, timeline_safekeeper_count: args.timeline_safekeeper_count, + posthog_config: posthog_config.clone(), #[cfg(feature = "testing")] kick_secondary_downloads: args.kick_secondary_downloads, }; @@ -537,6 +554,23 @@ async fn async_main() -> anyhow::Result<()> { ) }); + let feature_flag_task = if let Some(posthog_config) = posthog_config { + let service = service.clone(); + let cancel = CancellationToken::new(); + let cancel_bg = cancel.clone(); + let task = tokio::task::spawn( + async move { + let feature_flag_service = FeatureFlagService::new(service, posthog_config); + let feature_flag_service = Arc::new(feature_flag_service); + feature_flag_service.run(cancel_bg).await + } + .instrument(tracing::info_span!("feature_flag_service")), + ); + Some((task, cancel)) + } else { + None + }; + // Wait until we receive a signal let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?; let mut sigquit = tokio::signal::unix::signal(SignalKind::quit())?; @@ -584,6 +618,12 @@ async fn async_main() -> anyhow::Result<()> { chaos_jh.await.ok(); } + // If we were running the feature flag 
service, stop that so that we're not calling into Service while it shuts down + if let Some((feature_flag_task, feature_flag_cancel)) = feature_flag_task { + feature_flag_cancel.cancel(); + feature_flag_task.await.ok(); + } + service.shutdown().await; tracing::info!("Service shutdown complete"); diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index 817409e112..d6fe173eb3 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -376,4 +376,13 @@ impl PageserverClient { .await ) } + + pub(crate) async fn update_feature_flag_spec(&self, spec: String) -> Result<()> { + measured_request!( + "update_feature_flag_spec", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner.update_feature_flag_spec(spec).await + ) + } } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 8424c27cf8..b4dfd01249 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1,5 +1,6 @@ pub mod chaos_injector; mod context_iterator; +pub mod feature_flag; pub(crate) mod safekeeper_reconciler; mod safekeeper_service; @@ -25,6 +26,7 @@ use futures::stream::FuturesUnordered; use http_utils::error::ApiError; use hyper::Uri; use itertools::Itertools; +use pageserver_api::config::PostHogConfig; use pageserver_api::controller_api::{ AvailabilityZone, MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, @@ -471,6 +473,9 @@ pub struct Config { /// Safekeepers will be choosen from different availability zones. 
pub timeline_safekeeper_count: i64, + /// PostHog integration config + pub posthog_config: Option, + #[cfg(feature = "testing")] pub kick_secondary_downloads: bool, } diff --git a/storage_controller/src/service/feature_flag.rs b/storage_controller/src/service/feature_flag.rs new file mode 100644 index 0000000000..645eb75237 --- /dev/null +++ b/storage_controller/src/service/feature_flag.rs @@ -0,0 +1,117 @@ +use std::{sync::Arc, time::Duration}; + +use futures::StreamExt; +use pageserver_api::config::PostHogConfig; +use pageserver_client::mgmt_api; +use posthog_client_lite::{PostHogClient, PostHogClientConfig}; +use reqwest::StatusCode; +use tokio::time::MissedTickBehavior; +use tokio_util::sync::CancellationToken; + +use crate::{pageserver_client::PageserverClient, service::Service}; + +pub struct FeatureFlagService { + service: Arc, + config: PostHogConfig, + client: PostHogClient, + http_client: reqwest::Client, +} + +const DEFAULT_POSTHOG_REFRESH_INTERVAL: Duration = Duration::from_secs(30); + +impl FeatureFlagService { + pub fn new(service: Arc, config: PostHogConfig) -> Self { + let client = PostHogClient::new(PostHogClientConfig { + project_id: config.project_id.clone(), + server_api_key: config.server_api_key.clone(), + client_api_key: config.client_api_key.clone(), + private_api_url: config.private_api_url.clone(), + public_api_url: config.public_api_url.clone(), + }); + Self { + service, + config, + client, + http_client: reqwest::Client::new(), + } + } + + async fn refresh(self: Arc, cancel: CancellationToken) -> Result<(), anyhow::Error> { + let nodes = { + let inner = self.service.inner.read().unwrap(); + inner.nodes.clone() + }; + + let feature_flag_spec = self.client.get_feature_flags_local_evaluation_raw().await?; + let stream = futures::stream::iter(nodes.values().cloned()).map(|node| { + let this = self.clone(); + let feature_flag_spec = feature_flag_spec.clone(); + async move { + let res = async { + let client = PageserverClient::new( + 
node.get_id(), + this.http_client.clone(), + node.base_url(), + // TODO: what if we rotate the token during storcon lifetime? + this.service.config.pageserver_jwt_token.as_deref(), + ); + + client.update_feature_flag_spec(feature_flag_spec).await?; + tracing::info!( + "Updated {}({}) with feature flag spec", + node.get_id(), + node.base_url() + ); + Ok::<_, mgmt_api::Error>(()) + }; + + if let Err(e) = res.await { + if let mgmt_api::Error::ApiError(status, _) = e { + if status == StatusCode::NOT_FOUND { + // This is expected during deployments where the API is not available, so we can ignore it + return; + } + } + tracing::warn!( + "Failed to update feature flag spec for {}: {e}", + node.get_id() + ); + } + } + }); + let mut stream = stream.buffer_unordered(8); + + while stream.next().await.is_some() { + if cancel.is_cancelled() { + return Ok(()); + } + } + + Ok(()) + } + + pub async fn run(self: Arc, cancel: CancellationToken) { + let refresh_interval = self + .config + .refresh_interval + .unwrap_or(DEFAULT_POSTHOG_REFRESH_INTERVAL); + let mut interval = tokio::time::interval(refresh_interval); + interval.set_missed_tick_behavior(MissedTickBehavior::Skip); + tracing::info!( + "Starting feature flag service with refresh interval: {:?}", + refresh_interval + ); + loop { + tokio::select! { + _ = interval.tick() => {} + _ = cancel.cancelled() => { + break; + } + } + let res = self.clone().refresh(cancel.clone()).await; + if let Err(e) = res { + tracing::error!("Failed to refresh feature flags: {e:#?}"); + } + } + } +} From 1d49eefbbba840b176e1ea57285e6fb3cdf851e7 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Wed, 25 Jun 2025 18:25:57 +0200 Subject: [PATCH 18/50] RFC: Endpoint Persistent Unlogged Files Storage (#9661) ## Summary A design for a storage system that allows storage of files required to make Neon's Endpoints have a better experience at or after a reboot. 
## Motivation Several systems inside PostgreSQL (and Neon) need some persistent storage for optimal workings across reboots and restarts, but still work without. Examples are the cumulative statistics file in `pg_stat/global.stat`, `pg_stat_statements`' `pg_stat/pg_stat_statements.stat`, and `pg_prewarm`'s `autoprewarm.blocks`. We need a storage system that can store and manage these files for each Endpoint. [GH rendered file](https://github.com/neondatabase/neon/blob/MMeent/rfc-unlogged-file/docs/rfcs/040-Endpoint-Persistent-Unlogged-Files-Storage.md) Part of https://github.com/neondatabase/cloud/issues/24225 --- ...point-Persistent-Unlogged-Files-Storage.md | 396 ++++++++++++++++++ 1 file changed, 396 insertions(+) create mode 100644 docs/rfcs/040-Endpoint-Persistent-Unlogged-Files-Storage.md diff --git a/docs/rfcs/040-Endpoint-Persistent-Unlogged-Files-Storage.md b/docs/rfcs/040-Endpoint-Persistent-Unlogged-Files-Storage.md new file mode 100644 index 0000000000..182b2682af --- /dev/null +++ b/docs/rfcs/040-Endpoint-Persistent-Unlogged-Files-Storage.md @@ -0,0 +1,396 @@ +# Memo: Endpoint Persistent Unlogged Files Storage +Created on 2024-11-05 +Implemented on N/A + +## Summary +A design for a storage system that allows storage of files required to make +Neon's Endpoints have a better experience at or after a reboot. + +## Motivation +Several systems inside PostgreSQL (and Neon) need some persistent storage for +optimal workings across reboots and restarts, but still work without. +Examples are the query-level statistics files of `pg_stat_statements` in +`pg_stat/pg_stat_statements.stat`, and `pg_prewarm`'s `autoprewarm.blocks`. +We need a storage system that can store and manage these files for each +Endpoint, without necessarily granting users access to an unlimited storage +device. + +## Goals +- Store known files for Endpoints with reasonable persistence. 
+ _Data loss in this service, while annoying and bad for UX, won't lose any + customer's data._ + +## Non Goals (if relevant) +- This storage system does not need branching, file versioning, or other such + features. The files are as ephemeral to the timeline of the data as the + Endpoints that host the data. +- This storage system does not need to store _all_ user files, only 'known' + user files. +- This storage system does not need to be hosted fully inside Computes. + _Instead, this will be a separate component similar to Pageserver, + SafeKeeper, the S3 proxy used for dynamically loaded extensions, etc._ + +## Impacted components +- Compute needs new code to load and store these files in its lifetime. +- Control Plane needs to consider this new storage system when signalling + the deletion of an Endpoint, Timeline, or Tenant. +- Control Plane needs to consider this new storage system when it resets + or re-assigns an endpoint's timeline/branch state. + +A new service is created: the Endpoint Persistent Unlogged Files Storage +service. This could be integrated in e.g. Pageserver or Control Plane, or a +separately hosted service. + +## Proposed implementation +Endpoint-related data files are managed by a newly designed service (which +optionally is integrated in an existing service like Pageserver or Control +Plane), which stores data directly into S3 or any blob storage of choice. + +Upon deletion of the Endpoint, or reassignment of the endpoint to a different +branch, this ephemeral data is dropped: the data stored may not match the +state of the branch's data after reassignment, and on endpoint deletion the +data won't have any use to the user. + +Compute gets credentials (JWT token with Tenant, Timeline & Endpoint claims) +which it can use to authenticate to this new service and retrieve and store +data associated with this endpoint. 
This limited scope reduces leaks of data +across endpoints and timeline resets, and limits the ability of endpoints to +mess with other endpoints' data. + +The path of this endpoint data in S3 is initially as follows: + + s3:/// + tenants/ + / + tenants/ + / + endpoints/ + / + pgdata/ + + +For other blob storages an equivalent or similar path can be constructed. + +### Reliability, failure modes and corner cases (if relevant) +Reliability is important, but not critical to the workings of Neon. The data +stored in this service will, when lost, reduce performance, but won't be a +cause of permanent data loss - only operational metadata is stored. + +Most, if not all, blob storage services have sufficiently high persistence +guarantees to cater our need for persistence and uptime. The only concern with +blob storages is that the access latency is generally higher than local disk, +but for the object types stored (cache state, ...) I don't think this will be +much of an issue. + +### Interaction/Sequence diagram (if relevant) + +In these diagrams you can replace S3 with any persistent storage device of +choice, but S3 is chosen as representative name: The well-known and short name +of AWS' blob storage. Azure Blob Storage should work too, but it has a much +longer name making it less practical for the diagrams. + +Write data: + +```http +POST /tenants//timelines//endpoints//pgdata/ +Host: epufs.svc.neon.local + +<<< + +200 OK +{ + "version": "", # opaque file version token, changes when the file contents change + "size": , +} +``` + +```mermaid +sequenceDiagram + autonumber + participant co as Compute + participant ep as EPUFS + participant s3 as Blob Storage + + co-->ep: Connect with credentials + co->>+ep: Store Unlogged Persistent File + opt is authenticated + ep->>s3: Write UPF to S3 + end + ep->>-co: OK / Failure / Auth Failure + co-->ep: Cancel connection +``` + +Read data: (optional with cache-relevant request parameters, e.g. 
If-Modified-Since) +```http +GET /tenants//timelines//endpoints//pgdata/ +Host: epufs.svc.neon.local + +<<< + +200 OK + + +``` + +```mermaid +sequenceDiagram + autonumber + participant co as Compute + participant ep as EPUFS + participant s3 as Blob Storage + + co->>+ep: Read Unlogged Persistent File + opt is authenticated + ep->>+s3: Request UPF from storage + s3->>-ep: Receive UPF from storage + end + ep->>-co: OK(response) / Failure(storage, auth, ...) +``` + +Compute Startup: +```mermaid +sequenceDiagram + autonumber + participant co as Compute + participant ps as Pageserver + participant ep as EPUFS + participant es as Extension server + + note over co: Bind endpoint ep-xxx + par Get basebackup + co->>+ps: Request basebackup @ LSN + ps-)ps: Construct basebackup + ps->>-co: Receive basebackup TAR @ LSN + and Get startup-critical Unlogged Persistent Files + co->>+ep: Get all UPFs of endpoint ep-xxx + ep-)ep: Retrieve and gather all UPFs + ep->>-co: TAR of UPFs + and Get startup-critical extensions + loop For every startup-critical extension + co->>es: Get critical extension + es->>co: Receive critical extension + end + end + note over co: Start compute +``` + +CPlane ops: +```http +DELETE /tenants//timelines//endpoints/ +Host: epufs.svc.neon.local + +<<< + +200 OK +{ + "tenant": "", + "timeline": "", + "endpoint": "", + "deleted": { + "files": , + "bytes": , + }, +} +``` + +```http +DELETE /tenants//timelines/ +Host: epufs.svc.neon.local + +<<< + +200 OK +{ + "tenant": "", + "timeline": "", + "deleted": { + "files": , + "bytes": , + }, +} +``` + +```http +DELETE /tenants/ +Host: epufs.svc.neon.local + +<<< + +200 OK +{ + "tenant": "", + "deleted": { + "files": , + "bytes": , + }, +} +``` + +```mermaid +sequenceDiagram + autonumber + participant cp as Control Plane + participant ep as EPUFS + participant s3 as Blob Storage + + alt Tenant deleted + cp-)ep: Tenant deleted + loop For every object associated with removed tenant + ep->>s3: Remove data of deleted 
tenant from Storage + end + opt + ep-)cp: Tenant cleanup complete + end + alt Timeline deleted + cp-)ep: Timeline deleted + loop For every object associated with removed timeline + ep->>s3: Remove data of deleted timeline from Storage + end + opt + ep-)cp: Timeline cleanup complete + end + else Endpoint reassigned or removed + cp->>+ep: Endpoint reassigned + loop For every object associated with reassigned/removed endpoint + ep->>s3: Remove data from Storage + end + ep->>-cp: Cleanup complete + end +``` + +### Scalability (if relevant) + +Provisionally: As this service is going to be part of compute startup, this +service should be able to quickly respond to all requests. Therefore this +service is deployed to every AZ we host Computes in, and Computes communicate +(generally) only to the EPUFS endpoint of the AZ they're hosted in. + +Local caching of frequently restarted endpoints' data or metadata may be +needed for best performance. However, due to the regional nature of stored +data but zonal nature of the service deployment, we should be careful when we +implement any local caching, as it is possible that computes in AZ 1 will +update data originally written and thus cached by AZ 2. Cache version tests +and invalidation is therefore required if we want to roll out caching to this +service, which is too broad a scope for an MVC. This is why caching is left +out of scope for this RFC, and should be considered separately after this RFC +is implemented. + +### Security implications (if relevant) +This service must be able to authenticate users at least by Tenant ID, +Timeline ID and Endpoint ID. This will use the existing JWT infrastructure of +Compute, which will be upgraded to the extent needed to support Timeline- and +Endpoint-based claims. + +The service requires unlimited access to (a prefix of) a blob storage bucket, +and thus must be hosted outside the Compute VM sandbox. 
+ +A service that generates pre-signed request URLs for Compute to download the +data from that URL is likely problematic, too: Compute would be able to write +unlimited data to the bucket, or exfiltrate this signed URL to get read/write +access to specific objects in this bucket, which would still effectively give +users access to the S3 bucket (but with improved access logging). + +There may be a use case for transferring data associated with one endpoint to +another endpoint (e.g. to make one endpoint warm its caches with the state of +another endpoint), but that's not currently in scope, and specific needs may +be solved through out-of-line communication of data or pre-signed URLs. + +### Unresolved questions (if relevant) +Caching of files is not in the implementation scope of the document, but +should at some future point be considered to maximize performance. + +## Alternative implementation (if relevant) +Several ideas have come up to solve this issue: + +### Use AUXfile +One prevalent idea was to WAL-log the files using our AUXfile mechanism. + +Benefits: + ++ We already have this storage mechanism + +Demerits: + +- It isn't available on read replicas +- Additional WAL will be consumed during shutdown and after the shutdown + checkpoint, which needs PG modifications to work without panics. +- It increases the data we need to manage in our versioned storage, thus + causing higher storage costs with higher retention due to duplication at + the storage layer. 
+ +### Sign URLs for read/write operations, instead of proxying them + +Benefits: + ++ The service can be implemented with a much reduced IO budget + +Demerits: + +- Users could get access to these signed credentials +- Not all blob storage services may implement URL signing + +### Give endpoints each their own directly accessed block volume + +Benefits: + ++ Easier to integrate for PostgreSQL + +Demerits: + +- Little control on data size and contents +- Potentially problematic as we'd need to store data all across the pgdata + directory. +- EBS is not a good candidate + - Attaches in 10s of seconds, if not more; i.e. too cold to start + - Shared EBS volumes are a no-go, as you'd have to schedule the endpoint + with users of the same EBS volumes, which can't work with VM migration + - EBS storage costs are very high (>80$/kilotenant when using a + volume/tenant) + - EBS volumes can't be mounted across AZ boundaries +- Bucket per endpoint is unfeasible + - S3 buckets are priced at $20/month per 1k, which we could better spend + on developers. + - Allocating service accounts takes time (100s of ms), and service accounts + are a limited resource, too; so they're not a good candidate to allocate + on a per-endpoint basis. + - Giving credentials limited to prefix has similar issues as the pre-signed + URL approach. + - Bucket DNS lookup will fill DNS caches and put pressure on DNS lookup + much more than our current systems would. +- Volumes bound by hypervisor are unlikely + - This requires significant investment and increased software on the + hypervisor. + - It is unclear if we can attach volumes after boot, i.e. for pooled + instances. 
+ +### Put the files into a table + +Benefits: + + + Mostly already available in PostgreSQL + +Demerits: + + - Uses WAL + - Can't be used after shutdown checkpoint + - Needs a RW endpoint, and table & catalog access to write to this data + - Gets hit with DB size limitations + - Depending on user acces: + - Inaccessible: + The user doesn't have control over database size caused by + these systems. + - Accessible: + The user can corrupt these files and cause the system to crash while + user-corrupted files are present, thus increasing on-call overhead. + +## Definition of Done (if relevant) + +This project is done if we have: + +- One S3 bucket equivalent per region, which stores this per-endpoint data. +- A new service endpoint in at least every AZ, which indirectly grants + endpoints access to the data stored for these endpoints in these buckets. +- Compute writes & reads temp-data at shutdown and startup, respectively, for + at least the pg_prewarm or lfc_prewarm state files. +- Cleanup of endpoint data is triggered when the endpoint is deleted or is + detached from its current timeline. From f755979102f4a5086253e5eaab5e48d8af7995cc Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 25 Jun 2025 20:16:23 +0200 Subject: [PATCH 19/50] pageserver: payload compression for gRPC base backups (#12346) ## Problem gRPC base backups use gRPC compression. However, this has two problems: * Base backup caching will cache compressed base backups (making gRPC compression pointless). * Tonic does not support varying the compression level, and zstd default level is 10% slower than gzip fastest level. Touches https://github.com/neondatabase/neon/issues/11728. Touches https://github.com/neondatabase/cloud/issues/29353. ## Summary of changes This patch adds a gRPC parameter `BaseBackupRequest::compression` specifying the compression algorithm. It also moves compression into `send_basebackup_tarball` to reduce code duplication. 
A follow-up PR will integrate the base backup cache with gRPC. --- pageserver/page_api/proto/page_service.proto | 13 ++++ pageserver/page_api/src/client.rs | 1 - pageserver/page_api/src/model.rs | 59 ++++++++++++++- pageserver/pagebench/src/cmd/basebackup.rs | 10 ++- pageserver/src/basebackup.rs | 77 +++++++++++++------- pageserver/src/basebackup_cache.rs | 14 +--- pageserver/src/page_service.rs | 44 +++++------ 7 files changed, 149 insertions(+), 69 deletions(-) diff --git a/pageserver/page_api/proto/page_service.proto b/pageserver/page_api/proto/page_service.proto index 81953a710f..d06b2cfca5 100644 --- a/pageserver/page_api/proto/page_service.proto +++ b/pageserver/page_api/proto/page_service.proto @@ -110,6 +110,19 @@ message GetBaseBackupRequest { bool replica = 2; // If true, include relation files in the base backup. Mainly for debugging and tests. bool full = 3; + // Compression algorithm to use. Base backups send a compressed payload instead of using gRPC + // compression, so that we can cache compressed backups on the server. + BaseBackupCompression compression = 4; +} + +// Base backup compression algorithms. +enum BaseBackupCompression { + // Unknown algorithm. Used when clients send an unsupported algorithm. + BASE_BACKUP_COMPRESSION_UNKNOWN = 0; + // No compression. + BASE_BACKUP_COMPRESSION_NONE = 1; + // GZIP compression. + BASE_BACKUP_COMPRESSION_GZIP = 2; } // Base backup response chunk, returned as an ordered stream. diff --git a/pageserver/page_api/src/client.rs b/pageserver/page_api/src/client.rs index 3977ce7c23..71d539ab91 100644 --- a/pageserver/page_api/src/client.rs +++ b/pageserver/page_api/src/client.rs @@ -95,7 +95,6 @@ impl Client { if let Some(compression) = compression { // TODO: benchmark this (including network latency). - // TODO: consider enabling compression by default. 
client = client .accept_compressed(compression) .send_compressed(compression); diff --git a/pageserver/page_api/src/model.rs b/pageserver/page_api/src/model.rs index 6efa742799..1ca89b4870 100644 --- a/pageserver/page_api/src/model.rs +++ b/pageserver/page_api/src/model.rs @@ -191,15 +191,21 @@ pub struct GetBaseBackupRequest { pub replica: bool, /// If true, include relation files in the base backup. Mainly for debugging and tests. pub full: bool, + /// Compression algorithm to use. Base backups send a compressed payload instead of using gRPC + /// compression, so that we can cache compressed backups on the server. + pub compression: BaseBackupCompression, } -impl From for GetBaseBackupRequest { - fn from(pb: proto::GetBaseBackupRequest) -> Self { - Self { +impl TryFrom for GetBaseBackupRequest { + type Error = ProtocolError; + + fn try_from(pb: proto::GetBaseBackupRequest) -> Result { + Ok(Self { lsn: (pb.lsn != 0).then_some(Lsn(pb.lsn)), replica: pb.replica, full: pb.full, - } + compression: pb.compression.try_into()?, + }) } } @@ -209,10 +215,55 @@ impl From for proto::GetBaseBackupRequest { lsn: request.lsn.unwrap_or_default().0, replica: request.replica, full: request.full, + compression: request.compression.into(), } } } +/// Base backup compression algorithm. 
+#[derive(Clone, Copy, Debug)] +pub enum BaseBackupCompression { + None, + Gzip, +} + +impl TryFrom for BaseBackupCompression { + type Error = ProtocolError; + + fn try_from(pb: proto::BaseBackupCompression) -> Result { + match pb { + proto::BaseBackupCompression::Unknown => Err(ProtocolError::invalid("compression", pb)), + proto::BaseBackupCompression::None => Ok(Self::None), + proto::BaseBackupCompression::Gzip => Ok(Self::Gzip), + } + } +} + +impl TryFrom for BaseBackupCompression { + type Error = ProtocolError; + + fn try_from(compression: i32) -> Result { + proto::BaseBackupCompression::try_from(compression) + .map_err(|_| ProtocolError::invalid("compression", compression)) + .and_then(Self::try_from) + } +} + +impl From for proto::BaseBackupCompression { + fn from(compression: BaseBackupCompression) -> Self { + match compression { + BaseBackupCompression::None => Self::None, + BaseBackupCompression::Gzip => Self::Gzip, + } + } +} + +impl From for i32 { + fn from(compression: BaseBackupCompression) -> Self { + proto::BaseBackupCompression::from(compression).into() + } +} + pub type GetBaseBackupResponseChunk = Bytes; impl TryFrom for GetBaseBackupResponseChunk { diff --git a/pageserver/pagebench/src/cmd/basebackup.rs b/pageserver/pagebench/src/cmd/basebackup.rs index e028174c1d..4111d09f92 100644 --- a/pageserver/pagebench/src/cmd/basebackup.rs +++ b/pageserver/pagebench/src/cmd/basebackup.rs @@ -317,6 +317,7 @@ impl Client for LibpqClient { /// A gRPC Pageserver client. 
struct GrpcClient { inner: page_api::Client, + compression: page_api::BaseBackupCompression, } impl GrpcClient { @@ -331,10 +332,14 @@ impl GrpcClient { ttid.timeline_id, ShardIndex::unsharded(), None, - compression.then_some(tonic::codec::CompressionEncoding::Zstd), + None, // NB: uses payload compression ) .await?; - Ok(Self { inner }) + let compression = match compression { + true => page_api::BaseBackupCompression::Gzip, + false => page_api::BaseBackupCompression::None, + }; + Ok(Self { inner, compression }) } } @@ -348,6 +353,7 @@ impl Client for GrpcClient { lsn, replica: false, full: false, + compression: self.compression, }; let stream = self.inner.get_base_backup(req).await?; Ok(Box::pin(StreamReader::new( diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 115f0d9ebc..36dada1e89 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -14,6 +14,7 @@ use std::fmt::Write as FmtWrite; use std::time::{Instant, SystemTime}; use anyhow::{Context, anyhow}; +use async_compression::tokio::write::GzipEncoder; use bytes::{BufMut, Bytes, BytesMut}; use fail::fail_point; use pageserver_api::key::{Key, rel_block_to_key}; @@ -25,8 +26,7 @@ use postgres_ffi::{ }; use postgres_ffi_types::constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; use postgres_ffi_types::forknum::{INIT_FORKNUM, MAIN_FORKNUM}; -use tokio::io; -use tokio::io::AsyncWrite; +use tokio::io::{self, AsyncWrite, AsyncWriteExt as _}; use tokio_tar::{Builder, EntryType, Header}; use tracing::*; use utils::lsn::Lsn; @@ -97,6 +97,7 @@ impl From for tonic::Status { /// * When working without safekeepers. In this situation it is important to match the lsn /// we are taking basebackup on with the lsn that is used in pageserver's walreceiver /// to start the replication. 
+#[allow(clippy::too_many_arguments)] pub async fn send_basebackup_tarball<'a, W>( write: &'a mut W, timeline: &'a Timeline, @@ -104,6 +105,7 @@ pub async fn send_basebackup_tarball<'a, W>( prev_lsn: Option, full_backup: bool, replica: bool, + gzip_level: Option, ctx: &'a RequestContext, ) -> Result<(), BasebackupError> where @@ -122,7 +124,7 @@ where // prev_lsn value; that happens if the timeline was just branched from // an old LSN and it doesn't have any WAL of its own yet. We will set // prev_lsn to Lsn(0) if we cannot provide the correct value. - let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn { + let (backup_prev, lsn) = if let Some(req_lsn) = req_lsn { // Backup was requested at a particular LSN. The caller should've // already checked that it's a valid LSN. @@ -143,7 +145,7 @@ where }; // Consolidate the derived and the provided prev_lsn values - let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn { + let prev_record_lsn = if let Some(provided_prev_lsn) = prev_lsn { if backup_prev != Lsn(0) && backup_prev != provided_prev_lsn { return Err(BasebackupError::Server(anyhow!( "backup_prev {backup_prev} != provided_prev_lsn {provided_prev_lsn}" @@ -155,30 +157,55 @@ where }; info!( - "taking basebackup lsn={}, prev_lsn={} (full_backup={}, replica={})", - backup_lsn, prev_lsn, full_backup, replica + "taking basebackup lsn={lsn}, prev_lsn={prev_record_lsn} \ + (full_backup={full_backup}, replica={replica}, gzip={gzip_level:?})", + ); + let span = info_span!("send_tarball", backup_lsn=%lsn); + + let io_concurrency = IoConcurrency::spawn_from_conf( + timeline.conf.get_vectored_concurrent_io, + timeline + .gate + .enter() + .map_err(|_| BasebackupError::Shutdown)?, ); - let basebackup = Basebackup { - ar: Builder::new_non_terminated(write), - timeline, - lsn: backup_lsn, - prev_record_lsn: prev_lsn, - full_backup, - replica, - ctx, - io_concurrency: IoConcurrency::spawn_from_conf( - timeline.conf.get_vectored_concurrent_io, - timeline - .gate - 
.enter() - .map_err(|_| BasebackupError::Shutdown)?, - ), - }; - basebackup + if let Some(gzip_level) = gzip_level { + let mut encoder = GzipEncoder::with_quality(write, gzip_level); + Basebackup { + ar: Builder::new_non_terminated(&mut encoder), + timeline, + lsn, + prev_record_lsn, + full_backup, + replica, + ctx, + io_concurrency, + } .send_tarball() - .instrument(info_span!("send_tarball", backup_lsn=%backup_lsn)) - .await + .instrument(span) + .await?; + encoder + .shutdown() + .await + .map_err(|err| BasebackupError::Client(err, "gzip"))?; + } else { + Basebackup { + ar: Builder::new_non_terminated(write), + timeline, + lsn, + prev_record_lsn, + full_backup, + replica, + ctx, + io_concurrency, + } + .send_tarball() + .instrument(span) + .await?; + } + + Ok(()) } /// This is short-living object only for the time of tarball creation, diff --git a/pageserver/src/basebackup_cache.rs b/pageserver/src/basebackup_cache.rs index 24f6413380..69438dae7f 100644 --- a/pageserver/src/basebackup_cache.rs +++ b/pageserver/src/basebackup_cache.rs @@ -1,7 +1,6 @@ use std::{collections::HashMap, sync::Arc}; use anyhow::Context; -use async_compression::tokio::write::GzipEncoder; use camino::{Utf8Path, Utf8PathBuf}; use metrics::core::{AtomicU64, GenericCounter}; use pageserver_api::{config::BasebackupCacheConfig, models::TenantState}; @@ -594,13 +593,6 @@ impl BackgroundTask { let file = tokio::fs::File::create(entry_tmp_path).await?; let mut writer = BufWriter::new(file); - let mut encoder = GzipEncoder::with_quality( - &mut writer, - // Level::Best because compression is not on the hot path of basebackup requests. - // The decompression is almost not affected by the compression level. - async_compression::Level::Best, - ); - // We may receive a request before the WAL record is applied to the timeline. // Wait for the requested LSN to be applied. 
timeline @@ -613,17 +605,19 @@ impl BackgroundTask { .await?; send_basebackup_tarball( - &mut encoder, + &mut writer, timeline, Some(req_lsn), None, false, false, + // Level::Best because compression is not on the hot path of basebackup requests. + // The decompression is almost not affected by the compression level. + Some(async_compression::Level::Best), &ctx, ) .await?; - encoder.shutdown().await?; writer.flush().await?; writer.into_inner().sync_all().await?; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index d3a1ca681e..dd02947e5c 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -13,7 +13,6 @@ use std::time::{Duration, Instant, SystemTime}; use std::{io, str}; use anyhow::{Context as _, anyhow, bail}; -use async_compression::tokio::write::GzipEncoder; use bytes::{Buf as _, BufMut as _, BytesMut}; use futures::future::BoxFuture; use futures::{FutureExt, Stream}; @@ -2613,6 +2612,7 @@ impl PageServerHandler { prev_lsn, full_backup, replica, + None, &ctx, ) .await?; @@ -2641,31 +2641,6 @@ impl PageServerHandler { .map_err(|err| { BasebackupError::Client(err, "handle_basebackup_request,cached,copy") })?; - } else if gzip { - let mut encoder = GzipEncoder::with_quality( - &mut writer, - // NOTE using fast compression because it's on the critical path - // for compute startup. For an empty database, we get - // <100KB with this method. The Level::Best compression method - // gives us <20KB, but maybe we should add basebackup caching - // on compute shutdown first. 
- async_compression::Level::Fastest, - ); - basebackup::send_basebackup_tarball( - &mut encoder, - &timeline, - lsn, - prev_lsn, - full_backup, - replica, - &ctx, - ) - .await?; - // shutdown the encoder to ensure the gzip footer is written - encoder - .shutdown() - .await - .map_err(|e| QueryError::Disconnected(ConnectionError::Io(e)))?; } else { basebackup::send_basebackup_tarball( &mut writer, @@ -2674,6 +2649,11 @@ impl PageServerHandler { prev_lsn, full_backup, replica, + // NB: using fast compression because it's on the critical path for compute + // startup. For an empty database, we get <100KB with this method. The + // Level::Best compression method gives us <20KB, but maybe we should add + // basebackup caching on compute shutdown first. + gzip.then_some(async_compression::Level::Fastest), &ctx, ) .await?; @@ -3553,7 +3533,7 @@ impl proto::PageService for GrpcPageServiceHandler { if timeline.is_archived() == Some(true) { return Err(tonic::Status::failed_precondition("timeline is archived")); } - let req: page_api::GetBaseBackupRequest = req.into_inner().into(); + let req: page_api::GetBaseBackupRequest = req.into_inner().try_into()?; span_record!(lsn=?req.lsn); @@ -3579,6 +3559,15 @@ impl proto::PageService for GrpcPageServiceHandler { let span = Span::current(); let (mut simplex_read, mut simplex_write) = tokio::io::simplex(CHUNK_SIZE); let jh = tokio::spawn(async move { + let gzip_level = match req.compression { + page_api::BaseBackupCompression::None => None, + // NB: using fast compression because it's on the critical path for compute + // startup. For an empty database, we get <100KB with this method. The + // Level::Best compression method gives us <20KB, but maybe we should add + // basebackup caching on compute shutdown first. 
+ page_api::BaseBackupCompression::Gzip => Some(async_compression::Level::Fastest), + }; + let result = basebackup::send_basebackup_tarball( &mut simplex_write, &timeline, @@ -3586,6 +3575,7 @@ impl proto::PageService for GrpcPageServiceHandler { None, req.full, req.replica, + gzip_level, &ctx, ) .instrument(span) // propagate request span From 6f70885e11f35017ba32fa2d044b271b8cd97fde Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 25 Jun 2025 18:15:03 -0400 Subject: [PATCH 20/50] fix(pageserver): allow refresh_interval to be empty (#12349) ## Problem Fix for https://github.com/neondatabase/neon/pull/12324 ## Summary of changes Need `serde(default)` to allow this field not present in the config, otherwise there will be a config deserialization error. --------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/config.rs | 1 + pageserver/src/config.rs | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 76730c9ee6..7926e839cf 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -80,6 +80,7 @@ pub struct PostHogConfig { /// Refresh interval for the feature flag spec. /// The storcon will push the feature flag spec to the pageserver. If the pageserver does not receive /// the spec for `refresh_interval`, it will fetch the spec from the PostHog API. 
+ #[serde(default)] #[serde(skip_serializing_if = "Option::is_none")] #[serde(with = "humantime_serde")] pub refresh_interval: Option, diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 9952496061..5b51a9617b 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -762,4 +762,23 @@ mod tests { let result = PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir); assert_eq!(result.is_ok(), is_valid); } + + #[test] + fn test_config_posthog_config_is_valid() { + let input = r#" + control_plane_api = "http://localhost:6666" + + [posthog_config] + server_api_key = "phs_AAA" + client_api_key = "phc_BBB" + project_id = "000" + private_api_url = "https://us.posthog.com" + public_api_url = "https://us.i.posthog.com" + "#; + let config_toml = toml_edit::de::from_str::(input) + .expect("posthogconfig is valid"); + let workdir = Utf8PathBuf::from("/nonexistent"); + PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir) + .expect("parse_and_validate"); + } } From be23eae3b622aac0a76ad95b0a6421c9870c39a6 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 26 Jun 2025 10:06:27 +0300 Subject: [PATCH 21/50] Mark pages as avaiable in LFC only after generation check (#12350) ## Problem If LFC generation is changed then `lfc_readv_select` will return -1 but pages are still marked as available in bitmap. ## Summary of changes Update bitmap after generation check. 
Co-authored-by: Konstantin Knizhnik
The redis change is the most complex, but I am pretty sure it's sound. Since https://github.com/neondatabase/cloud/pull/15613 cplane longer publishes to the global redis instance. --- proxy/src/binary/local_proxy.rs | 1 - proxy/src/binary/pg_sni_router.rs | 1 - proxy/src/binary/proxy.rs | 117 +++++++++------------------- proxy/src/config.rs | 1 - proxy/src/console_redirect_proxy.rs | 7 +- proxy/src/context/mod.rs | 12 +-- proxy/src/context/parquet.rs | 26 +++++-- proxy/src/proxy/mod.rs | 7 +- proxy/src/rate_limiter/limiter.rs | 6 -- proxy/src/redis/kv_ops.rs | 14 +--- proxy/src/redis/notifications.rs | 23 ++---- proxy/src/serverless/mod.rs | 14 +--- 12 files changed, 70 insertions(+), 159 deletions(-) diff --git a/proxy/src/binary/local_proxy.rs b/proxy/src/binary/local_proxy.rs index e3be454713..423ecf821e 100644 --- a/proxy/src/binary/local_proxy.rs +++ b/proxy/src/binary/local_proxy.rs @@ -279,7 +279,6 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig }, proxy_protocol_v2: config::ProxyProtocolV2::Rejected, handshake_timeout: Duration::from_secs(10), - region: "local".into(), wake_compute_retry_config: RetryConfig::parse(RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)?, connect_compute_locks, connect_to_compute: compute_config, diff --git a/proxy/src/binary/pg_sni_router.rs b/proxy/src/binary/pg_sni_router.rs index 481bd8501c..070c73cdcf 100644 --- a/proxy/src/binary/pg_sni_router.rs +++ b/proxy/src/binary/pg_sni_router.rs @@ -236,7 +236,6 @@ pub(super) async fn task_main( extra: None, }, crate::metrics::Protocol::SniRouter, - "sni", ); handle_client(ctx, dest_suffix, tls_config, compute_tls_config, socket).await } diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index 9215dbf73f..9ead05d492 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -123,12 +123,6 @@ struct ProxyCliArgs { /// timeout for the TLS handshake #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] 
handshake_timeout: tokio::time::Duration, - /// http endpoint to receive periodic metric updates - #[clap(long)] - metric_collection_endpoint: Option, - /// how often metrics should be sent to a collection endpoint - #[clap(long)] - metric_collection_interval: Option, /// cache for `wake_compute` api method (use `size=0` to disable) #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] wake_compute_cache: String, @@ -155,40 +149,31 @@ struct ProxyCliArgs { /// Wake compute rate limiter max number of requests per second. #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] wake_compute_limit: Vec, - /// Redis rate limiter max number of requests per second. - #[clap(long, default_values_t = RateBucketInfo::DEFAULT_REDIS_SET)] - redis_rps_limit: Vec, /// Cancellation channel size (max queue size for redis kv client) #[clap(long, default_value_t = 1024)] cancellation_ch_size: usize, /// Cancellation ops batch size for redis #[clap(long, default_value_t = 8)] cancellation_batch_size: usize, - /// cache for `allowed_ips` (use `size=0` to disable) - #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] - allowed_ips_cache: String, - /// cache for `role_secret` (use `size=0` to disable) - #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] - role_secret_cache: String, - /// redis url for notifications (if empty, redis_host:port will be used for both notifications and streaming connections) - #[clap(long)] - redis_notifications: Option, - /// what from the available authentications type to use for the regional redis we have. Supported are "irsa" and "plain". + /// redis url for plain authentication + #[clap(long, alias("redis-notifications"))] + redis_plain: Option, + /// what from the available authentications type to use for redis. Supported are "irsa" and "plain". 
#[clap(long, default_value = "irsa")] redis_auth_type: String, - /// redis host for streaming connections (might be different from the notifications host) + /// redis host for irsa authentication #[clap(long)] redis_host: Option, - /// redis port for streaming connections (might be different from the notifications host) + /// redis port for irsa authentication #[clap(long)] redis_port: Option, - /// redis cluster name, used in aws elasticache + /// redis cluster name for irsa authentication #[clap(long)] redis_cluster_name: Option, - /// redis user_id, used in aws elasticache + /// redis user_id for irsa authentication #[clap(long)] redis_user_id: Option, - /// aws region to retrieve credentials + /// aws region for irsa authentication #[clap(long, default_value_t = String::new())] aws_region: String, /// cache for `project_info` (use `size=0` to disable) @@ -200,6 +185,12 @@ struct ProxyCliArgs { #[clap(flatten)] parquet_upload: ParquetUploadArgs, + /// http endpoint to receive periodic metric updates + #[clap(long)] + metric_collection_endpoint: Option, + /// how often metrics should be sent to a collection endpoint + #[clap(long)] + metric_collection_interval: Option, /// interval for backup metric collection #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)] metric_backup_collection_interval: std::time::Duration, @@ -212,6 +203,7 @@ struct ProxyCliArgs { /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression. 
#[clap(long, default_value = "4194304")] metric_backup_collection_chunk_size: usize, + /// Whether to retry the connection to the compute node #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)] connect_to_compute_retry: String, @@ -331,7 +323,7 @@ pub async fn run() -> anyhow::Result<()> { Either::Right(auth_backend) => info!("Authentication backend: {auth_backend:?}"), } info!("Using region: {}", args.aws_region); - let (regional_redis_client, redis_notifications_client) = configure_redis(&args).await?; + let redis_client = configure_redis(&args).await?; // Check that we can bind to address before further initialization info!("Starting http on {}", args.http); @@ -386,13 +378,6 @@ pub async fn run() -> anyhow::Result<()> { let cancellation_token = CancellationToken::new(); - let redis_rps_limit = Vec::leak(args.redis_rps_limit.clone()); - RateBucketInfo::validate(redis_rps_limit)?; - - let redis_kv_client = regional_redis_client - .as_ref() - .map(|redis_publisher| RedisKVClient::new(redis_publisher.clone(), redis_rps_limit)); - let cancellation_handler = Arc::new(CancellationHandler::new(&config.connect_to_compute)); let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new_with_shards( @@ -472,6 +457,7 @@ pub async fn run() -> anyhow::Result<()> { client_tasks.spawn(crate::context::parquet::worker( cancellation_token.clone(), args.parquet_upload, + args.region, )); // maintenance tasks. 
these never return unless there's an error @@ -495,32 +481,17 @@ pub async fn run() -> anyhow::Result<()> { #[cfg_attr(not(any(test, feature = "testing")), expect(irrefutable_let_patterns))] if let Either::Left(auth::Backend::ControlPlane(api, ())) = &auth_backend { if let crate::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api { - match (redis_notifications_client, regional_redis_client.clone()) { - (None, None) => {} - (client1, client2) => { - let cache = api.caches.project_info.clone(); - if let Some(client) = client1 { - maintenance_tasks.spawn(notifications::task_main( - client, - cache.clone(), - args.region.clone(), - )); - } - if let Some(client) = client2 { - maintenance_tasks.spawn(notifications::task_main( - client, - cache.clone(), - args.region.clone(), - )); - } - maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); - } - } + if let Some(client) = redis_client { + // project info cache and invalidation of that cache. + let cache = api.caches.project_info.clone(); + maintenance_tasks.spawn(notifications::task_main(client.clone(), cache.clone())); + maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); - // Try to connect to Redis 3 times with 1 + (0..0.1) second interval. - // This prevents immediate exit and pod restart, - // which can cause hammering of the redis in case of connection issues. - if let Some(mut redis_kv_client) = redis_kv_client { + // Try to connect to Redis 3 times with 1 + (0..0.1) second interval. + // This prevents immediate exit and pod restart, + // which can cause hammering of the redis in case of connection issues. 
+ // cancellation key management + let mut redis_kv_client = RedisKVClient::new(client.clone()); for attempt in (0..3).with_position() { match redis_kv_client.try_connect().await { Ok(()) => { @@ -545,14 +516,12 @@ pub async fn run() -> anyhow::Result<()> { } } } - } - if let Some(regional_redis_client) = regional_redis_client { + // listen for notifications of new projects/endpoints/branches let cache = api.caches.endpoints_cache.clone(); - let con = regional_redis_client; let span = tracing::info_span!("endpoints_cache"); maintenance_tasks.spawn( - async move { cache.do_read(con, cancellation_token.clone()).await } + async move { cache.do_read(client, cancellation_token.clone()).await } .instrument(span), ); } @@ -681,7 +650,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { authentication_config, proxy_protocol_v2: args.proxy_protocol_v2, handshake_timeout: args.handshake_timeout, - region: args.region.clone(), wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?, connect_compute_locks, connect_to_compute: compute_config, @@ -843,21 +811,18 @@ fn build_auth_backend( async fn configure_redis( args: &ProxyCliArgs, -) -> anyhow::Result<( - Option, - Option, -)> { +) -> anyhow::Result> { // TODO: untangle the config args - let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) { - ("plain", redis_url) => match redis_url { + let redis_client = match &*args.redis_auth_type { + "plain" => match &args.redis_plain { None => { - bail!("plain auth requires redis_notifications to be set"); + bail!("plain auth requires redis_plain to be set"); } Some(url) => { Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url.clone())) } }, - ("irsa", _) => match (&args.redis_host, args.redis_port) { + "irsa" => match (&args.redis_host, args.redis_port) { (Some(host), Some(port)) => Some( ConnectionWithCredentialsProvider::new_with_credentials_provider( host.clone(), @@ 
-881,18 +846,12 @@ async fn configure_redis( bail!("redis-host and redis-port must be specified together"); } }, - _ => { - bail!("unknown auth type given"); + auth_type => { + bail!("unknown auth type {auth_type:?} given") } }; - let redis_notifications_client = if let Some(url) = &args.redis_notifications { - Some(ConnectionWithCredentialsProvider::new_with_static_credentials(&**url)) - } else { - regional_redis_client.clone() - }; - - Ok((regional_redis_client, redis_notifications_client)) + Ok(redis_client) } #[cfg(test)] diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 248584a19a..cee15ac7fa 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -22,7 +22,6 @@ pub struct ProxyConfig { pub http_config: HttpConfig, pub authentication_config: AuthenticationConfig, pub proxy_protocol_v2: ProxyProtocolV2, - pub region: String, pub handshake_timeout: Duration, pub wake_compute_retry_config: RetryConfig, pub connect_compute_locks: ApiLocks, diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 113a11beab..112465a89b 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -89,12 +89,7 @@ pub async fn task_main( } } - let ctx = RequestContext::new( - session_id, - conn_info, - crate::metrics::Protocol::Tcp, - &config.region, - ); + let ctx = RequestContext::new(session_id, conn_info, crate::metrics::Protocol::Tcp); let res = handle_client( config, diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index 24268997ba..df1c4e194a 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -46,7 +46,6 @@ struct RequestContextInner { pub(crate) session_id: Uuid, pub(crate) protocol: Protocol, first_packet: chrono::DateTime, - region: &'static str, pub(crate) span: Span, // filled in as they are discovered @@ -94,7 +93,6 @@ impl Clone for RequestContext { session_id: inner.session_id, protocol: inner.protocol, first_packet: inner.first_packet, - region: 
inner.region, span: info_span!("background_task"), project: inner.project, @@ -124,12 +122,7 @@ impl Clone for RequestContext { } impl RequestContext { - pub fn new( - session_id: Uuid, - conn_info: ConnectionInfo, - protocol: Protocol, - region: &'static str, - ) -> Self { + pub fn new(session_id: Uuid, conn_info: ConnectionInfo, protocol: Protocol) -> Self { // TODO: be careful with long lived spans let span = info_span!( "connect_request", @@ -145,7 +138,6 @@ impl RequestContext { session_id, protocol, first_packet: Utc::now(), - region, span, project: None, @@ -179,7 +171,7 @@ impl RequestContext { let ip = IpAddr::from([127, 0, 0, 1]); let addr = SocketAddr::new(ip, 5432); let conn_info = ConnectionInfo { addr, extra: None }; - RequestContext::new(Uuid::now_v7(), conn_info, Protocol::Tcp, "test") + RequestContext::new(Uuid::now_v7(), conn_info, Protocol::Tcp) } pub(crate) fn console_application_name(&self) -> String { diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index c9d3905abd..b55cc14532 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -74,7 +74,7 @@ pub(crate) const FAILED_UPLOAD_MAX_RETRIES: u32 = 10; #[derive(parquet_derive::ParquetRecordWriter)] pub(crate) struct RequestData { - region: &'static str, + region: String, protocol: &'static str, /// Must be UTC. 
The derive macro doesn't like the timezones timestamp: chrono::NaiveDateTime, @@ -147,7 +147,7 @@ impl From<&RequestContextInner> for RequestData { }), jwt_issuer: value.jwt_issuer.clone(), protocol: value.protocol.as_str(), - region: value.region, + region: String::new(), error: value.error_kind.as_ref().map(|e| e.to_metric_label()), success: value.success, cold_start_info: value.cold_start_info.as_str(), @@ -167,6 +167,7 @@ impl From<&RequestContextInner> for RequestData { pub async fn worker( cancellation_token: CancellationToken, config: ParquetUploadArgs, + region: String, ) -> anyhow::Result<()> { let Some(remote_storage_config) = config.parquet_upload_remote_storage else { tracing::warn!("parquet request upload: no s3 bucket configured"); @@ -232,12 +233,17 @@ pub async fn worker( .context("remote storage for disconnect events init")?; let parquet_config_disconnect = parquet_config.clone(); tokio::try_join!( - worker_inner(storage, rx, parquet_config), - worker_inner(storage_disconnect, rx_disconnect, parquet_config_disconnect) + worker_inner(storage, rx, parquet_config, ®ion), + worker_inner( + storage_disconnect, + rx_disconnect, + parquet_config_disconnect, + ®ion + ) ) .map(|_| ()) } else { - worker_inner(storage, rx, parquet_config).await + worker_inner(storage, rx, parquet_config, ®ion).await } } @@ -257,6 +263,7 @@ async fn worker_inner( storage: GenericRemoteStorage, rx: impl Stream, config: ParquetConfig, + region: &str, ) -> anyhow::Result<()> { #[cfg(any(test, feature = "testing"))] let storage = if config.test_remote_failures > 0 { @@ -277,7 +284,8 @@ async fn worker_inner( let mut last_upload = time::Instant::now(); let mut len = 0; - while let Some(row) = rx.next().await { + while let Some(mut row) = rx.next().await { + region.clone_into(&mut row.region); rows.push(row); let force = last_upload.elapsed() > config.max_duration; if rows.len() == config.rows_per_group || force { @@ -533,7 +541,7 @@ mod tests { auth_method: None, jwt_issuer: None, 
protocol: ["tcp", "ws", "http"][rng.gen_range(0..3)], - region: "us-east-1", + region: String::new(), error: None, success: rng.r#gen(), cold_start_info: "no", @@ -565,7 +573,9 @@ mod tests { .await .unwrap(); - worker_inner(storage, rx, config).await.unwrap(); + worker_inner(storage, rx, config, "us-east-1") + .await + .unwrap(); let mut files = WalkDir::new(tmpdir.as_std_path()) .into_iter() diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 6947e07488..6b84e47982 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -122,12 +122,7 @@ pub async fn task_main( } } - let ctx = RequestContext::new( - session_id, - conn_info, - crate::metrics::Protocol::Tcp, - &config.region, - ); + let ctx = RequestContext::new(session_id, conn_info, crate::metrics::Protocol::Tcp); let res = handle_client( config, diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index 0cd539188a..2e40f5bf60 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -139,12 +139,6 @@ impl RateBucketInfo { Self::new(200, Duration::from_secs(600)), ]; - // For all the sessions will be cancel key. So this limit is essentially global proxy limit. 
- pub const DEFAULT_REDIS_SET: [Self; 2] = [ - Self::new(100_000, Duration::from_secs(1)), - Self::new(50_000, Duration::from_secs(10)), - ]; - pub fn rps(&self) -> f64 { (self.max_rpi as f64) / self.interval.as_secs_f64() } diff --git a/proxy/src/redis/kv_ops.rs b/proxy/src/redis/kv_ops.rs index f8d3b5cc66..671fe09b0b 100644 --- a/proxy/src/redis/kv_ops.rs +++ b/proxy/src/redis/kv_ops.rs @@ -5,11 +5,9 @@ use redis::aio::ConnectionLike; use redis::{Cmd, FromRedisValue, Pipeline, RedisResult}; use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; -use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; pub struct RedisKVClient { client: ConnectionWithCredentialsProvider, - limiter: GlobalRateLimiter, } #[allow(async_fn_in_trait)] @@ -30,11 +28,8 @@ impl Queryable for Cmd { } impl RedisKVClient { - pub fn new(client: ConnectionWithCredentialsProvider, info: &'static [RateBucketInfo]) -> Self { - Self { - client, - limiter: GlobalRateLimiter::new(info.into()), - } + pub fn new(client: ConnectionWithCredentialsProvider) -> Self { + Self { client } } pub async fn try_connect(&mut self) -> anyhow::Result<()> { @@ -49,11 +44,6 @@ impl RedisKVClient { &mut self, q: &impl Queryable, ) -> anyhow::Result { - if !self.limiter.check() { - tracing::info!("Rate limit exceeded. 
Skipping query"); - return Err(anyhow::anyhow!("Rate limit exceeded")); - } - let e = match q.query(&mut self.client).await { Ok(t) => return Ok(t), Err(e) => e, diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 6c8260027f..973a4c5b02 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -141,29 +141,19 @@ where struct MessageHandler { cache: Arc, - region_id: String, } impl Clone for MessageHandler { fn clone(&self) -> Self { Self { cache: self.cache.clone(), - region_id: self.region_id.clone(), } } } impl MessageHandler { - pub(crate) fn new(cache: Arc, region_id: String) -> Self { - Self { cache, region_id } - } - - pub(crate) async fn increment_active_listeners(&self) { - self.cache.increment_active_listeners().await; - } - - pub(crate) async fn decrement_active_listeners(&self) { - self.cache.decrement_active_listeners().await; + pub(crate) fn new(cache: Arc) -> Self { + Self { cache } } #[tracing::instrument(skip(self, msg), fields(session_id = tracing::field::Empty))] @@ -276,7 +266,7 @@ async fn handle_messages( } let mut conn = match try_connect(&redis).await { Ok(conn) => { - handler.increment_active_listeners().await; + handler.cache.increment_active_listeners().await; conn } Err(e) => { @@ -297,11 +287,11 @@ async fn handle_messages( } } if cancellation_token.is_cancelled() { - handler.decrement_active_listeners().await; + handler.cache.decrement_active_listeners().await; return Ok(()); } } - handler.decrement_active_listeners().await; + handler.cache.decrement_active_listeners().await; } } @@ -310,12 +300,11 @@ async fn handle_messages( pub async fn task_main( redis: ConnectionWithCredentialsProvider, cache: Arc, - region_id: String, ) -> anyhow::Result where C: ProjectInfoCache + Send + Sync + 'static, { - let handler = MessageHandler::new(cache, region_id); + let handler = MessageHandler::new(cache); // 6h - 1m. // There will be 1 minute overlap between two tasks. 
But at least we can be sure that no message is lost. let mut interval = tokio::time::interval(std::time::Duration::from_secs(6 * 60 * 60 - 60)); diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index ed33bf1246..d8942bb814 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -417,12 +417,7 @@ async fn request_handler( if config.http_config.accept_websockets && framed_websockets::upgrade::is_upgrade_request(&request) { - let ctx = RequestContext::new( - session_id, - conn_info, - crate::metrics::Protocol::Ws, - &config.region, - ); + let ctx = RequestContext::new(session_id, conn_info, crate::metrics::Protocol::Ws); ctx.set_user_agent( request @@ -462,12 +457,7 @@ async fn request_handler( // Return the response so the spawned future can continue. Ok(response.map(|b| b.map_err(|x| match x {}).boxed())) } else if request.uri().path() == "/sql" && *request.method() == Method::POST { - let ctx = RequestContext::new( - session_id, - conn_info, - crate::metrics::Protocol::Http, - &config.region, - ); + let ctx = RequestContext::new(session_id, conn_info, crate::metrics::Protocol::Http); let span = ctx.span(); let testodrome_id = request From 605fb04f8912d5939d72c9c7b17b8c543f8fc078 Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Thu, 26 Jun 2025 17:26:24 +0400 Subject: [PATCH 23/50] pageserver: use bounded sender for basebackup cache (#12342) ## Problem Basebackup cache now uses unbounded channel for prepare requests. In theory it can grow large if the cache is hung and does not process the requests. - Part of https://github.com/neondatabase/cloud/issues/29353 ## Summary of changes - Replace an unbounded channel with a bounded one, the size is configurable. - Add `pageserver_basebackup_cache_prepare_queue_size` to observe the size of the queue. 
- Refactor a bit to move all metrics logic to `basebackup_cache.rs` --- libs/pageserver_api/src/config.rs | 4 ++ pageserver/src/basebackup_cache.rs | 109 ++++++++++++++++++++++++----- pageserver/src/bin/pageserver.rs | 13 ++-- pageserver/src/metrics.rs | 8 +++ pageserver/src/page_service.rs | 15 +--- pageserver/src/tenant.rs | 20 +++--- pageserver/src/tenant/mgr.rs | 16 +++-- pageserver/src/tenant/timeline.rs | 30 ++++---- 8 files changed, 142 insertions(+), 73 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 7926e839cf..0cfa1c8485 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -371,6 +371,9 @@ pub struct BasebackupCacheConfig { // TODO(diko): support max_entry_size_bytes. // pub max_entry_size_bytes: u64, pub max_size_entries: usize, + /// Size of the channel used to send prepare requests to the basebackup cache worker. + /// If exceeded, new prepare requests will be dropped. + pub prepare_channel_size: usize, } impl Default for BasebackupCacheConfig { @@ -380,6 +383,7 @@ impl Default for BasebackupCacheConfig { max_total_size_bytes: 1024 * 1024 * 1024, // 1 GiB // max_entry_size_bytes: 16 * 1024 * 1024, // 16 MiB max_size_entries: 1000, + prepare_channel_size: 100, } } } diff --git a/pageserver/src/basebackup_cache.rs b/pageserver/src/basebackup_cache.rs index 69438dae7f..4966fee2d7 100644 --- a/pageserver/src/basebackup_cache.rs +++ b/pageserver/src/basebackup_cache.rs @@ -6,7 +6,7 @@ use metrics::core::{AtomicU64, GenericCounter}; use pageserver_api::{config::BasebackupCacheConfig, models::TenantState}; use tokio::{ io::{AsyncWriteExt, BufWriter}, - sync::mpsc::{UnboundedReceiver, UnboundedSender}, + sync::mpsc::{Receiver, Sender, error::TrySendError}, }; use tokio_util::sync::CancellationToken; use utils::{ @@ -19,8 +19,8 @@ use crate::{ basebackup::send_basebackup_tarball, context::{DownloadBehavior, RequestContext}, metrics::{ - BASEBACKUP_CACHE_ENTRIES, 
BASEBACKUP_CACHE_PREPARE, BASEBACKUP_CACHE_READ, - BASEBACKUP_CACHE_SIZE, + BASEBACKUP_CACHE_ENTRIES, BASEBACKUP_CACHE_PREPARE, BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE, + BASEBACKUP_CACHE_READ, BASEBACKUP_CACHE_SIZE, }, task_mgr::TaskKind, tenant::{ @@ -35,8 +35,8 @@ pub struct BasebackupPrepareRequest { pub lsn: Lsn, } -pub type BasebackupPrepareSender = UnboundedSender; -pub type BasebackupPrepareReceiver = UnboundedReceiver; +pub type BasebackupPrepareSender = Sender; +pub type BasebackupPrepareReceiver = Receiver; #[derive(Clone)] struct CacheEntry { @@ -60,40 +60,65 @@ struct CacheEntry { /// and ~1 RPS for get requests. pub struct BasebackupCache { data_dir: Utf8PathBuf, + config: Option, entries: std::sync::Mutex>, + prepare_sender: BasebackupPrepareSender, + read_hit_count: GenericCounter, read_miss_count: GenericCounter, read_err_count: GenericCounter, + + prepare_skip_count: GenericCounter, } impl BasebackupCache { - /// Creates a BasebackupCache and spawns the background task. - /// The initialization of the cache is performed in the background and does not - /// block the caller. The cache will return `None` for any get requests until - /// initialization is complete. - pub fn spawn( - runtime_handle: &tokio::runtime::Handle, + /// Create a new BasebackupCache instance. + /// Also returns a BasebackupPrepareReceiver which is needed to start + /// the background task. + /// The cache is initialized from the data_dir in the background task. + /// The cache will return `None` for any get requests until the initialization is complete. + /// The background task is spawned separately using [`Self::spawn_background_task`] + /// to avoid a circular dependency between the cache and the tenant manager. 
+ pub fn new( data_dir: Utf8PathBuf, config: Option, - prepare_receiver: BasebackupPrepareReceiver, - tenant_manager: Arc, - cancel: CancellationToken, - ) -> Arc { + ) -> (Arc, BasebackupPrepareReceiver) { + let chan_size = config.as_ref().map(|c| c.max_size_entries).unwrap_or(1); + + let (prepare_sender, prepare_receiver) = tokio::sync::mpsc::channel(chan_size); + let cache = Arc::new(BasebackupCache { data_dir, - + config, entries: std::sync::Mutex::new(HashMap::new()), + prepare_sender, read_hit_count: BASEBACKUP_CACHE_READ.with_label_values(&["hit"]), read_miss_count: BASEBACKUP_CACHE_READ.with_label_values(&["miss"]), read_err_count: BASEBACKUP_CACHE_READ.with_label_values(&["error"]), + + prepare_skip_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["skip"]), }); - if let Some(config) = config { + (cache, prepare_receiver) + } + + /// Spawns the background task. + /// The background task initializes the cache from the disk, + /// processes prepare requests, and cleans up outdated cache entries. + /// Noop if the cache is disabled (config is None). + pub fn spawn_background_task( + self: Arc, + runtime_handle: &tokio::runtime::Handle, + prepare_receiver: BasebackupPrepareReceiver, + tenant_manager: Arc, + cancel: CancellationToken, + ) { + if let Some(config) = self.config.clone() { let background = BackgroundTask { - c: cache.clone(), + c: self, config, tenant_manager, @@ -108,8 +133,45 @@ impl BasebackupCache { }; runtime_handle.spawn(background.run(prepare_receiver)); } + } - cache + /// Send a basebackup prepare request to the background task. + /// The basebackup will be prepared asynchronously, it does not block the caller. + /// The request will be skipped if any cache limits are exceeded. 
+ pub fn send_prepare(&self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, lsn: Lsn) { + let req = BasebackupPrepareRequest { + tenant_shard_id, + timeline_id, + lsn, + }; + + BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE.inc(); + let res = self.prepare_sender.try_send(req); + + if let Err(e) = res { + BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE.dec(); + self.prepare_skip_count.inc(); + match e { + TrySendError::Full(_) => { + // Basebackup prepares are pretty rare, normally we should not hit this. + tracing::info!( + tenant_id = %tenant_shard_id.tenant_id, + %timeline_id, + %lsn, + "Basebackup prepare channel is full, skipping the request" + ); + } + TrySendError::Closed(_) => { + // Normal during shutdown, not critical. + tracing::info!( + tenant_id = %tenant_shard_id.tenant_id, + %timeline_id, + %lsn, + "Basebackup prepare channel is closed, skipping the request" + ); + } + } + } } /// Gets a basebackup entry from the cache. @@ -122,6 +184,10 @@ impl BasebackupCache { timeline_id: TimelineId, lsn: Lsn, ) -> Option { + if !self.is_enabled() { + return None; + } + // Fast path. Check if the entry exists using the in-memory state. let tti = TenantTimelineId::new(tenant_id, timeline_id); if self.entries.lock().unwrap().get(&tti).map(|e| e.lsn) != Some(lsn) { @@ -149,6 +215,10 @@ impl BasebackupCache { } } + pub fn is_enabled(&self) -> bool { + self.config.is_some() + } + // Private methods. fn entry_filename(tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn) -> String { @@ -366,6 +436,7 @@ impl BackgroundTask { loop { tokio::select! 
{ Some(req) = prepare_receiver.recv() => { + BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE.dec(); if let Err(err) = self.prepare_basebackup( req.tenant_shard_id, req.timeline_id, diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index d137d651eb..327384fd82 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -569,8 +569,10 @@ fn start_pageserver( pageserver::l0_flush::L0FlushGlobalState::new(conf.l0_flush.clone()); // Scan the local 'tenants/' directory and start loading the tenants - let (basebackup_prepare_sender, basebackup_prepare_receiver) = - tokio::sync::mpsc::unbounded_channel(); + let (basebackup_cache, basebackup_prepare_receiver) = BasebackupCache::new( + conf.basebackup_cache_dir(), + conf.basebackup_cache_config.clone(), + ); let deletion_queue_client = deletion_queue.new_client(); let background_purges = mgr::BackgroundPurges::default(); @@ -582,7 +584,7 @@ fn start_pageserver( remote_storage: remote_storage.clone(), deletion_queue_client, l0_flush_global_state, - basebackup_prepare_sender, + basebackup_cache: Arc::clone(&basebackup_cache), feature_resolver: feature_resolver.clone(), }, shutdown_pageserver.clone(), @@ -590,10 +592,8 @@ fn start_pageserver( let tenant_manager = Arc::new(tenant_manager); BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(tenant_manager.clone(), order))?; - let basebackup_cache = BasebackupCache::spawn( + basebackup_cache.spawn_background_task( BACKGROUND_RUNTIME.handle(), - conf.basebackup_cache_dir(), - conf.basebackup_cache_config.clone(), basebackup_prepare_receiver, Arc::clone(&tenant_manager), shutdown_pageserver.child_token(), @@ -806,7 +806,6 @@ fn start_pageserver( } else { None }, - basebackup_cache, ); // Spawn a Pageserver gRPC server task. 
It will spawn separate tasks for diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 7929b094b4..21faceef49 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -4439,6 +4439,14 @@ pub(crate) static BASEBACKUP_CACHE_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +pub(crate) static BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge!( + "pageserver_basebackup_cache_prepare_queue_size", + "Number of requests in the basebackup prepare channel" + ) + .expect("failed to define a metric") +}); + static PAGESERVER_CONFIG_IGNORED_ITEMS: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_config_ignored_items", diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index dd02947e5c..0287a2bdb5 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -62,7 +62,6 @@ use utils::{failpoint_support, span_record}; use crate::auth::check_permission; use crate::basebackup::{self, BasebackupError}; -use crate::basebackup_cache::BasebackupCache; use crate::config::PageServerConf; use crate::context::{ DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder, @@ -137,7 +136,6 @@ pub fn spawn( perf_trace_dispatch: Option, tcp_listener: tokio::net::TcpListener, tls_config: Option>, - basebackup_cache: Arc, ) -> Listener { let cancel = CancellationToken::new(); let libpq_ctx = RequestContext::todo_child( @@ -159,7 +157,6 @@ pub fn spawn( conf.pg_auth_type, tls_config, conf.page_service_pipelining.clone(), - basebackup_cache, libpq_ctx, cancel.clone(), ) @@ -218,7 +215,6 @@ pub async fn libpq_listener_main( auth_type: AuthType, tls_config: Option>, pipelining_config: PageServicePipeliningConfig, - basebackup_cache: Arc, listener_ctx: RequestContext, listener_cancel: CancellationToken, ) -> Connections { @@ -262,7 +258,6 @@ pub async fn libpq_listener_main( auth_type, tls_config.clone(), pipelining_config.clone(), - 
Arc::clone(&basebackup_cache), connection_ctx, connections_cancel.child_token(), gate_guard, @@ -305,7 +300,6 @@ async fn page_service_conn_main( auth_type: AuthType, tls_config: Option>, pipelining_config: PageServicePipeliningConfig, - basebackup_cache: Arc, connection_ctx: RequestContext, cancel: CancellationToken, gate_guard: GateGuard, @@ -371,7 +365,6 @@ async fn page_service_conn_main( pipelining_config, conf.get_vectored_concurrent_io, perf_span_fields, - basebackup_cache, connection_ctx, cancel.clone(), gate_guard, @@ -425,8 +418,6 @@ struct PageServerHandler { pipelining_config: PageServicePipeliningConfig, get_vectored_concurrent_io: GetVectoredConcurrentIo, - basebackup_cache: Arc, - gate_guard: GateGuard, } @@ -912,7 +903,6 @@ impl PageServerHandler { pipelining_config: PageServicePipeliningConfig, get_vectored_concurrent_io: GetVectoredConcurrentIo, perf_span_fields: ConnectionPerfSpanFields, - basebackup_cache: Arc, connection_ctx: RequestContext, cancel: CancellationToken, gate_guard: GateGuard, @@ -926,7 +916,6 @@ impl PageServerHandler { cancel, pipelining_config, get_vectored_concurrent_io, - basebackup_cache, gate_guard, } } @@ -2626,9 +2615,7 @@ impl PageServerHandler { && lsn.is_some() && prev_lsn.is_none() { - self.basebackup_cache - .get(tenant_id, timeline_id, lsn.unwrap()) - .await + timeline.get_cached_basebackup(lsn.unwrap()).await } else { None } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index c71655ce17..2613528143 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -80,7 +80,7 @@ use self::timeline::uninit::{TimelineCreateGuard, TimelineExclusionError, Uninit use self::timeline::{ EvictionTaskTenantState, GcCutoffs, TimelineDeleteProgress, TimelineResources, WaitLsnError, }; -use crate::basebackup_cache::BasebackupPrepareSender; +use crate::basebackup_cache::BasebackupCache; use crate::config::PageServerConf; use crate::context; use crate::context::RequestContextBuilder; @@ -162,7 +162,7 @@ 
pub struct TenantSharedResources { pub remote_storage: GenericRemoteStorage, pub deletion_queue_client: DeletionQueueClient, pub l0_flush_global_state: L0FlushGlobalState, - pub basebackup_prepare_sender: BasebackupPrepareSender, + pub basebackup_cache: Arc, pub feature_resolver: FeatureResolver, } @@ -331,7 +331,7 @@ pub struct TenantShard { deletion_queue_client: DeletionQueueClient, /// A channel to send async requests to prepare a basebackup for the basebackup cache. - basebackup_prepare_sender: BasebackupPrepareSender, + basebackup_cache: Arc, /// Cached logical sizes updated updated on each [`TenantShard::gather_size_inputs`]. cached_logical_sizes: tokio::sync::Mutex>, @@ -1363,7 +1363,7 @@ impl TenantShard { remote_storage, deletion_queue_client, l0_flush_global_state, - basebackup_prepare_sender, + basebackup_cache, feature_resolver, } = resources; @@ -1380,7 +1380,7 @@ impl TenantShard { remote_storage.clone(), deletion_queue_client, l0_flush_global_state, - basebackup_prepare_sender, + basebackup_cache, feature_resolver, )); @@ -4380,7 +4380,7 @@ impl TenantShard { remote_storage: GenericRemoteStorage, deletion_queue_client: DeletionQueueClient, l0_flush_global_state: L0FlushGlobalState, - basebackup_prepare_sender: BasebackupPrepareSender, + basebackup_cache: Arc, feature_resolver: FeatureResolver, ) -> TenantShard { assert!(!attached_conf.location.generation.is_none()); @@ -4485,7 +4485,7 @@ impl TenantShard { ongoing_timeline_detach: std::sync::Mutex::default(), gc_block: Default::default(), l0_flush_global_state, - basebackup_prepare_sender, + basebackup_cache, feature_resolver, } } @@ -5414,7 +5414,7 @@ impl TenantShard { pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(), l0_compaction_trigger: self.l0_compaction_trigger.clone(), l0_flush_global_state: self.l0_flush_global_state.clone(), - basebackup_prepare_sender: self.basebackup_prepare_sender.clone(), + basebackup_cache: self.basebackup_cache.clone(), feature_resolver: 
self.feature_resolver.clone(), } } @@ -6000,7 +6000,7 @@ pub(crate) mod harness { ) -> anyhow::Result> { let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager)); - let (basebackup_requst_sender, _) = tokio::sync::mpsc::unbounded_channel(); + let (basebackup_cache, _) = BasebackupCache::new(Utf8PathBuf::new(), None); let tenant = Arc::new(TenantShard::new( TenantState::Attaching, @@ -6018,7 +6018,7 @@ pub(crate) mod harness { self.deletion_queue.new_client(), // TODO: ideally we should run all unit tests with both configs L0FlushGlobalState::new(L0FlushConfig::default()), - basebackup_requst_sender, + basebackup_cache, FeatureResolver::new_disabled(), )); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 76937dd959..0a494e7923 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -2891,14 +2891,18 @@ mod tests { use std::collections::BTreeMap; use std::sync::Arc; + use camino::Utf8PathBuf; use storage_broker::BrokerClientChannel; use tracing::Instrument; use super::super::harness::TenantHarness; use super::TenantsMap; - use crate::tenant::{ - TenantSharedResources, - mgr::{BackgroundPurges, TenantManager, TenantSlot}, + use crate::{ + basebackup_cache::BasebackupCache, + tenant::{ + TenantSharedResources, + mgr::{BackgroundPurges, TenantManager, TenantSlot}, + }, }; #[tokio::test(start_paused = true)] @@ -2924,9 +2928,7 @@ mod tests { // Invoke remove_tenant_from_memory with a cleanup hook that blocks until we manually // permit it to proceed: that will stick the tenant in InProgress - let (basebackup_prepare_sender, _) = tokio::sync::mpsc::unbounded_channel::< - crate::basebackup_cache::BasebackupPrepareRequest, - >(); + let (basebackup_cache, _) = BasebackupCache::new(Utf8PathBuf::new(), None); let tenant_manager = TenantManager { tenants: std::sync::RwLock::new(TenantsMap::Open(tenants)), @@ -2940,7 +2942,7 @@ mod tests { l0_flush_global_state: crate::l0_flush::L0FlushGlobalState::new( 
h.conf.l0_flush.clone(), ), - basebackup_prepare_sender, + basebackup_cache, feature_resolver: crate::feature_resolver::FeatureResolver::new_disabled(), }, cancel: tokio_util::sync::CancellationToken::new(), diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 4ca005bfd4..bec2f0ed52 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -95,12 +95,12 @@ use super::storage_layer::{LayerFringe, LayerVisibilityHint, ReadableLayer}; use super::tasks::log_compaction_error; use super::upload_queue::NotInitialized; use super::{ - AttachedTenantConf, BasebackupPrepareSender, GcError, HeatMapTimeline, MaybeOffloaded, + AttachedTenantConf, GcError, HeatMapTimeline, MaybeOffloaded, debug_assert_current_span_has_tenant_and_timeline_id, }; use crate::PERF_TRACE_TARGET; use crate::aux_file::AuxFileSizeEstimator; -use crate::basebackup_cache::BasebackupPrepareRequest; +use crate::basebackup_cache::BasebackupCache; use crate::config::PageServerConf; use crate::context::{ DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder, @@ -201,7 +201,7 @@ pub struct TimelineResources { pub pagestream_throttle_metrics: Arc, pub l0_compaction_trigger: Arc, pub l0_flush_global_state: l0_flush::L0FlushGlobalState, - pub basebackup_prepare_sender: BasebackupPrepareSender, + pub basebackup_cache: Arc, pub feature_resolver: FeatureResolver, } @@ -448,7 +448,7 @@ pub struct Timeline { wait_lsn_log_slow: tokio::sync::Semaphore, /// A channel to send async requests to prepare a basebackup for the basebackup cache. - basebackup_prepare_sender: BasebackupPrepareSender, + basebackup_cache: Arc, feature_resolver: FeatureResolver, } @@ -2500,6 +2500,13 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.basebackup_cache_enabled) } + /// Try to get a basebackup from the on-disk cache. 
+ pub(crate) async fn get_cached_basebackup(&self, lsn: Lsn) -> Option { + self.basebackup_cache + .get(self.tenant_shard_id.tenant_id, self.timeline_id, lsn) + .await + } + /// Prepare basebackup for the given LSN and store it in the basebackup cache. /// The method is asynchronous and returns immediately. /// The actual basebackup preparation is performed in the background @@ -2521,17 +2528,8 @@ impl Timeline { return; } - let res = self - .basebackup_prepare_sender - .send(BasebackupPrepareRequest { - tenant_shard_id: self.tenant_shard_id, - timeline_id: self.timeline_id, - lsn, - }); - if let Err(e) = res { - // May happen during shutdown, it's not critical. - info!("Failed to send shutdown checkpoint: {e:#}"); - } + self.basebackup_cache + .send_prepare(self.tenant_shard_id, self.timeline_id, lsn); } } @@ -3088,7 +3086,7 @@ impl Timeline { wait_lsn_log_slow: tokio::sync::Semaphore::new(1), - basebackup_prepare_sender: resources.basebackup_prepare_sender, + basebackup_cache: resources.basebackup_cache, feature_resolver: resources.feature_resolver, }; From 33c0d5e2f46f04dc3384095f2b162f15ed9c46b0 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 26 Jun 2025 11:49:08 -0400 Subject: [PATCH 24/50] fix(pageserver): make posthog config parsing more robust (#12356) ## Problem In our infra config, we have to split server_api_key and other fields in two files: the former one in the sops file, and the latter one in the normal config. It creates the situation that we might misconfigure some regions that it only has part of the fields available, causing storcon/pageserver refuse to start. ## Summary of changes Allow PostHog config to have part of the fields available. Parse it later. 
Signed-off-by: Alex Chi Z --- Cargo.lock | 1 + libs/pageserver_api/Cargo.toml | 1 + libs/pageserver_api/src/config.rs | 48 +++++++++++++++++-- pageserver/src/config.rs | 17 +++++++ pageserver/src/feature_resolver.rs | 30 +++++++----- storage_controller/src/main.rs | 12 +++-- .../src/service/feature_flag.rs | 16 ++----- 7 files changed, 95 insertions(+), 30 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1fee728d9c..7098711bb4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4408,6 +4408,7 @@ dependencies = [ "postgres_backend", "postgres_ffi_types", "postgres_versioninfo", + "posthog_client_lite", "rand 0.8.5", "remote_storage", "reqwest", diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index a34e065788..6dc17b670b 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -19,6 +19,7 @@ byteorder.workspace = true utils.workspace = true postgres_ffi_types.workspace = true postgres_versioninfo.workspace = true +posthog_client_lite.workspace = true enum-map.workspace = true strum.workspace = true strum_macros.workspace = true diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 0cfa1c8485..6489fbe9a1 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -4,6 +4,7 @@ use camino::Utf8PathBuf; mod tests; use const_format::formatcp; +use posthog_client_lite::PostHogClientConfig; pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; @@ -68,15 +69,25 @@ impl Display for NodeMetadata { #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub struct PostHogConfig { /// PostHog project ID - pub project_id: String, + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub project_id: Option, /// Server-side (private) API key - pub server_api_key: String, + #[serde(default)] + 
#[serde(skip_serializing_if = "Option::is_none")] + pub server_api_key: Option, /// Client-side (public) API key - pub client_api_key: String, + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub client_api_key: Option, /// Private API URL - pub private_api_url: String, + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub private_api_url: Option, /// Public API URL - pub public_api_url: String, + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub public_api_url: Option, /// Refresh interval for the feature flag spec. /// The storcon will push the feature flag spec to the pageserver. If the pageserver does not receive /// the spec for `refresh_interval`, it will fetch the spec from the PostHog API. @@ -86,6 +97,33 @@ pub struct PostHogConfig { pub refresh_interval: Option, } +impl PostHogConfig { + pub fn try_into_posthog_config(self) -> Result { + let Some(project_id) = self.project_id else { + return Err("project_id is required"); + }; + let Some(server_api_key) = self.server_api_key else { + return Err("server_api_key is required"); + }; + let Some(client_api_key) = self.client_api_key else { + return Err("client_api_key is required"); + }; + let Some(private_api_url) = self.private_api_url else { + return Err("private_api_url is required"); + }; + let Some(public_api_url) = self.public_api_url else { + return Err("public_api_url is required"); + }; + Ok(PostHogClientConfig { + project_id, + server_api_key, + client_api_key, + private_api_url, + public_api_url, + }) + } +} + /// `pageserver.toml` /// /// We use serde derive with `#[serde(default)]` to generate a deserializer diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 5b51a9617b..12e2cd99d9 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -781,4 +781,21 @@ mod tests { PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir) .expect("parse_and_validate"); } + + #[test] + fn 
test_config_posthog_incomplete_config_is_valid() { + let input = r#" + control_plane_api = "http://localhost:6666" + + [posthog_config] + server_api_key = "phs_AAA" + private_api_url = "https://us.posthog.com" + public_api_url = "https://us.i.posthog.com" + "#; + let config_toml = toml_edit::de::from_str::(input) + .expect("posthogconfig is valid"); + let workdir = Utf8PathBuf::from("/nonexistent"); + PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir) + .expect("parse_and_validate"); + } } diff --git a/pageserver/src/feature_resolver.rs b/pageserver/src/feature_resolver.rs index 92a9ef2880..3080b0db34 100644 --- a/pageserver/src/feature_resolver.rs +++ b/pageserver/src/feature_resolver.rs @@ -3,7 +3,7 @@ use std::{collections::HashMap, sync::Arc, time::Duration}; use arc_swap::ArcSwap; use pageserver_api::config::NodeMetadata; use posthog_client_lite::{ - CaptureEvent, FeatureResolverBackgroundLoop, PostHogClientConfig, PostHogEvaluationError, + CaptureEvent, FeatureResolverBackgroundLoop, PostHogEvaluationError, PostHogFlagFilterPropertyValue, }; use remote_storage::RemoteStorageKind; @@ -45,16 +45,24 @@ impl FeatureResolver { ) -> anyhow::Result { // DO NOT block in this function: make it return as fast as possible to avoid startup delays. 
if let Some(posthog_config) = &conf.posthog_config { - let inner = FeatureResolverBackgroundLoop::new( - PostHogClientConfig { - server_api_key: posthog_config.server_api_key.clone(), - client_api_key: posthog_config.client_api_key.clone(), - project_id: posthog_config.project_id.clone(), - private_api_url: posthog_config.private_api_url.clone(), - public_api_url: posthog_config.public_api_url.clone(), - }, - shutdown_pageserver, - ); + let posthog_client_config = match posthog_config.clone().try_into_posthog_config() { + Ok(config) => config, + Err(e) => { + tracing::warn!( + "invalid posthog config, skipping posthog integration: {}", + e + ); + return Ok(FeatureResolver { + inner: None, + internal_properties: None, + force_overrides_for_testing: Arc::new(ArcSwap::new(Arc::new( + HashMap::new(), + ))), + }); + } + }; + let inner = + FeatureResolverBackgroundLoop::new(posthog_client_config, shutdown_pageserver); let inner = Arc::new(inner); // The properties shared by all tenants on this pageserver. 
diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 296a98e620..d1c2858d6f 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -560,9 +560,15 @@ async fn async_main() -> anyhow::Result<()> { let cancel_bg = cancel.clone(); let task = tokio::task::spawn( async move { - let feature_flag_service = FeatureFlagService::new(service, posthog_config); - let feature_flag_service = Arc::new(feature_flag_service); - feature_flag_service.run(cancel_bg).await + match FeatureFlagService::new(service, posthog_config) { + Ok(feature_flag_service) => { + let feature_flag_service = Arc::new(feature_flag_service); + feature_flag_service.run(cancel_bg).await + } + Err(e) => { + tracing::warn!("Failed to create feature flag service: {}", e); + } + }; } .instrument(tracing::info_span!("feature_flag_service")), ); diff --git a/storage_controller/src/service/feature_flag.rs b/storage_controller/src/service/feature_flag.rs index 645eb75237..f44bf046b9 100644 --- a/storage_controller/src/service/feature_flag.rs +++ b/storage_controller/src/service/feature_flag.rs @@ -3,7 +3,7 @@ use std::{sync::Arc, time::Duration}; use futures::StreamExt; use pageserver_api::config::PostHogConfig; use pageserver_client::mgmt_api; -use posthog_client_lite::{PostHogClient, PostHogClientConfig}; +use posthog_client_lite::PostHogClient; use reqwest::StatusCode; use tokio::time::MissedTickBehavior; use tokio_util::sync::CancellationToken; @@ -20,20 +20,14 @@ pub struct FeatureFlagService { const DEFAULT_POSTHOG_REFRESH_INTERVAL: Duration = Duration::from_secs(30); impl FeatureFlagService { - pub fn new(service: Arc, config: PostHogConfig) -> Self { - let client = PostHogClient::new(PostHogClientConfig { - project_id: config.project_id.clone(), - server_api_key: config.server_api_key.clone(), - client_api_key: config.client_api_key.clone(), - private_api_url: config.private_api_url.clone(), - public_api_url: config.public_api_url.clone(), - }); - 
Self { + pub fn new(service: Arc, config: PostHogConfig) -> Result { + let client = PostHogClient::new(config.clone().try_into_posthog_config()?); + Ok(Self { service, config, client, http_client: reqwest::Client::new(), - } + }) } async fn refresh(self: Arc, cancel: CancellationToken) -> Result<(), anyhow::Error> { From a2d2108e6af466bb3d8ec11ab4c019726e07ff5c Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 26 Jun 2025 17:52:15 +0200 Subject: [PATCH 25/50] pageserver: use base backup cache with gRPC (#12352) ## Problem gRPC base backups do not use the base backup cache. Touches https://github.com/neondatabase/neon/issues/11728. ## Summary of changes Integrate gRPC base backups with the base backup cache. Also fixes a bug where the base backup cache did not differentiate between primary/replica base backups (at least I think that's a bug?). --- pageserver/src/page_service.rs | 67 ++++++++++++++++++------------- pageserver/src/tenant/timeline.rs | 24 +++++++++++ 2 files changed, 63 insertions(+), 28 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 0287a2bdb5..c04f6e2b47 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -12,7 +12,7 @@ use std::task::{Context, Poll}; use std::time::{Duration, Instant, SystemTime}; use std::{io, str}; -use anyhow::{Context as _, anyhow, bail}; +use anyhow::{Context as _, bail}; use bytes::{Buf as _, BufMut as _, BytesMut}; use futures::future::BoxFuture; use futures::{FutureExt, Stream}; @@ -2608,18 +2608,9 @@ impl PageServerHandler { } else { let mut writer = BufWriter::new(pgb.copyout_writer()); - let cached = { - // Basebackup is cached only for this combination of parameters. 
- if timeline.is_basebackup_cache_enabled() - && gzip - && lsn.is_some() - && prev_lsn.is_none() - { - timeline.get_cached_basebackup(lsn.unwrap()).await - } else { - None - } - }; + let cached = timeline + .get_cached_basebackup_if_enabled(lsn, prev_lsn, full_backup, replica, gzip) + .await; if let Some(mut cached) = cached { from_cache = true; @@ -3555,21 +3546,41 @@ impl proto::PageService for GrpcPageServiceHandler { page_api::BaseBackupCompression::Gzip => Some(async_compression::Level::Fastest), }; - let result = basebackup::send_basebackup_tarball( - &mut simplex_write, - &timeline, - req.lsn, - None, - req.full, - req.replica, - gzip_level, - &ctx, - ) - .instrument(span) // propagate request span - .await; - simplex_write.shutdown().await.map_err(|err| { - BasebackupError::Server(anyhow!("simplex shutdown failed: {err}")) - })?; + // Check for a cached basebackup. + let cached = timeline + .get_cached_basebackup_if_enabled( + req.lsn, + None, + req.full, + req.replica, + gzip_level.is_some(), + ) + .await; + + let result = if let Some(mut cached) = cached { + // If we have a cached basebackup, send it. + tokio::io::copy(&mut cached, &mut simplex_write) + .await + .map(|_| ()) + .map_err(|err| BasebackupError::Client(err, "cached,copy")) + } else { + basebackup::send_basebackup_tarball( + &mut simplex_write, + &timeline, + req.lsn, + None, + req.full, + req.replica, + gzip_level, + &ctx, + ) + .instrument(span) // propagate request span + .await + }; + simplex_write + .shutdown() + .await + .map_err(|err| BasebackupError::Client(err, "simplex_write"))?; result }); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index bec2f0ed52..04a4bb84a3 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2507,6 +2507,30 @@ impl Timeline { .await } + /// Convenience method to attempt fetching a basebackup for the timeline if enabled and safe for + /// the given request parameters. 
+ /// + /// TODO: consider moving this onto GrpcPageServiceHandler once the libpq handler is gone. + pub async fn get_cached_basebackup_if_enabled( + &self, + lsn: Option, + prev_lsn: Option, + full: bool, + replica: bool, + gzip: bool, + ) -> Option { + if !self.is_basebackup_cache_enabled() || !self.basebackup_cache.is_enabled() { + return None; + } + // We have to know which LSN to fetch the basebackup for. + let lsn = lsn?; + // We only cache gzipped, non-full basebackups for primary computes with automatic prev_lsn. + if prev_lsn.is_some() || full || replica || !gzip { + return None; + } + self.get_cached_basebackup(lsn).await + } + /// Prepare basebackup for the given LSN and store it in the basebackup cache. /// The method is asynchronous and returns immediately. /// The actual basebackup preparation is performed in the background From 232f2447d4717be1331a0ffa82cd172f0b65b166 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 26 Jun 2025 18:29:03 +0200 Subject: [PATCH 26/50] Support pull_timeline of timelines without writes (#12028) Make the safekeeper `pull_timeline` endpoint support timelines that haven't had any writes yet. In the storcon managed sk timelines world, if a safekeeper goes down temporarily, the storcon will schedule a `pull_timeline` call. There is no guarantee however that by when the safekeeper is online again, there have been writes to the timeline yet. The `snapshot` endpoint gives an error if the timeline hasn't had writes, so we avoid calling it if `timeline_start_lsn` indicates a freshly created timeline. 
Fixes #11422 Part of #11670 --- safekeeper/src/pull_timeline.rs | 78 ++++++++++++------- safekeeper/src/wal_storage.rs | 46 +++++------ .../regress/test_storage_controller.py | 16 +++- 3 files changed, 80 insertions(+), 60 deletions(-) diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 14aef1ee5e..1c9e5bade5 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -1,5 +1,6 @@ use std::cmp::min; use std::io::{self, ErrorKind}; +use std::ops::RangeInclusive; use std::sync::Arc; use anyhow::{Context, Result, anyhow, bail}; @@ -34,7 +35,7 @@ use crate::control_file::CONTROL_FILE_NAME; use crate::state::{EvictionState, TimelinePersistentState}; use crate::timeline::{Timeline, TimelineError, WalResidentTimeline}; use crate::timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline}; -use crate::wal_storage::open_wal_file; +use crate::wal_storage::{open_wal_file, wal_file_paths}; use crate::{GlobalTimelines, debug_dump, wal_backup}; /// Stream tar archive of timeline to tx. @@ -95,8 +96,8 @@ pub async fn stream_snapshot( /// State needed while streaming the snapshot. pub struct SnapshotContext { - pub from_segno: XLogSegNo, // including - pub upto_segno: XLogSegNo, // including + /// The interval of segment numbers. 
If None, the timeline hasn't had writes yet, so only send the control file + pub from_to_segno: Option>, pub term: Term, pub last_log_term: Term, pub flush_lsn: Lsn, @@ -174,23 +175,35 @@ pub async fn stream_snapshot_resident_guts( .await?; pausable_failpoint!("sk-snapshot-after-list-pausable"); - let tli_dir = tli.get_timeline_dir(); - info!( - "sending {} segments [{:#X}-{:#X}], term={}, last_log_term={}, flush_lsn={}", - bctx.upto_segno - bctx.from_segno + 1, - bctx.from_segno, - bctx.upto_segno, - bctx.term, - bctx.last_log_term, - bctx.flush_lsn, - ); - for segno in bctx.from_segno..=bctx.upto_segno { - let (mut sf, is_partial) = open_wal_file(&tli_dir, segno, bctx.wal_seg_size).await?; - let mut wal_file_name = XLogFileName(PG_TLI, segno, bctx.wal_seg_size); - if is_partial { - wal_file_name.push_str(".partial"); + if let Some(from_to_segno) = &bctx.from_to_segno { + let tli_dir = tli.get_timeline_dir(); + info!( + "sending {} segments [{:#X}-{:#X}], term={}, last_log_term={}, flush_lsn={}", + from_to_segno.end() - from_to_segno.start() + 1, + from_to_segno.start(), + from_to_segno.end(), + bctx.term, + bctx.last_log_term, + bctx.flush_lsn, + ); + for segno in from_to_segno.clone() { + let Some((mut sf, is_partial)) = + open_wal_file(&tli_dir, segno, bctx.wal_seg_size).await? 
+ else { + // File is not found + let (wal_file_path, _wal_file_partial_path) = + wal_file_paths(&tli_dir, segno, bctx.wal_seg_size); + tracing::warn!("couldn't find WAL segment file {wal_file_path}"); + bail!("couldn't find WAL segment file {wal_file_path}") + }; + let mut wal_file_name = XLogFileName(PG_TLI, segno, bctx.wal_seg_size); + if is_partial { + wal_file_name.push_str(".partial"); + } + ar.append_file(&wal_file_name, &mut sf).await?; } - ar.append_file(&wal_file_name, &mut sf).await?; + } else { + info!("Not including any segments into the snapshot"); } // Do the term check before ar.finish to make archive corrupted in case of @@ -338,19 +351,26 @@ impl WalResidentTimeline { // removed further than `backup_lsn`. Since we're holding shared_state // lock and setting `wal_removal_on_hold` later, it guarantees that WAL // won't be removed until we're done. + let timeline_state = shared_state.sk.state(); let from_lsn = min( - shared_state.sk.state().remote_consistent_lsn, - shared_state.sk.state().backup_lsn, + timeline_state.remote_consistent_lsn, + timeline_state.backup_lsn, + ); + let flush_lsn = shared_state.sk.flush_lsn(); + let (send_segments, msg) = if from_lsn == Lsn::INVALID { + (false, "snapshot is called on uninitialized timeline") + } else { + (true, "timeline is initialized") + }; + tracing::info!( + remote_consistent_lsn=%timeline_state.remote_consistent_lsn, + backup_lsn=%timeline_state.backup_lsn, + %flush_lsn, + "{msg}" ); - if from_lsn == Lsn::INVALID { - // this is possible if snapshot is called before handling first - // elected message - bail!("snapshot is called on uninitialized timeline"); - } let from_segno = from_lsn.segment_number(wal_seg_size); let term = shared_state.sk.state().acceptor_state.term; let last_log_term = shared_state.sk.last_log_term(); - let flush_lsn = shared_state.sk.flush_lsn(); let upto_segno = flush_lsn.segment_number(wal_seg_size); // have some limit on max number of segments as a sanity check const 
MAX_ALLOWED_SEGS: u64 = 1000; @@ -376,9 +396,9 @@ impl WalResidentTimeline { drop(shared_state); let tli_copy = self.wal_residence_guard().await?; + let from_to_segno = send_segments.then_some(from_segno..=upto_segno); let bctx = SnapshotContext { - from_segno, - upto_segno, + from_to_segno, term, last_log_term, flush_lsn, diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index da00df2dd7..33310706be 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -9,7 +9,7 @@ use std::cmp::{max, min}; use std::future::Future; -use std::io::{self, SeekFrom}; +use std::io::{ErrorKind, SeekFrom}; use std::pin::Pin; use anyhow::{Context, Result, bail}; @@ -794,26 +794,13 @@ impl WalReader { // Try to open local file, if we may have WAL locally if self.pos >= self.local_start_lsn { - let res = open_wal_file(&self.timeline_dir, segno, self.wal_seg_size).await; - match res { - Ok((mut file, _)) => { - file.seek(SeekFrom::Start(xlogoff as u64)).await?; - return Ok(Box::pin(file)); - } - Err(e) => { - let is_not_found = e.chain().any(|e| { - if let Some(e) = e.downcast_ref::() { - e.kind() == io::ErrorKind::NotFound - } else { - false - } - }); - if !is_not_found { - return Err(e); - } - // NotFound is expected, fall through to remote read - } - }; + let res = open_wal_file(&self.timeline_dir, segno, self.wal_seg_size).await?; + if let Some((mut file, _)) = res { + file.seek(SeekFrom::Start(xlogoff as u64)).await?; + return Ok(Box::pin(file)); + } else { + // NotFound is expected, fall through to remote read + } } // Try to open remote file, if remote reads are enabled @@ -832,26 +819,31 @@ pub(crate) async fn open_wal_file( timeline_dir: &Utf8Path, segno: XLogSegNo, wal_seg_size: usize, -) -> Result<(tokio::fs::File, bool)> { +) -> Result> { let (wal_file_path, wal_file_partial_path) = wal_file_paths(timeline_dir, segno, wal_seg_size); // First try to open the .partial file. 
let mut partial_path = wal_file_path.to_owned(); partial_path.set_extension("partial"); if let Ok(opened_file) = tokio::fs::File::open(&wal_file_partial_path).await { - return Ok((opened_file, true)); + return Ok(Some((opened_file, true))); } // If that failed, try it without the .partial extension. - let pf = tokio::fs::File::open(&wal_file_path) - .await + let pf_res = tokio::fs::File::open(&wal_file_path).await; + if let Err(e) = &pf_res { + if e.kind() == ErrorKind::NotFound { + return Ok(None); + } + } + let pf = pf_res .with_context(|| format!("failed to open WAL file {wal_file_path:#}")) .map_err(|e| { - warn!("{}", e); + warn!("{e}"); e })?; - Ok((pf, false)) + Ok(Some((pf, false))) } /// Helper returning full path to WAL segment file and its .partial brother. diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 70772766d7..290ebe456b 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -4168,13 +4168,20 @@ class DeletionSubject(Enum): TENANT = "tenant" +class EmptyTimeline(Enum): + EMPTY = "empty" + NONEMPTY = "nonempty" + + @run_only_on_default_postgres("PG version is not interesting here") @pytest.mark.parametrize("restart_storcon", [RestartStorcon.RESTART, RestartStorcon.ONLINE]) @pytest.mark.parametrize("deletetion_subject", [DeletionSubject.TENANT, DeletionSubject.TIMELINE]) +@pytest.mark.parametrize("empty_timeline", [EmptyTimeline.EMPTY, EmptyTimeline.NONEMPTY]) def test_storcon_create_delete_sk_down( neon_env_builder: NeonEnvBuilder, restart_storcon: RestartStorcon, deletetion_subject: DeletionSubject, + empty_timeline: EmptyTimeline, ): """ Test that the storcon can create and delete tenants and timelines with a safekeeper being down. 
@@ -4226,10 +4233,11 @@ def test_storcon_create_delete_sk_down(
         ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3])
         ep.safe_psql("CREATE TABLE IF NOT EXISTS t(key int, value text)")
 
-    with env.endpoints.create("child_of_main", tenant_id=tenant_id) as ep:
-        # endpoint should start.
-        ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3])
-        ep.safe_psql("CREATE TABLE IF NOT EXISTS t(key int, value text)")
+    if empty_timeline == EmptyTimeline.NONEMPTY:
+        with env.endpoints.create("child_of_main", tenant_id=tenant_id) as ep:
+            # endpoint should start.
+            ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3])
+            ep.safe_psql("CREATE TABLE IF NOT EXISTS t(key int, value text)")
 
     env.storage_controller.assert_log_contains("writing pending op for sk id 1")
     env.safekeepers[0].start()

From 72b3c9cd11e909cbd7ad507da97dda38cde015b0 Mon Sep 17 00:00:00 2001
From: Vlad Lazar
Date: Thu, 26 Jun 2025 17:35:34 +0100
Subject: [PATCH 27/50] pageserver: fix wal receiver hang on remote client
 shutdown (#12348)

## Problem

During shard splits we shut down the remote client early and allow the
parent shard to keep ingesting data. While ingesting data, the wal
receiver task may wait for the current flush to complete in order to
apply backpressure. Notifications are delivered via
`Timeline::layer_flush_done_tx`. When the remote client was being shut
down the flush loop exited without delivering a notification. This left
`Timeline::wait_flush_completion` hanging indefinitely which blocked the
shutdown of the wal receiver task, and, hence, the shard split.

## Summary of Changes

Deliver a final notification when the flush loop is shutting down
without the timeline cancel cancellation token having fired.

I tried writing a test for this, but got stuck in failpoint hell and
decided it's not worth it. `test_sharding_autosplit`, which reproduces
this reliably in CI, passed with the proposed fix in
https://github.com/neondatabase/neon/pull/12304.
Closes https://github.com/neondatabase/neon/issues/12060 --- pageserver/src/tenant/timeline.rs | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 04a4bb84a3..7261ce783d 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4680,6 +4680,16 @@ impl Timeline { mut layer_flush_start_rx: tokio::sync::watch::Receiver<(u64, Lsn)>, ctx: &RequestContext, ) { + // Always notify waiters about the flush loop exiting since the loop might stop + // when the timeline hasn't been cancelled. + let scopeguard_rx = layer_flush_start_rx.clone(); + scopeguard::defer! { + let (flush_counter, _) = *scopeguard_rx.borrow(); + let _ = self + .layer_flush_done_tx + .send_replace((flush_counter, Err(FlushLayerError::Cancelled))); + } + // Subscribe to L0 delta layer updates, for compaction backpressure. let mut watch_l0 = match self .layers @@ -4709,9 +4719,6 @@ impl Timeline { let result = loop { if self.cancel.is_cancelled() { info!("dropping out of flush loop for timeline shutdown"); - // Note: we do not bother transmitting into [`layer_flush_done_tx`], because - // anyone waiting on that will respect self.cancel as well: they will stop - // waiting at the same time we as drop out of this loop. return; } From 10afac87e761c223f2f6c62a28b7d1717f45c544 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 26 Jun 2025 19:45:34 +0300 Subject: [PATCH 28/50] impr(ci): Remove unnecessary 'make postgres-headers' build step (#12354) The 'make postgres' step includes installation of the headers, no need to do that separately. 
--- .github/workflows/build-macos.yml | 5 ----- Makefile | 6 +++++- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build-macos.yml b/.github/workflows/build-macos.yml index 160c3d05bc..7fd2626332 100644 --- a/.github/workflows/build-macos.yml +++ b/.github/workflows/build-macos.yml @@ -94,11 +94,6 @@ jobs: run: | make "neon-pg-ext-${{ matrix.postgres-version }}" -j$(sysctl -n hw.ncpu) - - name: Get postgres headers ${{ matrix.postgres-version }} - if: steps.cache_pg.outputs.cache-hit != 'true' - run: | - make postgres-headers-${{ matrix.postgres-version }} -j$(sysctl -n hw.ncpu) - - name: Upload "pg_install/${{ matrix.postgres-version }}" artifact uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: diff --git a/Makefile b/Makefile index d39b9b68c8..a43411518d 100644 --- a/Makefile +++ b/Makefile @@ -147,7 +147,11 @@ postgres-configure-v15: $(BUILD_DIR)/v15/config.status .PHONY: postgres-configure-v14 postgres-configure-v14: $(BUILD_DIR)/v14/config.status -# Install the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)//include +# Install just the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)//include +# +# This is implicitly included in the 'postgres-%' rule, but this can be handy if you +# want to just install the headers without building PostgreSQL, e.g. for building +# extensions. .PHONY: postgres-headers-% postgres-headers-%: postgres-configure-% +@echo "Installing PostgreSQL $* headers" From 6fa1562b571538278c586b5a68d0b9a88a85cf57 Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Fri, 27 Jun 2025 13:18:18 +0400 Subject: [PATCH 29/50] pageserver: increase default max_size_entries limit for basebackup cache (#12343) ## Problem Some pageservers hit `max_size_entries` limit in staging with only ~25 MiB storage used by basebackup cache. The limit is too strict. It should be safe to relax it. 
- Part of https://github.com/neondatabase/cloud/issues/29353 ## Summary of changes - Increase the default `max_size_entries` from 1000 to 10000 --- libs/pageserver_api/src/config.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 6489fbe9a1..00d6b61399 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -420,7 +420,7 @@ impl Default for BasebackupCacheConfig { cleanup_period: Duration::from_secs(60), max_total_size_bytes: 1024 * 1024 * 1024, // 1 GiB // max_entry_size_bytes: 16 * 1024 * 1024, // 16 MiB - max_size_entries: 1000, + max_size_entries: 10000, prepare_channel_size: 100, } } From abc1efd5a63a186cc81ba30321fd34c50f5c42f4 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 27 Jun 2025 11:36:27 +0100 Subject: [PATCH 30/50] [proxy] fix connect_to_compute retry handling (#12351) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Problem In #12335 I moved the `authenticate` method outside of the `connect_to_compute` loop. This triggered [e2e tests to become flaky](https://github.com/neondatabase/cloud/pull/30533). This highlighted an edge case we forgot to consider with that change. When we connect to compute, the compute IP might be cached. This cache hit might however be stale. Because we can't validate the IP is associated with a specific compute-id☨, we will succeed the connect_to_compute operation and fail when it comes to password authentication☨☨. Before the change, we were invalidating the cache and triggering wake_compute if the authentication failed. Additionally, I noticed some faulty logic I introduced 1 year ago https://github.com/neondatabase/neon/pull/8141/files#diff-5491e3afe62d8c5c77178149c665603b29d88d3ec2e47fc1b3bb119a0a970afaL145-R147 ☨ We can when we roll out TLS, as the certificate common name includes the compute-id. 
☨☨ Technically password authentication could pass for the wrong compute, but I think this would only happen in the very very rare event that the IP got reused **and** the compute's endpoint happened to be a branch/replica. # Solution 1. Fix the broken logic 2. Simplify cache invalidation (I don't know why it was so convoluted) 3. Add a loop around connect_to_compute + authenticate to re-introduce the wake_compute invalidation we accidentally removed. I went with this approach to try and avoid interfering with https://github.com/neondatabase/neon/compare/main...cloneable/proxy-pglb-connect-compute-split. The changes made in commit 3 will move into `handle_client_request` I suspect, --- proxy/src/cache/timed_lru.rs | 63 ++++++++------------------ proxy/src/compute/mod.rs | 4 +- proxy/src/console_redirect_proxy.rs | 2 +- proxy/src/proxy/connect_compute.rs | 2 +- proxy/src/proxy/mod.rs | 69 +++++++++++++++++++++-------- proxy/src/proxy/retry.rs | 10 ++++- proxy/src/proxy/tests/mod.rs | 59 +++++++++++++++++++++--- 7 files changed, 137 insertions(+), 72 deletions(-) diff --git a/proxy/src/cache/timed_lru.rs b/proxy/src/cache/timed_lru.rs index 7cfe5100ea..183e1ea449 100644 --- a/proxy/src/cache/timed_lru.rs +++ b/proxy/src/cache/timed_lru.rs @@ -30,7 +30,7 @@ use super::{Cache, timed_lru}; /// /// * There's an API for immediate invalidation (removal) of a cache entry; /// It's useful in case we know for sure that the entry is no longer correct. -/// See [`timed_lru::LookupInfo`] & [`timed_lru::Cached`] for more information. +/// See [`timed_lru::Cached`] for more information. /// /// * Expired entries are kept in the cache, until they are evicted by the LRU policy, /// or by a successful lookup (i.e. the entry hasn't expired yet). 
@@ -54,7 +54,7 @@ pub(crate) struct TimedLru { impl Cache for TimedLru { type Key = K; type Value = V; - type LookupInfo = LookupInfo; + type LookupInfo = Key; fn invalidate(&self, info: &Self::LookupInfo) { self.invalidate_raw(info); @@ -87,30 +87,24 @@ impl TimedLru { /// Drop an entry from the cache if it's outdated. #[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)] - fn invalidate_raw(&self, info: &LookupInfo) { - let now = Instant::now(); - + fn invalidate_raw(&self, key: &K) { // Do costly things before taking the lock. let mut cache = self.cache.lock(); - let raw_entry = match cache.raw_entry_mut().from_key(&info.key) { + let entry = match cache.raw_entry_mut().from_key(key) { RawEntryMut::Vacant(_) => return, - RawEntryMut::Occupied(x) => x, + RawEntryMut::Occupied(x) => x.remove(), }; - - // Remove the entry if it was created prior to lookup timestamp. - let entry = raw_entry.get(); - let (created_at, expires_at) = (entry.created_at, entry.expires_at); - let should_remove = created_at <= info.created_at || expires_at <= now; - - if should_remove { - raw_entry.remove(); - } - drop(cache); // drop lock before logging + + let Entry { + created_at, + expires_at, + .. 
+ } = entry; + debug!( - created_at = format_args!("{created_at:?}"), - expires_at = format_args!("{expires_at:?}"), - entry_removed = should_remove, + ?created_at, + ?expires_at, "processed a cache entry invalidation event" ); } @@ -211,10 +205,10 @@ impl TimedLru { } pub(crate) fn insert_unit(&self, key: K, value: V) -> (Option, Cached<&Self, ()>) { - let (created_at, old) = self.insert_raw(key.clone(), value); + let (_, old) = self.insert_raw(key.clone(), value); let cached = Cached { - token: Some((self, LookupInfo { created_at, key })), + token: Some((self, key)), value: (), }; @@ -229,28 +223,9 @@ impl TimedLru { K: Borrow + Clone, Q: Hash + Eq + ?Sized, { - self.get_raw(key, |key, entry| { - let info = LookupInfo { - created_at: entry.created_at, - key: key.clone(), - }; - - Cached { - token: Some((self, info)), - value: entry.value.clone(), - } + self.get_raw(key, |key, entry| Cached { + token: Some((self, key.clone())), + value: entry.value.clone(), }) } } - -/// Lookup information for key invalidation. -pub(crate) struct LookupInfo { - /// Time of creation of a cache [`Entry`]. - /// We use this during invalidation lookups to prevent eviction of a newer - /// entry sharing the same key (it might've been inserted by a different - /// task after we got the entry we're trying to invalidate now). - created_at: Instant, - - /// Search by this key. - key: K, -} diff --git a/proxy/src/compute/mod.rs b/proxy/src/compute/mod.rs index 7fb88e6a45..0a19090ce0 100644 --- a/proxy/src/compute/mod.rs +++ b/proxy/src/compute/mod.rs @@ -236,7 +236,7 @@ impl AuthInfo { &self, ctx: &RequestContext, compute: &mut ComputeConnection, - user_info: ComputeUserInfo, + user_info: &ComputeUserInfo, ) -> Result { // client config with stubbed connect info. 
// TODO(conrad): should we rewrite this to bypass tokio-postgres2 entirely, @@ -272,7 +272,7 @@ impl AuthInfo { secret_key, }, compute.hostname.to_string(), - user_info, + user_info.clone(), ); Ok(PostgresSettings { diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 112465a89b..d5903286a0 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -226,7 +226,7 @@ pub(crate) async fn handle_client( .await?; let pg_settings = auth_info - .authenticate(ctx, &mut node, user_info) + .authenticate(ctx, &mut node, &user_info) .or_else(|e| async { Err(stream.throw_error(e, Some(ctx)).await) }) .await?; diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index aa675a439e..9f642f52ab 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -112,7 +112,7 @@ where let node_info = if !node_info.cached() || !err.should_retry_wake_compute() { // If we just recieved this from cplane and didn't get it from cache, we shouldn't retry. // Do not need to retrieve a new node_info, just return the old one. 
- if should_retry(&err, num_retries, compute.retry) { + if !should_retry(&err, num_retries, compute.retry) { Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { outcome: ConnectOutcome::Failed, diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 6b84e47982..d9c0585efb 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -18,9 +18,11 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tokio_util::sync::CancellationToken; use tracing::{Instrument, debug, error, info, warn}; +use crate::cache::Cache; use crate::cancellation::{self, CancellationHandler}; use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig}; use crate::context::RequestContext; +use crate::control_plane::client::ControlPlaneClient; use crate::error::{ReportableError, UserFacingError}; use crate::metrics::{Metrics, NumClientConnectionsGuard}; pub use crate::pglb::copy_bidirectional::{ErrorSource, copy_bidirectional_client_compute}; @@ -29,6 +31,7 @@ use crate::pglb::passthrough::ProxyPassthrough; use crate::pqproto::{BeMessage, CancelKeyData, StartupMessageParams}; use crate::protocol2::{ConnectHeader, ConnectionInfo, ConnectionInfoExtra, read_proxy_protocol}; use crate::proxy::connect_compute::{TcpMechanism, connect_to_compute}; +use crate::proxy::retry::ShouldRetryWakeCompute; use crate::rate_limiter::EndpointRateLimiter; use crate::stream::{PqStream, Stream}; use crate::types::EndpointCacheKey; @@ -349,26 +352,56 @@ pub(crate) async fn handle_client( let mut auth_info = compute::AuthInfo::with_auth_keys(creds.keys); auth_info.set_startup_params(¶ms, params_compat); - let res = connect_to_compute( - ctx, - &TcpMechanism { - locks: &config.connect_compute_locks, - }, - &auth::Backend::ControlPlane(cplane, creds.info.clone()), - config.wake_compute_retry_config, - &config.connect_to_compute, - ) - .await; - - let mut node = match res { - Ok(node) => node, - Err(e) => Err(stream.throw_error(e, Some(ctx)).await)?, + let mut node; + let mut attempt = 0; + let 
connect = TcpMechanism { + locks: &config.connect_compute_locks, }; + let backend = auth::Backend::ControlPlane(cplane, creds.info); - let pg_settings = auth_info.authenticate(ctx, &mut node, creds.info).await; - let pg_settings = match pg_settings { - Ok(pg_settings) => pg_settings, - Err(e) => Err(stream.throw_error(e, Some(ctx)).await)?, + // NOTE: This is messy, but should hopefully be detangled with PGLB. + // We wanted to separate the concerns of **connect** to compute (a PGLB operation), + // from **authenticate** to compute (a NeonKeeper operation). + // + // This unfortunately removed retry handling for one error case where + // the compute was cached, and we connected, but the compute cache was actually stale + // and is associated with the wrong endpoint. We detect this when the **authentication** fails. + // As such, we retry once here if the `authenticate` function fails and the error is valid to retry. + let pg_settings = loop { + attempt += 1; + + let res = connect_to_compute( + ctx, + &connect, + &backend, + config.wake_compute_retry_config, + &config.connect_to_compute, + ) + .await; + + match res { + Ok(n) => node = n, + Err(e) => return Err(stream.throw_error(e, Some(ctx)).await)?, + } + + let auth::Backend::ControlPlane(cplane, user_info) = &backend else { + unreachable!("ensured above"); + }; + + let res = auth_info.authenticate(ctx, &mut node, user_info).await; + match res { + Ok(pg_settings) => break pg_settings, + Err(e) if attempt < 2 && e.should_retry_wake_compute() => { + tracing::warn!(error = ?e, "retrying wake compute"); + + #[allow(irrefutable_let_patterns)] + if let ControlPlaneClient::ProxyV1(cplane_proxy_v1) = &**cplane { + let key = user_info.endpoint_cache_key(); + cplane_proxy_v1.caches.node_info.invalidate(&key); + } + } + Err(e) => Err(stream.throw_error(e, Some(ctx)).await)?, + } }; let session = cancellation_handler.get_key(); diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs index e9eca95724..b06c3be72c 
100644 --- a/proxy/src/proxy/retry.rs +++ b/proxy/src/proxy/retry.rs @@ -3,7 +3,7 @@ use std::io; use tokio::time; -use crate::compute; +use crate::compute::{self, PostgresError}; use crate::config::RetryConfig; pub(crate) trait CouldRetry { @@ -115,6 +115,14 @@ impl ShouldRetryWakeCompute for compute::ConnectionError { } } +impl ShouldRetryWakeCompute for PostgresError { + fn should_retry_wake_compute(&self) -> bool { + match self { + PostgresError::Postgres(error) => error.should_retry_wake_compute(), + } + } +} + pub(crate) fn retry_after(num_retries: u32, config: RetryConfig) -> time::Duration { config .base_delay diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 29a269208a..4f27496019 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -374,6 +374,7 @@ fn connect_compute_total_wait() { #[derive(Clone, Copy, Debug)] enum ConnectAction { Wake, + WakeCold, WakeFail, WakeRetry, Connect, @@ -504,6 +505,9 @@ impl TestControlPlaneClient for TestConnectMechanism { *counter += 1; match action { ConnectAction::Wake => Ok(helper_create_cached_node_info(self.cache)), + ConnectAction::WakeCold => Ok(CachedNodeInfo::new_uncached( + helper_create_uncached_node_info(), + )), ConnectAction::WakeFail => { let err = control_plane::errors::ControlPlaneError::Message(Box::new( ControlPlaneErrorMessage { @@ -551,8 +555,8 @@ impl TestControlPlaneClient for TestConnectMechanism { } } -fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo { - let node = NodeInfo { +fn helper_create_uncached_node_info() -> NodeInfo { + NodeInfo { conn_info: compute::ConnectInfo { host: "test".into(), port: 5432, @@ -566,7 +570,11 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn compute_id: "compute".into(), cold_start_info: crate::control_plane::messages::ColdStartInfo::Warm, }, - }; + } +} + +fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo { + let 
node = helper_create_uncached_node_info(); let (_, node2) = cache.insert_unit("key".into(), Ok(node.clone())); node2.map(|()| node) } @@ -742,7 +750,7 @@ async fn fail_no_wake_skips_cache_invalidation() { let ctx = RequestContext::test(); let mech = TestConnectMechanism::new(vec![ ConnectAction::Wake, - ConnectAction::FailNoWake, + ConnectAction::RetryNoWake, ConnectAction::Connect, ]); let user = helper_create_connect_info(&mech); @@ -788,7 +796,7 @@ async fn retry_no_wake_skips_invalidation() { let ctx = RequestContext::test(); // Wake → RetryNoWake (retryable + NOT wakeable) - let mechanism = TestConnectMechanism::new(vec![Wake, RetryNoWake]); + let mechanism = TestConnectMechanism::new(vec![Wake, RetryNoWake, Fail]); let user_info = helper_create_connect_info(&mechanism); let cfg = config(); @@ -802,3 +810,44 @@ async fn retry_no_wake_skips_invalidation() { "invalidating stalled compute node info cache entry" )); } + +#[tokio::test] +#[traced_test] +async fn retry_no_wake_error_fast() { + let _ = env_logger::try_init(); + use ConnectAction::*; + + let ctx = RequestContext::test(); + // Wake → FailNoWake (not retryable + NOT wakeable) + let mechanism = TestConnectMechanism::new(vec![Wake, FailNoWake]); + let user_info = helper_create_connect_info(&mechanism); + let cfg = config(); + + connect_to_compute(&ctx, &mechanism, &user_info, cfg.retry, &cfg) + .await + .unwrap_err(); + mechanism.verify(); + + // Because FailNoWake has wakeable=false, we must NOT see invalidate_cache + assert!(!logs_contain( + "invalidating stalled compute node info cache entry" + )); +} + +#[tokio::test] +#[traced_test] +async fn retry_cold_wake_skips_invalidation() { + let _ = env_logger::try_init(); + use ConnectAction::*; + + let ctx = RequestContext::test(); + // WakeCold → FailNoWake (not retryable + NOT wakeable) + let mechanism = TestConnectMechanism::new(vec![WakeCold, Retry, Connect]); + let user_info = helper_create_connect_info(&mechanism); + let cfg = config(); + + 
connect_to_compute(&ctx, &mechanism, &user_info, cfg.retry, &cfg) + .await + .unwrap(); + mechanism.verify(); +} From ebc12a388c74911d2a67805e00af9348ce0b5e1d Mon Sep 17 00:00:00 2001 From: Mikhail Date: Fri, 27 Jun 2025 12:06:27 +0100 Subject: [PATCH 31/50] fix: endpoint_storage_addr as String (#12359) It's not a SocketAddr as we use k8s DNS https://github.com/neondatabase/cloud/issues/19011 --- compute_tools/src/compute.rs | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 70b2d28bf2..cf558ee01a 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -20,7 +20,6 @@ use postgres::NoTls; use postgres::error::SqlState; use remote_storage::{DownloadError, RemotePath}; use std::collections::{HashMap, HashSet}; -use std::net::SocketAddr; use std::os::unix::fs::{PermissionsExt, symlink}; use std::path::Path; use std::process::{Command, Stdio}; @@ -218,7 +217,8 @@ pub struct ParsedSpec { pub pageserver_connstr: String, pub safekeeper_connstrings: Vec, pub storage_auth_token: Option, - pub endpoint_storage_addr: Option, + /// k8s dns name and port + pub endpoint_storage_addr: Option, pub endpoint_storage_token: Option, } @@ -313,13 +313,10 @@ impl TryFrom for ParsedSpec { .or(Err("invalid timeline id"))? 
}; - let endpoint_storage_addr: Option = spec + let endpoint_storage_addr: Option = spec .endpoint_storage_addr .clone() - .or_else(|| spec.cluster.settings.find("neon.endpoint_storage_addr")) - .unwrap_or_default() - .parse() - .ok(); + .or_else(|| spec.cluster.settings.find("neon.endpoint_storage_addr")); let endpoint_storage_token = spec .endpoint_storage_token .clone() From ebb6e26a64b368f865bed420de274d23d46e0caa Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 27 Jun 2025 13:46:18 +0100 Subject: [PATCH 32/50] pageserver: handle multiple attached children in shard resolution (#12336) ## Problem When resolving a shard during a split we might have multiple attached shards with the old shard count (i.e. not all of them are marked in progress and ignored). Hence, we can compute the desired shard number based on the old shard count and misroute the request. ## Summary of Changes Recompute the desired shard every time the shard count changes during the iteration --- pageserver/src/tenant/mgr.rs | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 0a494e7923..248d92622e 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -2200,7 +2200,7 @@ impl TenantManager { selector: ShardSelector, ) -> ShardResolveResult { let tenants = self.tenants.read().unwrap(); - let mut want_shard = None; + let mut want_shard: Option = None; let mut any_in_progress = None; match &*tenants { @@ -2225,14 +2225,23 @@ impl TenantManager { return ShardResolveResult::Found(tenant.clone()); } ShardSelector::Page(key) => { - // First slot we see for this tenant, calculate the expected shard number - // for the key: we will use this for checking if this and subsequent - // slots contain the key, rather than recalculating the hash each time. 
- if want_shard.is_none() { - want_shard = Some(tenant.shard_identity.get_shard_number(&key)); + // Each time we find an attached slot with a different shard count, + // recompute the expected shard number: during shard splits we might + // have multiple shards with the old shard count. + if want_shard.is_none() + || want_shard.unwrap().shard_count != tenant.shard_identity.count + { + want_shard = Some(ShardIndex { + shard_number: tenant.shard_identity.get_shard_number(&key), + shard_count: tenant.shard_identity.count, + }); } - if Some(tenant.shard_identity.number) == want_shard { + if Some(ShardIndex { + shard_number: tenant.shard_identity.number, + shard_count: tenant.shard_identity.count, + }) == want_shard + { return ShardResolveResult::Found(tenant.clone()); } } From cc1664ef93171774b5b1d127ab9f5a978cf62f99 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 27 Jun 2025 14:13:11 +0100 Subject: [PATCH 33/50] pageserver: allow flush task cancelled error in sharding autosplit test (#12374) ## Problem Test is failing due to compaction shutdown noise (see https://github.com/neondatabase/neon/issues/12162). ## Summary of changes Allow list the noise. 
--- test_runner/performance/test_sharding_autosplit.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test_runner/performance/test_sharding_autosplit.py b/test_runner/performance/test_sharding_autosplit.py index 04bebae92f..0bb210db23 100644 --- a/test_runner/performance/test_sharding_autosplit.py +++ b/test_runner/performance/test_sharding_autosplit.py @@ -62,7 +62,8 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): ps.allowed_errors.extend( [ # We shut down pageservers while they might have some compaction work going on - ".*Compaction failed.*shutting down.*" + ".*Compaction failed.*shutting down.*", + ".*flush task cancelled.*", ] ) From 6f4198c78a93958ee02301b26ada27c9e91be494 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Fri, 27 Jun 2025 15:49:26 +0200 Subject: [PATCH 34/50] treat strategy flag test_maintenance as boolean data type (#12373) ## Problem In large oltp test run https://github.com/neondatabase/neon/actions/runs/15905488707/job/44859116742 we see that the `Benchmark database maintenance` step is skipped in all 3 strategy variants, however it should be executed in two. 
This is due to treating the `test_maintenance` boolean type in the strategy in the condition of the `Benchmark database maintenance` step ## Summary of changes Use a boolean condition instead of a string comparison ## Test run from this pull request branch https://github.com/neondatabase/neon/actions/runs/15923605412 --- .github/workflows/large_oltp_benchmark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/large_oltp_benchmark.yml b/.github/workflows/large_oltp_benchmark.yml index 050b9047c7..4f6858dcae 100644 --- a/.github/workflows/large_oltp_benchmark.yml +++ b/.github/workflows/large_oltp_benchmark.yml @@ -153,7 +153,7 @@ jobs: PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" - name: Benchmark database maintenance - if: ${{ matrix.test_maintenance == 'true' }} + if: ${{ matrix.test_maintenance }} uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} From 37e181af8ae27963c02f6312e23362c395a84404 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 27 Jun 2025 15:51:59 +0200 Subject: [PATCH 35/50] Update rust to 1.88.0 (#12364) We keep the practice of keeping the compiler up to date, pointing to the latest release. This is done by many other projects in the Rust ecosystem as well. 
[Announcement blog post](https://blog.rust-lang.org/2025/06/26/Rust-1.88.0/) Prior update was in https://github.com/neondatabase/neon/pull/11938 --- build-tools.Dockerfile | 4 ++-- pageserver/src/bin/test_helper_slow_client_reads.rs | 2 +- pageserver/src/tenant.rs | 4 ++-- rust-toolchain.toml | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index f97f04968e..b70ced7886 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -179,7 +179,7 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/ && mv s5cmd /usr/local/bin/s5cmd # LLVM -ENV LLVM_VERSION=19 +ENV LLVM_VERSION=20 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ && echo "deb http://apt.llvm.org/${DEBIAN_VERSION}/ llvm-toolchain-${DEBIAN_VERSION}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ && apt update \ @@ -292,7 +292,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.87.0 +ENV RUSTC_VERSION=1.88.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" ARG RUSTFILT_VERSION=0.2.1 diff --git a/pageserver/src/bin/test_helper_slow_client_reads.rs b/pageserver/src/bin/test_helper_slow_client_reads.rs index be8e081945..8b641abee7 100644 --- a/pageserver/src/bin/test_helper_slow_client_reads.rs +++ b/pageserver/src/bin/test_helper_slow_client_reads.rs @@ -37,7 +37,7 @@ async fn main() -> anyhow::Result<()> { not_modified_since: Lsn(23), }, batch_key: 42, - message: format!("message {}", msg), + message: format!("message {msg}"), })); let Ok(res) = tokio::time::timeout(Duration::from_secs(10), fut).await else { eprintln!("pipe seems full"); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 2613528143..2e9dbdc539 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ 
-11429,11 +11429,11 @@ mod tests { if left != right { eprintln!("---LEFT---"); for left in left.iter() { - eprintln!("{}", left); + eprintln!("{left}"); } eprintln!("---RIGHT---"); for right in right.iter() { - eprintln!("{}", right); + eprintln!("{right}"); } assert_eq!(left, right); } diff --git a/rust-toolchain.toml b/rust-toolchain.toml index c48def3483..d20b46e755 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.87.0" +channel = "1.88.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html From 5a82182c48ffc7d7c50ac44e33d8d0e7353dc65a Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 27 Jun 2025 17:49:52 +0300 Subject: [PATCH 36/50] impr(ci): Refactor postgres Makefile targets to a separate makefile (#12363) Mainly for general readability. Some notable changes: - Postgres can be built without the rest of the repository, and in particular without any of the Rust bits. Some CI scripts took advantage of that, so let's make that more explicit by separating those parts. Also add an explicit comment about that in the new postgres.mk file. - Add a new PG_INSTALL_CACHED variable. If it's set, `make all` and other top-Makefile targets skip checking if Postgres is up-to-date. This is also to be used in CI scripts that build and cache Postgres as separate steps. (It is currently only used in the macos walproposer-lib rule, but stay tuned for more.) - Introduce a POSTGRES_VERSIONS variable that lists all supported PostgreSQL versions. Refactor a few Makefile rules to use that. 
--- .dockerignore | 1 + .github/workflows/build-macos.yml | 8 +- Dockerfile | 1 + Makefile | 133 +++++++----------------------- postgres.mk | 121 +++++++++++++++++++++++++++ 5 files changed, 158 insertions(+), 106 deletions(-) create mode 100644 postgres.mk diff --git a/.dockerignore b/.dockerignore index 4bf1492ea3..4d9433764e 100644 --- a/.dockerignore +++ b/.dockerignore @@ -4,6 +4,7 @@ !Cargo.lock !Cargo.toml !Makefile +!postgres.mk !rust-toolchain.toml !scripts/ninstall.sh !docker-compose/run-tests.sh diff --git a/.github/workflows/build-macos.yml b/.github/workflows/build-macos.yml index 7fd2626332..7b2c9c2ce3 100644 --- a/.github/workflows/build-macos.yml +++ b/.github/workflows/build-macos.yml @@ -135,6 +135,12 @@ jobs: name: pg_install--v17 path: pg_install/v17 + # `actions/download-artifact` doesn't preserve permissions: + # https://github.com/actions/download-artifact?tab=readme-ov-file#permission-loss + - name: Make pg_install/v*/bin/* executable + run: | + chmod +x pg_install/v*/bin/* + - name: Cache walproposer-lib id: cache_walproposer_lib uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 @@ -162,7 +168,7 @@ jobs: - name: Build walproposer-lib (only for v17) if: steps.cache_walproposer_lib.outputs.cache-hit != 'true' run: - make walproposer-lib -j$(sysctl -n hw.ncpu) + make walproposer-lib -j$(sysctl -n hw.ncpu) PG_INSTALL_CACHED=1 - name: Upload "build/walproposer-lib" artifact uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 diff --git a/Dockerfile b/Dockerfile index 69657067de..d518370ab8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -40,6 +40,7 @@ COPY --chown=nonroot vendor/postgres-v16 vendor/postgres-v16 COPY --chown=nonroot vendor/postgres-v17 vendor/postgres-v17 COPY --chown=nonroot pgxn pgxn COPY --chown=nonroot Makefile Makefile +COPY --chown=nonroot postgres.mk postgres.mk COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh ENV BUILD_TYPE=release diff --git a/Makefile b/Makefile 
index a43411518d..7f8f436a2e 100644 --- a/Makefile +++ b/Makefile @@ -4,11 +4,14 @@ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) # managers. POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/ +# Supported PostgreSQL versions +POSTGRES_VERSIONS = v17 v16 v15 v14 + # CARGO_BUILD_FLAGS: Extra flags to pass to `cargo build`. `--locked` # and `--features testing` are popular examples. # -# CARGO_PROFILE: You can also set to override the cargo profile to -# use. By default, it is derived from BUILD_TYPE. +# CARGO_PROFILE: Set to override the cargo profile to use. By default, +# it is derived from BUILD_TYPE. # All intermediate build artifacts are stored here. BUILD_DIR := build @@ -95,95 +98,24 @@ CACHEDIR_TAG_CONTENTS := "Signature: 8a477f597d28d172789f06886806bc55" # Top level Makefile to build Neon and PostgreSQL # .PHONY: all -all: neon postgres neon-pg-ext +all: neon postgres-install neon-pg-ext ### Neon Rust bits # # The 'postgres_ffi' depends on the Postgres headers. .PHONY: neon -neon: postgres-headers walproposer-lib cargo-target-dir +neon: postgres-headers-install walproposer-lib cargo-target-dir +@echo "Compiling Neon" $(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS) $(CARGO_PROFILE) + .PHONY: cargo-target-dir cargo-target-dir: # https://github.com/rust-lang/cargo/issues/14281 mkdir -p target test -e target/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > target/CACHEDIR.TAG -### PostgreSQL parts -# Some rules are duplicated for Postgres v14 and 15. We may want to refactor -# to avoid the duplication in the future, but it's tolerable for now. 
-# -$(BUILD_DIR)/%/config.status: - mkdir -p $(BUILD_DIR) - test -e $(BUILD_DIR)/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > $(BUILD_DIR)/CACHEDIR.TAG - - +@echo "Configuring Postgres $* build" - @test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \ - echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \ - echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \ - exit 1; } - mkdir -p $(BUILD_DIR)/$* - - VERSION=$*; \ - EXTRA_VERSION=$$(cd $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION && git rev-parse HEAD); \ - (cd $(BUILD_DIR)/$$VERSION && \ - env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION/configure \ - CFLAGS='$(PG_CFLAGS)' LDFLAGS='$(PG_LDFLAGS)' \ - $(PG_CONFIGURE_OPTS) --with-extra-version=" ($$EXTRA_VERSION)" \ - --prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$$VERSION > configure.log) - -# nicer alias to run 'configure' -# Note: I've been unable to use templates for this part of our configuration. -# I'm not sure why it wouldn't work, but this is the only place (apart from -# the "build-all-versions" entry points) where direct mention of PostgreSQL -# versions is used. -.PHONY: postgres-configure-v17 -postgres-configure-v17: $(BUILD_DIR)/v17/config.status -.PHONY: postgres-configure-v16 -postgres-configure-v16: $(BUILD_DIR)/v16/config.status -.PHONY: postgres-configure-v15 -postgres-configure-v15: $(BUILD_DIR)/v15/config.status -.PHONY: postgres-configure-v14 -postgres-configure-v14: $(BUILD_DIR)/v14/config.status - -# Install just the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)//include -# -# This is implicitly included in the 'postgres-%' rule, but this can be handy if you -# want to just install the headers without building PostgreSQL, e.g. for building -# extensions. 
-.PHONY: postgres-headers-% -postgres-headers-%: postgres-configure-% - +@echo "Installing PostgreSQL $* headers" - $(MAKE) -C $(BUILD_DIR)/$*/src/include MAKELEVEL=0 install - -# Compile and install PostgreSQL -.PHONY: postgres-% -postgres-%: postgres-configure-% \ - postgres-headers-% # to prevent `make install` conflicts with neon's `postgres-headers` - +@echo "Compiling PostgreSQL $*" - $(MAKE) -C $(BUILD_DIR)/$* MAKELEVEL=0 install - +@echo "Compiling pg_prewarm $*" - $(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_prewarm install - +@echo "Compiling pg_buffercache $*" - $(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_buffercache install - +@echo "Compiling pg_visibility $*" - $(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_visibility install - +@echo "Compiling pageinspect $*" - $(MAKE) -C $(BUILD_DIR)/$*/contrib/pageinspect install - +@echo "Compiling pg_trgm $*" - $(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_trgm install - +@echo "Compiling amcheck $*" - $(MAKE) -C $(BUILD_DIR)/$*/contrib/amcheck install - +@echo "Compiling test_decoding $*" - $(MAKE) -C $(BUILD_DIR)/$*/contrib/test_decoding install - -.PHONY: postgres-check-% -postgres-check-%: postgres-% - $(MAKE) -C $(BUILD_DIR)/$* MAKELEVEL=0 check - .PHONY: neon-pg-ext-% -neon-pg-ext-%: postgres-% +neon-pg-ext-%: postgres-install-% +@echo "Compiling neon-specific Postgres extensions for $*" mkdir -p $(BUILD_DIR)/pgxn-$* $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \ @@ -222,39 +154,14 @@ ifeq ($(UNAME_S),Linux) pg_crc32c.o endif +# Shorthand to call neon-pg-ext-% target for all Postgres versions .PHONY: neon-pg-ext -neon-pg-ext: \ - neon-pg-ext-v14 \ - neon-pg-ext-v15 \ - neon-pg-ext-v16 \ - neon-pg-ext-v17 - -# shorthand to build all Postgres versions -.PHONY: postgres -postgres: \ - postgres-v14 \ - postgres-v15 \ - postgres-v16 \ - postgres-v17 - -.PHONY: postgres-headers -postgres-headers: \ - postgres-headers-v14 \ - postgres-headers-v15 \ - postgres-headers-v16 \ - postgres-headers-v17 - -.PHONY: 
postgres-check -postgres-check: \ - postgres-check-v14 \ - postgres-check-v15 \ - postgres-check-v16 \ - postgres-check-v17 +neon-pg-ext: $(foreach pg_version,$(POSTGRES_VERSIONS),neon-pg-ext-$(pg_version)) # This removes everything .PHONY: distclean distclean: - $(RM) -r $(POSTGRES_INSTALL_DIR) + $(RM) -r $(POSTGRES_INSTALL_DIR) $(BUILD_DIR) $(CARGO_CMD_PREFIX) cargo clean .PHONY: fmt @@ -302,3 +209,19 @@ neon-pgindent: postgres-v17-pg-bsd-indent neon-pg-ext-v17 .PHONY: setup-pre-commit-hook setup-pre-commit-hook: ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit + +# Targets for building PostgreSQL are defined in postgres.mk. +# +# But if the caller has indicated that PostgreSQL is already +# installed, by setting the PG_INSTALL_CACHED variable, skip it. +ifdef PG_INSTALL_CACHED +postgres-install: skip-install +$(foreach pg_version,$(POSTGRES_VERSIONS),postgres-install-$(pg_version)): skip-install +postgres-headers-install: + +@echo "Skipping installation of PostgreSQL headers because PG_INSTALL_CACHED is set" +skip-install: + +@echo "Skipping PostgreSQL installation because PG_INSTALL_CACHED is set" + +else +include postgres.mk +endif diff --git a/postgres.mk b/postgres.mk new file mode 100644 index 0000000000..eff882d1ff --- /dev/null +++ b/postgres.mk @@ -0,0 +1,121 @@ +# Sub-makefile for compiling PostgreSQL as part of Neon. This is +# included from the main Makefile, and is not meant to be called +# directly. +# +# CI workflows and Dockerfiles can take advantage of the following +# properties for caching: +# +# - Compiling the targets in this file only builds the PostgreSQL sources +# under the vendor/ subdirectory, nothing else from the repository. 
+# - All outputs go to POSTGRES_INSTALL_DIR (by default 'pg_install', +# see parent Makefile) +# - intermediate build artifacts go to BUILD_DIR +# +# +# Variables passed from the parent Makefile that control what gets +# installed and where: +# - POSTGRES_VERSIONS +# - POSTGRES_INSTALL_DIR +# - BUILD_DIR +# +# Variables passed from the parent Makefile that affect the build +# process and the resulting binaries: +# - PG_CONFIGURE_OPTS +# - PG_CFLAGS +# - PG_LDFLAGS +# - EXTRA_PATH_OVERRIDES + +### +### Main targets +### +### These are called from the main Makefile, and can also be called +### directly from command line + +# Compile and install a specific PostgreSQL version +postgres-install-%: postgres-configure-% \ + postgres-headers-install-% # to prevent `make install` conflicts with neon's `postgres-headers` + +# Install the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)//include +# +# This is implicitly part of the 'postgres-install-%' target, but this can be handy +# if you want to install just the headers without building PostgreSQL, e.g. for building +# extensions. 
+postgres-headers-install-%: postgres-configure-% + +@echo "Installing PostgreSQL $* headers" + $(MAKE) -C $(BUILD_DIR)/$*/src/include MAKELEVEL=0 install + +# Run Postgres regression tests +postgres-check-%: postgres-install-% + $(MAKE) -C $(BUILD_DIR)/$* MAKELEVEL=0 check + +### +### Shorthands for the main targets, for convenience +### + +# Same as the above main targets, but for all supported PostgreSQL versions +# For example, 'make postgres-install' is equivalent to +# 'make postgres-install-v14 postgres-install-v15 postgres-install-v16 postgres-install-v17' +all_version_targets=postgres-install postgres-headers-install postgres-check +.PHONY: $(all_version_targets) +$(all_version_targets): postgres-%: $(foreach pg_version,$(POSTGRES_VERSIONS),postgres-%-$(pg_version)) + +.PHONY: postgres +postgres: postgres-install + +.PHONY: postgres-headers +postgres-headers: postgres-headers-install + +# 'postgres-v17' is an alias for 'postgres-install-v17' etc. +$(foreach pg_version,$(POSTGRES_VERSIONS),postgres-$(pg_version)): postgres-%: postgres-install-% + +### +### Intermediate targets +### +### These are not intended to be called directly, but are dependencies for the +### main targets. + +# Run 'configure' +$(BUILD_DIR)/%/config.status: + mkdir -p $(BUILD_DIR) + test -e $(BUILD_DIR)/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > $(BUILD_DIR)/CACHEDIR.TAG + + +@echo "Configuring Postgres $* build" + @test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \ + echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \ + echo "'git submodule update --init --recursive --depth 2 --progress .' 
in project root.\n"; \ + exit 1; } + mkdir -p $(BUILD_DIR)/$* + + VERSION=$*; \ + EXTRA_VERSION=$$(cd $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION && git rev-parse HEAD); \ + (cd $(BUILD_DIR)/$$VERSION && \ + env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION/configure \ + CFLAGS='$(PG_CFLAGS)' LDFLAGS='$(PG_LDFLAGS)' \ + $(PG_CONFIGURE_OPTS) --with-extra-version=" ($$EXTRA_VERSION)" \ + --prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$$VERSION > configure.log) + +# nicer alias to run 'configure'. +# +# This tries to accomplish this rule: +# +# postgres-configure-%: $(BUILD_DIR)/%/config.status +# +# XXX: I'm not sure why the above rule doesn't work directly. But this accomplishes +# the same thing +$(foreach pg_version,$(POSTGRES_VERSIONS),postgres-configure-$(pg_version)): postgres-configure-%: FORCE $(BUILD_DIR)/%/config.status + +# Compile and install PostgreSQL (and a few contrib modules used in tests) +postgres-install-%: postgres-configure-% \ + postgres-headers-install-% # to prevent `make install` conflicts with neon's `postgres-headers-install` + +@echo "Compiling PostgreSQL $*" + $(MAKE) -C $(BUILD_DIR)/$* MAKELEVEL=0 install + $(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_prewarm install + $(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_buffercache install + $(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_visibility install + $(MAKE) -C $(BUILD_DIR)/$*/contrib/pageinspect install + $(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_trgm install + $(MAKE) -C $(BUILD_DIR)/$*/contrib/amcheck install + $(MAKE) -C $(BUILD_DIR)/$*/contrib/test_decoding install + +.PHONY: FORCE +FORCE: From 4c7956fa56e8b39e56a83342c72c774481dba295 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 27 Jun 2025 17:14:55 +0200 Subject: [PATCH 37/50] Fix hang deleting offloaded timelines (#12366) We don't have cancellation support for timeline deletions. 
In other words, timeline deletion might still go on in an older generation while we are attaching it in a newer generation already, because the cancellation simply hasn't reached the deletion code. This has caused us to hit a situation with offloaded timelines in which the timeline was in an unrecoverable state: always returning an accepted response, but never a 404 like it should be. The detailed description can be found in [here](https://github.com/neondatabase/cloud/issues/30406#issuecomment-3008667859) (private repo link). TLDR: 1. we ask to delete timeline on old pageserver/generation, starts process in background 2. the storcon migrates the tenant to a different pageserver. - during attach, the pageserver still finds an index part, so it adds it to `offloaded_timelines` 4. the timeline deletion finishes, removing the index part in S3 5. there is a retry of the timeline deletion endpoint, sent to the new pageserver location. it is bound to fail however: - as the index part is gone, we print `Timeline already deleted in remote storage`. - the problem is that we then return an accepted response code, and not a 404. - this confuses the code calling us. it thinks the timeline is not deleted, so keeps retrying. - this state never gets recovered from until a reset/detach, because of the `offloaded_timelines` entry staying there. This is where this PR fixes things: if no index part can be found, we can safely assume that the timeline is gone in S3 (it's the last thing to be deleted), so we can remove it from `offloaded_timelines` and trigger a reupload of the manifest. Subsequent retries will pick that up. Why not improve the cancellation support? It is a more disruptive code change, that might have its own risks. So we don't do it for now. 
Fixes https://github.com/neondatabase/cloud/issues/30406 --- pageserver/src/tenant/timeline/delete.rs | 11 +- test_runner/regress/test_timeline_archive.py | 128 +++++++++++++++++++ 2 files changed, 138 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 51bdd59f4f..f7dc44be90 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -241,8 +241,17 @@ impl DeleteTimelineFlow { { Ok(r) => r, Err(DownloadError::NotFound) => { - // Deletion is already complete + // Deletion is already complete. + // As we came here, we will need to remove the timeline from the tenant though. tracing::info!("Timeline already deleted in remote storage"); + if let TimelineOrOffloaded::Offloaded(_) = &timeline { + // We only support this for offloaded timelines, as we don't know which state non-offloaded timelines are in. + tracing::info!( + "Timeline with gone index part is offloaded timeline. Removing from tenant." 
+ ); + remove_maybe_offloaded_timeline_from_tenant(tenant, &timeline, &guard) + .await?; + } return Ok(()); } Err(e) => { diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 8d46ef8306..41286a2adc 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -896,6 +896,134 @@ def test_timeline_retain_lsn( assert sum == pre_branch_sum +def test_timeline_offload_delete_race(neon_env_builder: NeonEnvBuilder): + """ + Regression test for https://github.com/neondatabase/cloud/issues/30406 + """ + remote_storage_kind = s3_storage() + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + neon_env_builder.num_pageservers = 2 + + env = neon_env_builder.init_start() + + # Turn off gc and compaction loops: we want to issue them manually for better reliability + tenant_id, root_timeline_id = env.create_tenant( + conf={ + "gc_period": "0s", + "compaction_period": "0s", + "checkpoint_distance": f"{1024**2}", + } + ) + + origin_ps = env.get_tenant_pageserver(tenant_id) + assert origin_ps + origin_ps.allowed_errors.extend( + [ + ".*Timed out waiting for deletion queue flush.*", + ".*Timed out waiting for flush to remote storage.*", + ] + ) + origin_ps_http = origin_ps.http_client() + + # We are not sharding this tenant + tenant_shard_id = TenantShardId(tenant_id, 0, 0) + + # Create a branch and archive it + child_timeline_id = env.create_branch("test_archived_branch_persisted", tenant_id) + + with env.endpoints.create_start( + "test_archived_branch_persisted", tenant_id=tenant_id + ) as endpoint: + endpoint.safe_psql_many( + [ + "CREATE TABLE foo(key serial primary key, t text default 'data_content')", + "INSERT INTO foo SELECT FROM generate_series(1,512)", + ] + ) + last_flush_lsn_upload(env, endpoint, tenant_id, child_timeline_id) + + assert_prefix_not_empty( + neon_env_builder.pageserver_remote_storage, + prefix=f"tenants/{str(tenant_id)}/", + ) + 
assert_prefix_not_empty( + neon_env_builder.pageserver_remote_storage, + prefix=f"tenants/{str(tenant_id)}/tenant-manifest", + ) + + origin_ps_http.timeline_archival_config( + tenant_id, + child_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + + def timeline_offloaded_api(timeline_id: TimelineId) -> bool: + return any( + timeline["timeline_id"] == str(timeline_id) + for timeline in origin_ps_http.timeline_and_offloaded_list( + tenant_id=tenant_id + ).offloaded + ) + + def child_offloaded(): + origin_ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=child_timeline_id) + assert timeline_offloaded_api(child_timeline_id) + + wait_until(child_offloaded) + + # Delete the timeline from the origin pageserver, holding up the deletion queue so that it doesn't finish + failpoint_deletion_queue = "deletion-queue-before-execute-pause" + origin_ps_http.configure_failpoints((failpoint_deletion_queue, "pause")) + origin_ps_http.timeline_delete(tenant_id, child_timeline_id) + + dest_ps = [ps for ps in env.pageservers if ps.id != origin_ps.id][0] + assert dest_ps + log.info(f"Migrating {tenant_id} {origin_ps.id}->{dest_ps.id}") + env.storage_controller.tenant_shard_migrate(tenant_shard_id, dest_ps_id=dest_ps.id) + + log.info("unstuck the DELETE") + origin_ps_http.configure_failpoints((failpoint_deletion_queue, "off")) + + def child_prefix_empty(): + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix=f"tenants/{str(tenant_id)}/{str(child_timeline_id)}/", + ) + + wait_until(child_prefix_empty) + + dest_ps_http = dest_ps.http_client() + + # We can't use timeline_delete_wait_completed here as timeline status will return 404, but we want to return 404 from the deletion endpoint + def timeline_is_missing(): + data = None + try: + data = dest_ps_http.timeline_delete(tenant_id, child_timeline_id) + log.info(f"timeline delete {data}") + except PageserverApiException as e: + log.debug(e) + if e.status_code == 404: + return + + raise 
RuntimeError(f"Timeline exists {data}") + + wait_until(timeline_is_missing) + # (dest_ps_http, tenant_id, child_timeline_id) + + # + # Now ensure that scrubber doesn't have anything to clean up. + # + + # Sleep some amount larger than min_age_secs + time.sleep(3) + + # Ensure that min_age_secs has a deletion impeding effect + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + assert gc_summary["tenant_manifests_deleted"] == 0 + + def test_timeline_offload_generations(neon_env_builder: NeonEnvBuilder): """ Test for scrubber deleting old generations of manifests From 0ee15002fc3ef3927b3c2c7acf33f5c410dbad3d Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Fri, 27 Jun 2025 17:20:23 +0200 Subject: [PATCH 38/50] proxy: Move client connection accept and handshake to pglb (#12380) * This must be a no-op. * Move proxy::task_main to pglb::task_main. * Move client accept, TLS and handshake to pglb. * Keep auth and wake in proxy. 
--- proxy/src/binary/pg_sni_router.rs | 3 +- proxy/src/binary/proxy.rs | 2 +- proxy/src/console_redirect_proxy.rs | 5 +- proxy/src/pglb/handshake.rs | 2 +- proxy/src/pglb/mod.rs | 329 ++++++++++++++++++++++++++ proxy/src/proxy/mod.rs | 346 +++------------------------- proxy/src/proxy/tests/mitm.rs | 3 + proxy/src/proxy/tests/mod.rs | 20 +- proxy/src/serverless/websocket.rs | 5 +- 9 files changed, 388 insertions(+), 327 deletions(-) diff --git a/proxy/src/binary/pg_sni_router.rs b/proxy/src/binary/pg_sni_router.rs index 070c73cdcf..b877aaddef 100644 --- a/proxy/src/binary/pg_sni_router.rs +++ b/proxy/src/binary/pg_sni_router.rs @@ -26,9 +26,10 @@ use utils::sentry_init::init_sentry; use crate::context::RequestContext; use crate::metrics::{Metrics, ThreadPoolMetrics}; +use crate::pglb::TlsRequired; use crate::pqproto::FeStartupPacket; use crate::protocol2::ConnectionInfo; -use crate::proxy::{ErrorSource, TlsRequired, copy_bidirectional_client_compute}; +use crate::proxy::{ErrorSource, copy_bidirectional_client_compute}; use crate::stream::{PqStream, Stream}; use crate::util::run_until_cancelled; diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index 9ead05d492..2133f33a4d 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -392,7 +392,7 @@ pub async fn run() -> anyhow::Result<()> { match auth_backend { Either::Left(auth_backend) => { if let Some(proxy_listener) = proxy_listener { - client_tasks.spawn(crate::proxy::task_main( + client_tasks.spawn(crate::pglb::task_main( config, auth_backend, proxy_listener, diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index d5903286a0..041a56e032 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -11,11 +11,12 @@ use crate::config::{ProxyConfig, ProxyProtocolV2}; use crate::context::RequestContext; use crate::error::ReportableError; use crate::metrics::{Metrics, NumClientConnectionsGuard}; +use 
crate::pglb::ClientRequestError; use crate::pglb::handshake::{HandshakeData, handshake}; use crate::pglb::passthrough::ProxyPassthrough; use crate::protocol2::{ConnectHeader, ConnectionInfo, read_proxy_protocol}; use crate::proxy::connect_compute::{TcpMechanism, connect_to_compute}; -use crate::proxy::{ClientRequestError, ErrorSource, prepare_client_connection}; +use crate::proxy::{ErrorSource, finish_client_init}; use crate::util::run_until_cancelled; pub async fn task_main( @@ -232,7 +233,7 @@ pub(crate) async fn handle_client( let session = cancellation_handler.get_key(); - prepare_client_connection(&pg_settings, *session.key(), &mut stream); + finish_client_init(&pg_settings, *session.key(), &mut stream); let stream = stream.flush_and_into_inner().await?; let session_id = ctx.session_id(); diff --git a/proxy/src/pglb/handshake.rs b/proxy/src/pglb/handshake.rs index 6970ab8714..25a2d01b4a 100644 --- a/proxy/src/pglb/handshake.rs +++ b/proxy/src/pglb/handshake.rs @@ -8,10 +8,10 @@ use crate::config::TlsConfig; use crate::context::RequestContext; use crate::error::ReportableError; use crate::metrics::Metrics; +use crate::pglb::TlsRequired; use crate::pqproto::{ BeMessage, CancelKeyData, FeStartupPacket, ProtocolVersion, StartupMessageParams, }; -use crate::proxy::TlsRequired; use crate::stream::{PqStream, Stream, StreamUpgradeError}; use crate::tls::PG_ALPN_PROTOCOL; diff --git a/proxy/src/pglb/mod.rs b/proxy/src/pglb/mod.rs index cb82524cf6..c4cab155c5 100644 --- a/proxy/src/pglb/mod.rs +++ b/proxy/src/pglb/mod.rs @@ -2,3 +2,332 @@ pub mod copy_bidirectional; pub mod handshake; pub mod inprocess; pub mod passthrough; + +use std::sync::Arc; + +use futures::FutureExt; +use smol_str::ToSmolStr; +use thiserror::Error; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio_util::sync::CancellationToken; +use tracing::{Instrument, debug, error, info, warn}; + +use crate::auth; +use crate::cancellation::{self, CancellationHandler}; +use crate::config::{ProxyConfig, 
ProxyProtocolV2, TlsConfig}; +use crate::context::RequestContext; +use crate::error::{ReportableError, UserFacingError}; +use crate::metrics::{Metrics, NumClientConnectionsGuard}; +pub use crate::pglb::copy_bidirectional::ErrorSource; +use crate::pglb::handshake::{HandshakeData, HandshakeError, handshake}; +use crate::pglb::passthrough::ProxyPassthrough; +use crate::protocol2::{ConnectHeader, ConnectionInfo, ConnectionInfoExtra, read_proxy_protocol}; +use crate::proxy::handle_client; +use crate::rate_limiter::EndpointRateLimiter; +use crate::stream::Stream; +use crate::util::run_until_cancelled; + +pub const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; + +#[derive(Error, Debug)] +#[error("{ERR_INSECURE_CONNECTION}")] +pub struct TlsRequired; + +impl ReportableError for TlsRequired { + fn get_error_kind(&self) -> crate::error::ErrorKind { + crate::error::ErrorKind::User + } +} + +impl UserFacingError for TlsRequired {} + +pub async fn task_main( + config: &'static ProxyConfig, + auth_backend: &'static auth::Backend<'static, ()>, + listener: tokio::net::TcpListener, + cancellation_token: CancellationToken, + cancellation_handler: Arc, + endpoint_rate_limiter: Arc, +) -> anyhow::Result<()> { + scopeguard::defer! { + info!("proxy has shut down"); + } + + // When set for the server socket, the keepalive setting + // will be inherited by all accepted client sockets. 
+ socket2::SockRef::from(&listener).set_keepalive(true)?; + + let connections = tokio_util::task::task_tracker::TaskTracker::new(); + let cancellations = tokio_util::task::task_tracker::TaskTracker::new(); + + while let Some(accept_result) = + run_until_cancelled(listener.accept(), &cancellation_token).await + { + let (socket, peer_addr) = accept_result?; + + let conn_gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Tcp); + + let session_id = uuid::Uuid::new_v4(); + let cancellation_handler = Arc::clone(&cancellation_handler); + let cancellations = cancellations.clone(); + + debug!(protocol = "tcp", %session_id, "accepted new TCP connection"); + let endpoint_rate_limiter2 = endpoint_rate_limiter.clone(); + + connections.spawn(async move { + let (socket, conn_info) = match config.proxy_protocol_v2 { + ProxyProtocolV2::Required => { + match read_proxy_protocol(socket).await { + Err(e) => { + warn!("per-client task finished with an error: {e:#}"); + return; + } + // our load balancers will not send any more data. let's just exit immediately + Ok((_socket, ConnectHeader::Local)) => { + debug!("healthcheck received"); + return; + } + Ok((socket, ConnectHeader::Proxy(info))) => (socket, info), + } + } + // ignore the header - it cannot be confused for a postgres or http connection so will + // error later. 
+ ProxyProtocolV2::Rejected => ( + socket, + ConnectionInfo { + addr: peer_addr, + extra: None, + }, + ), + }; + + match socket.set_nodelay(true) { + Ok(()) => {} + Err(e) => { + error!( + "per-client task finished with an error: failed to set socket option: {e:#}" + ); + return; + } + } + + let ctx = RequestContext::new(session_id, conn_info, crate::metrics::Protocol::Tcp); + + let res = handle_connection( + config, + auth_backend, + &ctx, + cancellation_handler, + socket, + ClientMode::Tcp, + endpoint_rate_limiter2, + conn_gauge, + cancellations, + ) + .instrument(ctx.span()) + .boxed() + .await; + + match res { + Err(e) => { + ctx.set_error_kind(e.get_error_kind()); + warn!(parent: &ctx.span(), "per-client task finished with an error: {e:#}"); + } + Ok(None) => { + ctx.set_success(); + } + Ok(Some(p)) => { + ctx.set_success(); + let _disconnect = ctx.log_connect(); + match p.proxy_pass().await { + Ok(()) => {} + Err(ErrorSource::Client(e)) => { + warn!( + ?session_id, + "per-client task finished with an IO error from the client: {e:#}" + ); + } + Err(ErrorSource::Compute(e)) => { + error!( + ?session_id, + "per-client task finished with an IO error from the compute: {e:#}" + ); + } + } + } + } + }); + } + + connections.close(); + cancellations.close(); + drop(listener); + + // Drain connections + connections.wait().await; + cancellations.wait().await; + + Ok(()) +} + +pub(crate) enum ClientMode { + Tcp, + Websockets { hostname: Option }, +} + +/// Abstracts the logic of handling TCP vs WS clients +impl ClientMode { + pub fn allow_cleartext(&self) -> bool { + match self { + ClientMode::Tcp => false, + ClientMode::Websockets { .. 
} => true, + } + } + + pub fn hostname<'a, S>(&'a self, s: &'a Stream) -> Option<&'a str> { + match self { + ClientMode::Tcp => s.sni_hostname(), + ClientMode::Websockets { hostname } => hostname.as_deref(), + } + } + + pub fn handshake_tls<'a>(&self, tls: Option<&'a TlsConfig>) -> Option<&'a TlsConfig> { + match self { + ClientMode::Tcp => tls, + // TLS is None here if using websockets, because the connection is already encrypted. + ClientMode::Websockets { .. } => None, + } + } +} + +#[derive(Debug, Error)] +// almost all errors should be reported to the user, but there's a few cases where we cannot +// 1. Cancellation: we are not allowed to tell the client any cancellation statuses for security reasons +// 2. Handshake: handshake reports errors if it can, otherwise if the handshake fails due to protocol violation, +// we cannot be sure the client even understands our error message +// 3. PrepareClient: The client disconnected, so we can't tell them anyway... +pub(crate) enum ClientRequestError { + #[error("{0}")] + Cancellation(#[from] cancellation::CancelError), + #[error("{0}")] + Handshake(#[from] HandshakeError), + #[error("{0}")] + HandshakeTimeout(#[from] tokio::time::error::Elapsed), + #[error("{0}")] + PrepareClient(#[from] std::io::Error), + #[error("{0}")] + ReportedError(#[from] crate::stream::ReportedError), +} + +impl ReportableError for ClientRequestError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + ClientRequestError::Cancellation(e) => e.get_error_kind(), + ClientRequestError::Handshake(e) => e.get_error_kind(), + ClientRequestError::HandshakeTimeout(_) => crate::error::ErrorKind::RateLimit, + ClientRequestError::ReportedError(e) => e.get_error_kind(), + ClientRequestError::PrepareClient(_) => crate::error::ErrorKind::ClientDisconnect, + } + } +} + +#[allow(clippy::too_many_arguments)] +pub(crate) async fn handle_connection( + config: &'static ProxyConfig, + auth_backend: &'static auth::Backend<'static, ()>, + ctx: 
&RequestContext, + cancellation_handler: Arc, + client: S, + mode: ClientMode, + endpoint_rate_limiter: Arc, + conn_gauge: NumClientConnectionsGuard<'static>, + cancellations: tokio_util::task::task_tracker::TaskTracker, +) -> Result>, ClientRequestError> { + debug!( + protocol = %ctx.protocol(), + "handling interactive connection from client" + ); + + let metrics = &Metrics::get().proxy; + let proto = ctx.protocol(); + let request_gauge = metrics.connection_requests.guard(proto); + + let tls = config.tls_config.load(); + let tls = tls.as_deref(); + + let record_handshake_error = !ctx.has_private_peer_addr(); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client); + let do_handshake = handshake(ctx, client, mode.handshake_tls(tls), record_handshake_error); + + let (mut client, params) = match tokio::time::timeout(config.handshake_timeout, do_handshake) + .await?? + { + HandshakeData::Startup(client, params) => (client, params), + HandshakeData::Cancel(cancel_key_data) => { + // spawn a task to cancel the session, but don't wait for it + cancellations.spawn({ + let cancellation_handler_clone = Arc::clone(&cancellation_handler); + let ctx = ctx.clone(); + let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?ctx.session_id()); + cancel_span.follows_from(tracing::Span::current()); + async move { + cancellation_handler_clone + .cancel_session( + cancel_key_data, + ctx, + config.authentication_config.ip_allowlist_check_enabled, + config.authentication_config.is_vpc_acccess_proxy, + auth_backend.get_api(), + ) + .await + .inspect_err(|e | debug!(error = ?e, "cancel_session failed")).ok(); + }.instrument(cancel_span) + }); + + return Ok(None); + } + }; + drop(pause); + + ctx.set_db_options(params.clone()); + + let common_names = tls.map(|tls| &tls.common_names); + + let (node, cancel_on_shutdown) = handle_client( + config, + auth_backend, + ctx, + cancellation_handler, + &mut client, + &mode, + 
endpoint_rate_limiter, + common_names, + ¶ms, + ) + .await?; + + let client = client.flush_and_into_inner().await?; + + let private_link_id = match ctx.extra() { + Some(ConnectionInfoExtra::Aws { vpce_id }) => Some(vpce_id.clone()), + Some(ConnectionInfoExtra::Azure { link_id }) => Some(link_id.to_smolstr()), + None => None, + }; + + Ok(Some(ProxyPassthrough { + client, + compute: node.stream, + + aux: node.aux, + private_link_id, + + _cancel_on_shutdown: cancel_on_shutdown, + + _req: request_gauge, + _conn: conn_gauge, + _db_conn: node.guage, + })) +} diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index d9c0585efb..08c81afa04 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -5,326 +5,64 @@ pub(crate) mod connect_compute; pub(crate) mod retry; pub(crate) mod wake_compute; +use std::collections::HashSet; +use std::convert::Infallible; use std::sync::Arc; -use futures::FutureExt; use itertools::Itertools; use once_cell::sync::OnceCell; use regex::Regex; use serde::{Deserialize, Serialize}; -use smol_str::{SmolStr, ToSmolStr, format_smolstr}; -use thiserror::Error; +use smol_str::{SmolStr, format_smolstr}; use tokio::io::{AsyncRead, AsyncWrite}; -use tokio_util::sync::CancellationToken; -use tracing::{Instrument, debug, error, info, warn}; +use tokio::sync::oneshot; +use tracing::Instrument; use crate::cache::Cache; -use crate::cancellation::{self, CancellationHandler}; -use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig}; +use crate::cancellation::CancellationHandler; +use crate::compute::ComputeConnection; +use crate::config::ProxyConfig; use crate::context::RequestContext; use crate::control_plane::client::ControlPlaneClient; -use crate::error::{ReportableError, UserFacingError}; -use crate::metrics::{Metrics, NumClientConnectionsGuard}; pub use crate::pglb::copy_bidirectional::{ErrorSource, copy_bidirectional_client_compute}; -use crate::pglb::handshake::{HandshakeData, HandshakeError, handshake}; -use 
crate::pglb::passthrough::ProxyPassthrough; +use crate::pglb::{ClientMode, ClientRequestError}; use crate::pqproto::{BeMessage, CancelKeyData, StartupMessageParams}; -use crate::protocol2::{ConnectHeader, ConnectionInfo, ConnectionInfoExtra, read_proxy_protocol}; use crate::proxy::connect_compute::{TcpMechanism, connect_to_compute}; use crate::proxy::retry::ShouldRetryWakeCompute; use crate::rate_limiter::EndpointRateLimiter; use crate::stream::{PqStream, Stream}; use crate::types::EndpointCacheKey; -use crate::util::run_until_cancelled; use crate::{auth, compute}; -const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; - -#[derive(Error, Debug)] -#[error("{ERR_INSECURE_CONNECTION}")] -pub struct TlsRequired; - -impl ReportableError for TlsRequired { - fn get_error_kind(&self) -> crate::error::ErrorKind { - crate::error::ErrorKind::User - } -} - -impl UserFacingError for TlsRequired {} - -pub async fn task_main( - config: &'static ProxyConfig, - auth_backend: &'static auth::Backend<'static, ()>, - listener: tokio::net::TcpListener, - cancellation_token: CancellationToken, - cancellation_handler: Arc, - endpoint_rate_limiter: Arc, -) -> anyhow::Result<()> { - scopeguard::defer! { - info!("proxy has shut down"); - } - - // When set for the server socket, the keepalive setting - // will be inherited by all accepted client sockets. 
- socket2::SockRef::from(&listener).set_keepalive(true)?; - - let connections = tokio_util::task::task_tracker::TaskTracker::new(); - let cancellations = tokio_util::task::task_tracker::TaskTracker::new(); - - while let Some(accept_result) = - run_until_cancelled(listener.accept(), &cancellation_token).await - { - let (socket, peer_addr) = accept_result?; - - let conn_gauge = Metrics::get() - .proxy - .client_connections - .guard(crate::metrics::Protocol::Tcp); - - let session_id = uuid::Uuid::new_v4(); - let cancellation_handler = Arc::clone(&cancellation_handler); - let cancellations = cancellations.clone(); - - debug!(protocol = "tcp", %session_id, "accepted new TCP connection"); - let endpoint_rate_limiter2 = endpoint_rate_limiter.clone(); - - connections.spawn(async move { - let (socket, conn_info) = match config.proxy_protocol_v2 { - ProxyProtocolV2::Required => { - match read_proxy_protocol(socket).await { - Err(e) => { - warn!("per-client task finished with an error: {e:#}"); - return; - } - // our load balancers will not send any more data. let's just exit immediately - Ok((_socket, ConnectHeader::Local)) => { - debug!("healthcheck received"); - return; - } - Ok((socket, ConnectHeader::Proxy(info))) => (socket, info), - } - } - // ignore the header - it cannot be confused for a postgres or http connection so will - // error later. 
- ProxyProtocolV2::Rejected => ( - socket, - ConnectionInfo { - addr: peer_addr, - extra: None, - }, - ), - }; - - match socket.set_nodelay(true) { - Ok(()) => {} - Err(e) => { - error!( - "per-client task finished with an error: failed to set socket option: {e:#}" - ); - return; - } - } - - let ctx = RequestContext::new(session_id, conn_info, crate::metrics::Protocol::Tcp); - - let res = handle_client( - config, - auth_backend, - &ctx, - cancellation_handler, - socket, - ClientMode::Tcp, - endpoint_rate_limiter2, - conn_gauge, - cancellations, - ) - .instrument(ctx.span()) - .boxed() - .await; - - match res { - Err(e) => { - ctx.set_error_kind(e.get_error_kind()); - warn!(parent: &ctx.span(), "per-client task finished with an error: {e:#}"); - } - Ok(None) => { - ctx.set_success(); - } - Ok(Some(p)) => { - ctx.set_success(); - let _disconnect = ctx.log_connect(); - match p.proxy_pass().await { - Ok(()) => {} - Err(ErrorSource::Client(e)) => { - warn!( - ?session_id, - "per-client task finished with an IO error from the client: {e:#}" - ); - } - Err(ErrorSource::Compute(e)) => { - error!( - ?session_id, - "per-client task finished with an IO error from the compute: {e:#}" - ); - } - } - } - } - }); - } - - connections.close(); - cancellations.close(); - drop(listener); - - // Drain connections - connections.wait().await; - cancellations.wait().await; - - Ok(()) -} - -pub(crate) enum ClientMode { - Tcp, - Websockets { hostname: Option }, -} - -/// Abstracts the logic of handling TCP vs WS clients -impl ClientMode { - pub(crate) fn allow_cleartext(&self) -> bool { - match self { - ClientMode::Tcp => false, - ClientMode::Websockets { .. 
} => true, - } - } - - fn hostname<'a, S>(&'a self, s: &'a Stream) -> Option<&'a str> { - match self { - ClientMode::Tcp => s.sni_hostname(), - ClientMode::Websockets { hostname } => hostname.as_deref(), - } - } - - fn handshake_tls<'a>(&self, tls: Option<&'a TlsConfig>) -> Option<&'a TlsConfig> { - match self { - ClientMode::Tcp => tls, - // TLS is None here if using websockets, because the connection is already encrypted. - ClientMode::Websockets { .. } => None, - } - } -} - -#[derive(Debug, Error)] -// almost all errors should be reported to the user, but there's a few cases where we cannot -// 1. Cancellation: we are not allowed to tell the client any cancellation statuses for security reasons -// 2. Handshake: handshake reports errors if it can, otherwise if the handshake fails due to protocol violation, -// we cannot be sure the client even understands our error message -// 3. PrepareClient: The client disconnected, so we can't tell them anyway... -pub(crate) enum ClientRequestError { - #[error("{0}")] - Cancellation(#[from] cancellation::CancelError), - #[error("{0}")] - Handshake(#[from] HandshakeError), - #[error("{0}")] - HandshakeTimeout(#[from] tokio::time::error::Elapsed), - #[error("{0}")] - PrepareClient(#[from] std::io::Error), - #[error("{0}")] - ReportedError(#[from] crate::stream::ReportedError), -} - -impl ReportableError for ClientRequestError { - fn get_error_kind(&self) -> crate::error::ErrorKind { - match self { - ClientRequestError::Cancellation(e) => e.get_error_kind(), - ClientRequestError::Handshake(e) => e.get_error_kind(), - ClientRequestError::HandshakeTimeout(_) => crate::error::ErrorKind::RateLimit, - ClientRequestError::ReportedError(e) => e.get_error_kind(), - ClientRequestError::PrepareClient(_) => crate::error::ErrorKind::ClientDisconnect, - } - } -} - #[allow(clippy::too_many_arguments)] pub(crate) async fn handle_client( config: &'static ProxyConfig, auth_backend: &'static auth::Backend<'static, ()>, ctx: &RequestContext, 
cancellation_handler: Arc, - stream: S, - mode: ClientMode, + client: &mut PqStream>, + mode: &ClientMode, endpoint_rate_limiter: Arc, - conn_gauge: NumClientConnectionsGuard<'static>, - cancellations: tokio_util::task::task_tracker::TaskTracker, -) -> Result>, ClientRequestError> { - debug!( - protocol = %ctx.protocol(), - "handling interactive connection from client" - ); - - let metrics = &Metrics::get().proxy; - let proto = ctx.protocol(); - let request_gauge = metrics.connection_requests.guard(proto); - - let tls = config.tls_config.load(); - let tls = tls.as_deref(); - - let record_handshake_error = !ctx.has_private_peer_addr(); - let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client); - let do_handshake = handshake(ctx, stream, mode.handshake_tls(tls), record_handshake_error); - - let (mut stream, params) = match tokio::time::timeout(config.handshake_timeout, do_handshake) - .await?? - { - HandshakeData::Startup(stream, params) => (stream, params), - HandshakeData::Cancel(cancel_key_data) => { - // spawn a task to cancel the session, but don't wait for it - cancellations.spawn({ - let cancellation_handler_clone = Arc::clone(&cancellation_handler); - let ctx = ctx.clone(); - let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?ctx.session_id()); - cancel_span.follows_from(tracing::Span::current()); - async move { - cancellation_handler_clone - .cancel_session( - cancel_key_data, - ctx, - config.authentication_config.ip_allowlist_check_enabled, - config.authentication_config.is_vpc_acccess_proxy, - auth_backend.get_api(), - ) - .await - .inspect_err(|e | debug!(error = ?e, "cancel_session failed")).ok(); - }.instrument(cancel_span) - }); - - return Ok(None); - } - }; - drop(pause); - - ctx.set_db_options(params.clone()); - - let hostname = mode.hostname(stream.get_ref()); - - let common_names = tls.map(|tls| &tls.common_names); - + common_names: Option<&HashSet>, + params: &StartupMessageParams, +) 
-> Result<(ComputeConnection, oneshot::Sender), ClientRequestError> { + let hostname = mode.hostname(client.get_ref()); // Extract credentials which we're going to use for auth. let result = auth_backend .as_ref() - .map(|()| auth::ComputeUserInfoMaybeEndpoint::parse(ctx, ¶ms, hostname, common_names)) + .map(|()| auth::ComputeUserInfoMaybeEndpoint::parse(ctx, params, hostname, common_names)) .transpose(); let user_info = match result { Ok(user_info) => user_info, - Err(e) => Err(stream.throw_error(e, Some(ctx)).await)?, + Err(e) => Err(client.throw_error(e, Some(ctx)).await)?, }; let user = user_info.get_user().to_owned(); let user_info = match user_info .authenticate( ctx, - &mut stream, + client, mode.allow_cleartext(), &config.authentication_config, endpoint_rate_limiter, @@ -337,7 +75,7 @@ pub(crate) async fn handle_client( let app = params.get("application_name"); let params_span = tracing::info_span!("", ?user, ?db, ?app); - return Err(stream + return Err(client .throw_error(e, Some(ctx)) .instrument(params_span) .await)?; @@ -350,7 +88,7 @@ pub(crate) async fn handle_client( }; let params_compat = creds.info.options.get(NeonOptions::PARAMS_COMPAT).is_some(); let mut auth_info = compute::AuthInfo::with_auth_keys(creds.keys); - auth_info.set_startup_params(¶ms, params_compat); + auth_info.set_startup_params(params, params_compat); let mut node; let mut attempt = 0; @@ -370,6 +108,7 @@ pub(crate) async fn handle_client( let pg_settings = loop { attempt += 1; + // TODO: callback to pglb let res = connect_to_compute( ctx, &connect, @@ -381,7 +120,7 @@ pub(crate) async fn handle_client( match res { Ok(n) => node = n, - Err(e) => return Err(stream.throw_error(e, Some(ctx)).await)?, + Err(e) => return Err(client.throw_error(e, Some(ctx)).await)?, } let auth::Backend::ControlPlane(cplane, user_info) = &backend else { @@ -400,17 +139,16 @@ pub(crate) async fn handle_client( cplane_proxy_v1.caches.node_info.invalidate(&key); } } - Err(e) => Err(stream.throw_error(e, 
Some(ctx)).await)?, + Err(e) => Err(client.throw_error(e, Some(ctx)).await)?, } }; let session = cancellation_handler.get_key(); - prepare_client_connection(&pg_settings, *session.key(), &mut stream); - let stream = stream.flush_and_into_inner().await?; + finish_client_init(&pg_settings, *session.key(), client); let session_id = ctx.session_id(); - let (cancel_on_shutdown, cancel) = tokio::sync::oneshot::channel(); + let (cancel_on_shutdown, cancel) = oneshot::channel(); tokio::spawn(async move { session .maintain_cancel_key( @@ -422,50 +160,32 @@ pub(crate) async fn handle_client( .await; }); - let private_link_id = match ctx.extra() { - Some(ConnectionInfoExtra::Aws { vpce_id }) => Some(vpce_id.clone()), - Some(ConnectionInfoExtra::Azure { link_id }) => Some(link_id.to_smolstr()), - None => None, - }; - - Ok(Some(ProxyPassthrough { - client: stream, - compute: node.stream, - - aux: node.aux, - private_link_id, - - _cancel_on_shutdown: cancel_on_shutdown, - - _req: request_gauge, - _conn: conn_gauge, - _db_conn: node.guage, - })) + Ok((node, cancel_on_shutdown)) } /// Finish client connection initialization: confirm auth success, send params, etc. -pub(crate) fn prepare_client_connection( +pub(crate) fn finish_client_init( settings: &compute::PostgresSettings, cancel_key_data: CancelKeyData, - stream: &mut PqStream, + client: &mut PqStream, ) { // Forward all deferred notices to the client. for notice in &settings.delayed_notice { - stream.write_raw(notice.as_bytes().len(), b'N', |buf| { + client.write_raw(notice.as_bytes().len(), b'N', |buf| { buf.extend_from_slice(notice.as_bytes()); }); } // Forward all postgres connection params to the client. 
for (name, value) in &settings.params { - stream.write_message(BeMessage::ParameterStatus { + client.write_message(BeMessage::ParameterStatus { name: name.as_bytes(), value: value.as_bytes(), }); } - stream.write_message(BeMessage::BackendKeyData(cancel_key_data)); - stream.write_message(BeMessage::ReadyForQuery); + client.write_message(BeMessage::BackendKeyData(cancel_key_data)); + client.write_message(BeMessage::ReadyForQuery); } #[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)] @@ -475,7 +195,7 @@ impl NeonOptions { // proxy options: /// `PARAMS_COMPAT` allows opting in to forwarding all startup parameters from client to compute. - const PARAMS_COMPAT: &str = "proxy_params_compat"; + pub const PARAMS_COMPAT: &str = "proxy_params_compat"; // cplane options: diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index 67dd0ab522..b09d8edc4c 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -14,6 +14,9 @@ use tokio::io::{AsyncReadExt, AsyncWriteExt, DuplexStream}; use tokio_util::codec::{Decoder, Encoder}; use super::*; +use crate::config::TlsConfig; +use crate::context::RequestContext; +use crate::pglb::handshake::{HandshakeData, handshake}; enum Intercept { None, diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 4f27496019..dd89b05426 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -3,6 +3,7 @@ mod mitm; +use std::sync::Arc; use std::time::Duration; use anyhow::{Context, bail}; @@ -10,26 +11,31 @@ use async_trait::async_trait; use http::StatusCode; use postgres_client::config::SslMode; use postgres_client::tls::{MakeTlsConnect, NoTls}; -use retry::{ShouldRetryWakeCompute, retry_after}; use rstest::rstest; use rustls::crypto::ring; use rustls::pki_types; -use tokio::io::DuplexStream; +use tokio::io::{AsyncRead, AsyncWrite, DuplexStream}; use tracing_test::traced_test; use super::retry::CouldRetry; -use super::*; use 
crate::auth::backend::{ComputeUserInfo, MaybeOwned}; -use crate::config::{ComputeConfig, RetryConfig}; +use crate::config::{ComputeConfig, RetryConfig, TlsConfig}; +use crate::context::RequestContext; use crate::control_plane::client::{ControlPlaneClient, TestControlPlaneClient}; use crate::control_plane::messages::{ControlPlaneErrorMessage, Details, MetricsAuxInfo, Status}; use crate::control_plane::{self, CachedNodeInfo, NodeInfo, NodeInfoCache}; -use crate::error::ErrorKind; -use crate::proxy::connect_compute::ConnectMechanism; +use crate::error::{ErrorKind, ReportableError}; +use crate::pglb::ERR_INSECURE_CONNECTION; +use crate::pglb::handshake::{HandshakeData, handshake}; +use crate::pqproto::BeMessage; +use crate::proxy::NeonOptions; +use crate::proxy::connect_compute::{ConnectMechanism, connect_to_compute}; +use crate::proxy::retry::{ShouldRetryWakeCompute, retry_after}; +use crate::stream::{PqStream, Stream}; use crate::tls::client_config::compute_client_config_with_certs; use crate::tls::server_config::CertResolver; use crate::types::{BranchId, EndpointId, ProjectId}; -use crate::{sasl, scram}; +use crate::{auth, compute, sasl, scram}; /// Generate a set of TLS certificates: CA + server. fn generate_certs( diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 0d374e6df2..1960709fba 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -17,7 +17,8 @@ use crate::config::ProxyConfig; use crate::context::RequestContext; use crate::error::ReportableError; use crate::metrics::Metrics; -use crate::proxy::{ClientMode, ErrorSource, handle_client}; +use crate::pglb::{ClientMode, handle_connection}; +use crate::proxy::ErrorSource; use crate::rate_limiter::EndpointRateLimiter; pin_project! 
{ @@ -142,7 +143,7 @@ pub(crate) async fn serve_websocket( .client_connections .guard(crate::metrics::Protocol::Ws); - let res = Box::pin(handle_client( + let res = Box::pin(handle_connection( config, auth_backend, &ctx, From e33e1094031578b384f448876af0048b14421b50 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 27 Jun 2025 17:26:00 +0200 Subject: [PATCH 39/50] fix(pageserver): buffered writer cancellation error handling (#12376) ## Problem The problem has been well described in already-commited PR #11853. tl;dr: BufferedWriter is sensitive to cancellation, which the previous approach was not. The write path was most affected (ingest & compaction), which was mostly fixed in #11853: it introduced `PutError` and mapped instances of `PutError` that were due to cancellation of underlying buffered writer into `CreateImageLayersError::Cancelled`. However, there is a long tail of remaining errors that weren't caught by #11853 that result in `CompactionError::Other`s, which we log with great noise. 
## Solution The stack trace logging for CompactionError::Other added in #11853 allows us to chop away at that long tail using the following pattern: - look at the stack trace - from leaf up, identify the place where we incorrectly map from the distinguished variant X indicating cancellation to an `anyhow::Error` - follow that anyhow further up, ensuring it stays the same anyhow all the way up in the `CompactionError::Other` - since it stayed one anyhow chain all the way up, root_cause() will yield us X - so, in `log_compaction_error`, add an additional `downcast_ref` check for X This PR specifically adds checks for - the flush task cancelling (FlushTaskError, BlobWriterError) - opening of the layer writer (GateError) That should cover all the reports in issues - https://github.com/neondatabase/cloud/issues/29434 - https://github.com/neondatabase/neon/issues/12162 ## Refs - follow-up to #11853 - fixup of / fixes https://github.com/neondatabase/neon/issues/11762 - fixes https://github.com/neondatabase/neon/issues/12162 - refs https://github.com/neondatabase/cloud/issues/29434 --- libs/utils/src/sync/gate.rs | 8 ++++++++ pageserver/src/tenant/tasks.rs | 18 +++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/libs/utils/src/sync/gate.rs b/libs/utils/src/sync/gate.rs index 93460785bf..862b2cff9e 100644 --- a/libs/utils/src/sync/gate.rs +++ b/libs/utils/src/sync/gate.rs @@ -86,6 +86,14 @@ pub enum GateError { GateClosed, } +impl GateError { + pub fn is_cancel(&self) -> bool { + match self { + GateError::GateClosed => true, + } + } +} + impl Default for Gate { fn default() -> Self { Self { diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 4709a6d616..954dd38bb4 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -17,14 +17,17 @@ use tracing::*; use utils::backoff::exponential_backoff_duration; use utils::completion::Barrier; use utils::pausable_failpoint; +use 
utils::sync::gate::GateError; use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::{self, BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS}; use crate::task_mgr::{self, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS, TaskKind}; +use crate::tenant::blob_io::WriteBlobError; use crate::tenant::throttle::Stats; use crate::tenant::timeline::CompactionError; use crate::tenant::timeline::compaction::CompactionOutcome; use crate::tenant::{TenantShard, TenantState}; +use crate::virtual_file::owned_buffers_io::write::FlushTaskError; /// Semaphore limiting concurrent background tasks (across all tenants). /// @@ -313,7 +316,20 @@ pub(crate) fn log_compaction_error( let timeline = root_cause .downcast_ref::() .is_some_and(|e| e.is_stopping()); - let is_stopping = upload_queue || timeline; + let buffered_writer_flush_task_canelled = root_cause + .downcast_ref::() + .is_some_and(|e| e.is_cancel()); + let write_blob_cancelled = root_cause + .downcast_ref::() + .is_some_and(|e| e.is_cancel()); + let gate_closed = root_cause + .downcast_ref::() + .is_some_and(|e| e.is_cancel()); + let is_stopping = upload_queue + || timeline + || buffered_writer_flush_task_canelled + || write_blob_cancelled + || gate_closed; if is_stopping { Level::INFO From e50b914a8eefa35a79a64bdb7715c0c102f94381 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 27 Jun 2025 18:39:00 +0200 Subject: [PATCH 40/50] compute_tools: support gRPC base backups in `compute_ctl` (#12244) ## Problem `compute_ctl` should support gRPC base backups. Requires #12111. Requires #12243. Touches #11926. ## Summary of changes Support `grpc://` connstrings for `compute_ctl` base backups. 
--- Cargo.lock | 6 +- Cargo.toml | 2 +- build-tools.Dockerfile | 1 + compute/compute-node.Dockerfile | 15 ++- compute_tools/Cargo.toml | 2 + compute_tools/src/compute.rs | 105 +++++++++++++++++---- pageserver/page_api/Cargo.toml | 3 +- pageserver/page_api/src/client.rs | 37 +++----- pageserver/page_api/src/model.rs | 4 +- pageserver/pagebench/src/cmd/basebackup.rs | 5 +- workspace_hack/Cargo.toml | 4 +- 11 files changed, 128 insertions(+), 56 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7098711bb4..71e78243a6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1316,6 +1316,7 @@ dependencies = [ "opentelemetry", "opentelemetry_sdk", "p256 0.13.2", + "pageserver_page_api", "postgres", "postgres_initdb", "postgres_versioninfo", @@ -1335,6 +1336,7 @@ dependencies = [ "tokio-postgres", "tokio-stream", "tokio-util", + "tonic 0.13.1", "tower 0.5.2", "tower-http", "tower-otel", @@ -4475,12 +4477,13 @@ dependencies = [ "bytes", "futures", "pageserver_api", - "postgres_ffi", + "postgres_ffi_types", "prost 0.13.5", "strum", "strum_macros", "thiserror 1.0.69", "tokio", + "tokio-util", "tonic 0.13.1", "tonic-build", "utils", @@ -8679,7 +8682,6 @@ dependencies = [ "num-iter", "num-rational", "num-traits", - "once_cell", "p256 0.13.2", "parquet", "prettyplease", diff --git a/Cargo.toml b/Cargo.toml index 857bc5d5d9..aeb7976b6c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -199,7 +199,7 @@ tokio-postgres-rustls = "0.12.0" tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]} tokio-stream = "0.1" tokio-tar = "0.3" -tokio-util = { version = "0.7.10", features = ["io", "rt"] } +tokio-util = { version = "0.7.10", features = ["io", "io-util", "rt"] } toml = "0.8" toml_edit = "0.22" tonic = { version = "0.13.1", default-features = false, features = ["channel", "codegen", "gzip", "prost", "router", "server", "tls-ring", "tls-native-roots", "zstd"] } diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index b70ced7886..14a52bd736 100644 
--- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -165,6 +165,7 @@ RUN curl -fsSL \ && rm sql_exporter.tar.gz # protobuf-compiler (protoc) +# Keep the version the same as in compute/compute-node.Dockerfile ENV PROTOC_VERSION=25.1 RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \ && unzip -q protoc.zip -d protoc \ diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 35ece73030..bce2a28b8b 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -115,6 +115,9 @@ ARG EXTENSIONS=all FROM $BASE_IMAGE_SHA AS build-deps ARG DEBIAN_VERSION +# Keep in sync with build-tools.Dockerfile +ENV PROTOC_VERSION=25.1 + # Use strict mode for bash to catch errors early SHELL ["/bin/bash", "-euo", "pipefail", "-c"] @@ -149,8 +152,14 @@ RUN case $DEBIAN_VERSION in \ libclang-dev \ jsonnet \ $VERSION_INSTALLS \ - && apt clean && rm -rf /var/lib/apt/lists/* && \ - useradd -ms /bin/bash nonroot -b /home + && apt clean && rm -rf /var/lib/apt/lists/* \ + && useradd -ms /bin/bash nonroot -b /home \ + # Install protoc from binary release, since Debian's versions are too old. 
+ && curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \ + && unzip -q protoc.zip -d protoc \ + && mv protoc/bin/protoc /usr/local/bin/protoc \ + && mv protoc/include/google /usr/local/include/google \ + && rm -rf protoc.zip protoc ######################################################################################### # @@ -1170,7 +1179,7 @@ COPY --from=pgrag-src /ext-src/ /ext-src/ # Install it using virtual environment, because Python 3.11 (the default version on Debian 12 (Bookworm)) complains otherwise WORKDIR /ext-src/onnxruntime-src RUN apt update && apt install --no-install-recommends --no-install-suggests -y \ - python3 python3-pip python3-venv protobuf-compiler && \ + python3 python3-pip python3-venv && \ apt clean && rm -rf /var/lib/apt/lists/* && \ python3 -m venv venv && \ . venv/bin/activate && \ diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index a5879c4b7c..0a071c1ad1 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -38,6 +38,7 @@ once_cell.workspace = true opentelemetry.workspace = true opentelemetry_sdk.workspace = true p256 = { version = "0.13", features = ["pem"] } +pageserver_page_api.workspace = true postgres.workspace = true regex.workspace = true reqwest = { workspace = true, features = ["json"] } @@ -53,6 +54,7 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tokio-postgres.workspace = true tokio-util.workspace = true tokio-stream.workspace = true +tonic.workspace = true tower-otel.workspace = true tracing.workspace = true tracing-opentelemetry.workspace = true diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index cf558ee01a..7566626d57 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1,4 +1,4 @@ -use anyhow::{Context, Result}; +use anyhow::{Context, Result, anyhow}; use 
chrono::{DateTime, Utc}; use compute_api::privilege::Privilege; use compute_api::responses::{ @@ -15,6 +15,7 @@ use itertools::Itertools; use nix::sys::signal::{Signal, kill}; use nix::unistd::Pid; use once_cell::sync::Lazy; +use pageserver_page_api::{self as page_api, BaseBackupCompression}; use postgres; use postgres::NoTls; use postgres::error::SqlState; @@ -35,6 +36,7 @@ use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; use utils::measured_stream::MeasuredReader; use utils::pid_file; +use utils::shard::{ShardCount, ShardIndex, ShardNumber}; use crate::configurator::launch_configurator; use crate::disk_quota::set_disk_quota; @@ -995,13 +997,87 @@ impl ComputeNode { Ok(()) } - // Get basebackup from the libpq connection to pageserver using `connstr` and - // unarchive it to `pgdata` directory overriding all its previous content. + /// Fetches a basebackup from the Pageserver using the compute state's Pageserver connstring and + /// unarchives it to `pgdata` directory, replacing any existing contents. #[instrument(skip_all, fields(%lsn))] fn try_get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> { let spec = compute_state.pspec.as_ref().expect("spec must be set"); - let start_time = Instant::now(); + // Detect the protocol scheme. If the URL doesn't have a scheme, assume libpq. 
+ let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap(); + let scheme = match Url::parse(shard0_connstr) { + Ok(url) => url.scheme().to_lowercase().to_string(), + Err(url::ParseError::RelativeUrlWithoutBase) => "postgresql".to_string(), + Err(err) => return Err(anyhow!("invalid connstring URL: {err}")), + }; + + let started = Instant::now(); + let (connected, size) = match scheme.as_str() { + "postgresql" | "postgres" => self.try_get_basebackup_libpq(spec, lsn)?, + "grpc" => self.try_get_basebackup_grpc(spec, lsn)?, + scheme => return Err(anyhow!("unknown URL scheme {scheme}")), + }; + + let mut state = self.state.lock().unwrap(); + state.metrics.pageserver_connect_micros = + connected.duration_since(started).as_micros() as u64; + state.metrics.basebackup_bytes = size as u64; + state.metrics.basebackup_ms = started.elapsed().as_millis() as u64; + + Ok(()) + } + + /// Fetches a basebackup via gRPC. The connstring must use grpc://. Returns the timestamp when + /// the connection was established, and the (compressed) size of the basebackup. 
+ fn try_get_basebackup_grpc(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> { + let shard0_connstr = spec + .pageserver_connstr + .split(',') + .next() + .unwrap() + .to_string(); + let shard_index = match spec.pageserver_connstr.split(',').count() as u8 { + 0 | 1 => ShardIndex::unsharded(), + count => ShardIndex::new(ShardNumber(0), ShardCount(count)), + }; + + let (reader, connected) = tokio::runtime::Handle::current().block_on(async move { + let mut client = page_api::Client::new( + shard0_connstr, + spec.tenant_id, + spec.timeline_id, + shard_index, + spec.storage_auth_token.clone(), + None, // NB: base backups use payload compression + ) + .await?; + let connected = Instant::now(); + let reader = client + .get_base_backup(page_api::GetBaseBackupRequest { + lsn: (lsn != Lsn(0)).then_some(lsn), + compression: BaseBackupCompression::Gzip, + replica: spec.spec.mode != ComputeMode::Primary, + full: false, + }) + .await?; + anyhow::Ok((reader, connected)) + })?; + + let mut reader = MeasuredReader::new(tokio_util::io::SyncIoBridge::new(reader)); + + // Set `ignore_zeros` so that unpack() reads the entire stream and doesn't just stop at the + // end-of-archive marker. If the server errors, the tar::Builder drop handler will write an + // end-of-archive marker before the error is emitted, and we would not see the error. + let mut ar = tar::Archive::new(flate2::read::GzDecoder::new(&mut reader)); + ar.set_ignore_zeros(true); + ar.unpack(&self.params.pgdata)?; + + Ok((connected, reader.get_byte_count())) + } + + /// Fetches a basebackup via libpq. The connstring must use postgresql://. Returns the timestamp + /// when the connection was established, and the (compressed) size of the basebackup. 
+ fn try_get_basebackup_libpq(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> { let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap(); let mut config = postgres::Config::from_str(shard0_connstr)?; @@ -1015,16 +1091,14 @@ impl ComputeNode { } config.application_name("compute_ctl"); - if let Some(spec) = &compute_state.pspec { - config.options(&format!( - "-c neon.compute_mode={}", - spec.spec.mode.to_type_str() - )); - } + config.options(&format!( + "-c neon.compute_mode={}", + spec.spec.mode.to_type_str() + )); // Connect to pageserver let mut client = config.connect(NoTls)?; - let pageserver_connect_micros = start_time.elapsed().as_micros() as u64; + let connected = Instant::now(); let basebackup_cmd = match lsn { Lsn(0) => { @@ -1061,16 +1135,13 @@ impl ComputeNode { // Set `ignore_zeros` so that unpack() reads all the Copy data and // doesn't stop at the end-of-archive marker. Otherwise, if the server // sends an Error after finishing the tarball, we will not notice it. + // The tar::Builder drop handler will write an end-of-archive marker + // before emitting the error, and we would not see it otherwise. 
let mut ar = tar::Archive::new(flate2::read::GzDecoder::new(&mut bufreader)); ar.set_ignore_zeros(true); ar.unpack(&self.params.pgdata)?; - // Report metrics - let mut state = self.state.lock().unwrap(); - state.metrics.pageserver_connect_micros = pageserver_connect_micros; - state.metrics.basebackup_bytes = measured_reader.get_byte_count() as u64; - state.metrics.basebackup_ms = start_time.elapsed().as_millis() as u64; - Ok(()) + Ok((connected, measured_reader.get_byte_count())) } // Gets the basebackup in a retry loop diff --git a/pageserver/page_api/Cargo.toml b/pageserver/page_api/Cargo.toml index c5283c2b09..42ee9b50e9 100644 --- a/pageserver/page_api/Cargo.toml +++ b/pageserver/page_api/Cargo.toml @@ -9,12 +9,13 @@ anyhow.workspace = true bytes.workspace = true futures.workspace = true pageserver_api.workspace = true -postgres_ffi.workspace = true +postgres_ffi_types.workspace = true prost.workspace = true strum.workspace = true strum_macros.workspace = true thiserror.workspace = true tokio.workspace = true +tokio-util.workspace = true tonic.workspace = true utils.workspace = true workspace_hack.workspace = true diff --git a/pageserver/page_api/src/client.rs b/pageserver/page_api/src/client.rs index 71d539ab91..4b456787d2 100644 --- a/pageserver/page_api/src/client.rs +++ b/pageserver/page_api/src/client.rs @@ -1,8 +1,7 @@ -use std::convert::TryInto; - -use bytes::Bytes; -use futures::TryStreamExt; -use futures::{Stream, StreamExt}; +use anyhow::Result; +use futures::{Stream, StreamExt as _, TryStreamExt as _}; +use tokio::io::AsyncRead; +use tokio_util::io::StreamReader; use tonic::metadata::AsciiMetadataValue; use tonic::metadata::errors::InvalidMetadataValue; use tonic::transport::Channel; @@ -12,8 +11,6 @@ use utils::id::TenantId; use utils::id::TimelineId; use utils::shard::ShardIndex; -use anyhow::Result; - use crate::model; use crate::proto; @@ -69,6 +66,7 @@ impl tonic::service::Interceptor for AuthInterceptor { Ok(req) } } + #[derive(Clone)] pub 
struct Client { client: proto::PageServiceClient< @@ -120,22 +118,15 @@ impl Client { pub async fn get_base_backup( &mut self, req: model::GetBaseBackupRequest, - ) -> Result> + 'static, tonic::Status> { - let proto_req = proto::GetBaseBackupRequest::from(req); - - let response_stream: Streaming = - self.client.get_base_backup(proto_req).await?.into_inner(); - - // TODO: Consider dechunking internally - let domain_stream = response_stream.map(|chunk_res| { - chunk_res.and_then(|proto_chunk| { - proto_chunk.try_into().map_err(|e| { - tonic::Status::internal(format!("Failed to convert response chunk: {e}")) - }) - }) - }); - - Ok(domain_stream) + ) -> Result, tonic::Status> { + let req = proto::GetBaseBackupRequest::from(req); + let chunks = self.client.get_base_backup(req).await?.into_inner(); + let reader = StreamReader::new( + chunks + .map_ok(|resp| resp.chunk) + .map_err(std::io::Error::other), + ); + Ok(reader) } /// Returns the total size of a database, as # of bytes. diff --git a/pageserver/page_api/src/model.rs b/pageserver/page_api/src/model.rs index 1ca89b4870..0493f79781 100644 --- a/pageserver/page_api/src/model.rs +++ b/pageserver/page_api/src/model.rs @@ -18,8 +18,8 @@ use std::fmt::Display; use bytes::Bytes; -use postgres_ffi::Oid; -// TODO: split out Lsn, RelTag, SlruKind, Oid and other basic types to a separate crate, to avoid +use postgres_ffi_types::Oid; +// TODO: split out Lsn, RelTag, SlruKind and other basic types to a separate crate, to avoid // pulling in all of their other crate dependencies when building the client. 
use utils::lsn::Lsn; diff --git a/pageserver/pagebench/src/cmd/basebackup.rs b/pageserver/pagebench/src/cmd/basebackup.rs index 4111d09f92..4b7a70504a 100644 --- a/pageserver/pagebench/src/cmd/basebackup.rs +++ b/pageserver/pagebench/src/cmd/basebackup.rs @@ -355,9 +355,6 @@ impl Client for GrpcClient { full: false, compression: self.compression, }; - let stream = self.inner.get_base_backup(req).await?; - Ok(Box::pin(StreamReader::new( - stream.map_err(std::io::Error::other), - ))) + Ok(Box::pin(self.inner.get_base_backup(req).await?)) } } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index b74df50f86..e9a77ca2d6 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -68,7 +68,6 @@ num-integer = { version = "0.1", features = ["i128"] } num-iter = { version = "0.1", default-features = false, features = ["i128", "std"] } num-rational = { version = "0.4", default-features = false, features = ["num-bigint-std", "std"] } num-traits = { version = "0.2", features = ["i128", "libm"] } -once_cell = { version = "1" } p256 = { version = "0.13", features = ["jwk"] } parquet = { version = "53", default-features = false, features = ["zstd"] } prost = { version = "0.13", features = ["no-recursion-limit", "prost-derive"] } @@ -97,7 +96,7 @@ time = { version = "0.3", features = ["macros", "serde-well-known"] } tokio = { version = "1", features = ["full", "test-util"] } tokio-rustls = { version = "0.26", default-features = false, features = ["logging", "ring", "tls12"] } tokio-stream = { version = "0.1", features = ["net"] } -tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] } +tokio-util = { version = "0.7", features = ["codec", "compat", "io-util", "rt"] } toml_edit = { version = "0.22", features = ["serde"] } tower = { version = "0.5", default-features = false, features = ["balance", "buffer", "limit", "log"] } tracing = { version = "0.1", features = ["log"] } @@ -134,7 +133,6 @@ num-integer = { version = "0.1", 
features = ["i128"] } num-iter = { version = "0.1", default-features = false, features = ["i128", "std"] } num-rational = { version = "0.4", default-features = false, features = ["num-bigint-std", "std"] } num-traits = { version = "0.2", features = ["i128", "libm"] } -once_cell = { version = "1" } parquet = { version = "53", default-features = false, features = ["zstd"] } prettyplease = { version = "0.2", default-features = false, features = ["verbatim"] } proc-macro2 = { version = "1" } From 47553dbaf946de9cdab759c220ec56dffa5f82cd Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Sat, 28 Jun 2025 16:59:29 +0400 Subject: [PATCH 41/50] neon_local: set timeline_safekeeper_count if we have less than 3 safekeepers (#12378) ## Problem - Closes: https://github.com/neondatabase/neon/issues/12298 ## Summary of changes - Set `timeline_safekeeper_count` in `neon_local` if we have less than 3 safekeepers - Remove `cfg!(feature = "testing")` code from `safekeepers_for_new_timeline` - Change `timeline_safekeeper_count` type to `usize` --- control_plane/src/local_env.rs | 2 +- control_plane/src/storage_controller.rs | 8 +++++++- storage_controller/src/main.rs | 4 ++-- storage_controller/src/service.rs | 2 +- storage_controller/src/service/safekeeper_service.rs | 9 +-------- 5 files changed, 12 insertions(+), 13 deletions(-) diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 16cd2d8c08..d0611113e8 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -212,7 +212,7 @@ pub struct NeonStorageControllerConf { pub use_local_compute_notifications: bool, - pub timeline_safekeeper_count: Option, + pub timeline_safekeeper_count: Option, pub posthog_config: Option, diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index dea7ae2ccf..bb83a6319c 100644 --- a/control_plane/src/storage_controller.rs +++ 
b/control_plane/src/storage_controller.rs @@ -638,7 +638,13 @@ impl StorageController { args.push("--timelines-onto-safekeepers".to_string()); } - if let Some(sk_cnt) = self.config.timeline_safekeeper_count { + // neon_local is used in test environments where we often have less than 3 safekeepers. + if self.config.timeline_safekeeper_count.is_some() || self.env.safekeepers.len() < 3 { + let sk_cnt = self + .config + .timeline_safekeeper_count + .unwrap_or(self.env.safekeepers.len()); + args.push(format!("--timeline-safekeeper-count={sk_cnt}")); } diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index d1c2858d6f..752262b65e 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -216,8 +216,8 @@ struct Cli { /// Number of safekeepers to choose for a timeline when creating it. /// Safekeepers will be choosen from different availability zones. /// This option exists primarily for testing purposes. - #[arg(long, default_value = "3", value_parser = clap::value_parser!(i64).range(1..))] - timeline_safekeeper_count: i64, + #[arg(long, default_value = "3", value_parser = clap::builder::RangedU64ValueParser::::new().range(1..))] + timeline_safekeeper_count: usize, /// When set, actively checks and initiates heatmap downloads/uploads during reconciliation. /// This speed up migrations by avoiding the default wait for the heatmap download interval. diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index b4dfd01249..19bb0f8671 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -471,7 +471,7 @@ pub struct Config { /// Number of safekeepers to choose for a timeline when creating it. /// Safekeepers will be choosen from different availability zones. 
- pub timeline_safekeeper_count: i64, + pub timeline_safekeeper_count: usize, /// PostHog integration config pub posthog_config: Option, diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index fec81fb661..92d15f3fca 100644 --- a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -1,4 +1,3 @@ -use std::cmp::max; use std::collections::HashSet; use std::str::FromStr; use std::sync::Arc; @@ -654,13 +653,7 @@ impl Service { ) }); // Number of safekeepers in different AZs we are looking for - let mut wanted_count = self.config.timeline_safekeeper_count as usize; - // TODO(diko): remove this when `timeline_safekeeper_count` option is in the release - // branch and is specified in tests/neon_local config. - if cfg!(feature = "testing") && all_safekeepers.len() < wanted_count { - // In testing mode, we can have less safekeepers than the config says - wanted_count = max(all_safekeepers.len(), 1); - } + let wanted_count = self.config.timeline_safekeeper_count; let mut sks = Vec::new(); let mut azs = HashSet::new(); From 9bb4688c541fd66b3a342ac4f1cd0784bd807fbd Mon Sep 17 00:00:00 2001 From: Aleksandr Sarantsev <99037063+ephemeralsad@users.noreply.github.com> Date: Mon, 30 Jun 2025 09:41:05 +0400 Subject: [PATCH 42/50] storcon: Remove testing feature from kick_secondary_downloads (#12383) ## Problem Some of the design decisions in PR #12256 were influenced by the requirements of consistency tests. These decisions introduced intermediate logic that is no longer needed and should be cleaned up. ## Summary of Changes - Remove the `feature("testing")` flag related to `kick_secondary_download`. - Set the default value of `kick_secondary_download` back to false, reflecting the intended production behavior. 
Co-authored-by: Aleksandr Sarantsev --- storage_controller/src/main.rs | 8 ++------ storage_controller/src/service.rs | 4 +--- test_runner/fixtures/neon_fixtures.py | 2 +- 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 752262b65e..2a851dc25b 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -6,9 +6,7 @@ use std::time::Duration; use anyhow::{Context, anyhow}; use camino::Utf8PathBuf; -#[cfg(feature = "testing")] -use clap::ArgAction; -use clap::Parser; +use clap::{ArgAction, Parser}; use futures::future::OptionFuture; use http_utils::tls_certs::ReloadingCertificateResolver; use hyper0::Uri; @@ -222,8 +220,7 @@ struct Cli { /// When set, actively checks and initiates heatmap downloads/uploads during reconciliation. /// This speed up migrations by avoiding the default wait for the heatmap download interval. /// Primarily useful for testing to reduce test execution time. - #[cfg(feature = "testing")] - #[arg(long, default_value = "true", action=ArgAction::Set)] + #[arg(long, default_value = "false", action=ArgAction::Set)] kick_secondary_downloads: bool, } @@ -472,7 +469,6 @@ async fn async_main() -> anyhow::Result<()> { use_local_compute_notifications: args.use_local_compute_notifications, timeline_safekeeper_count: args.timeline_safekeeper_count, posthog_config: posthog_config.clone(), - #[cfg(feature = "testing")] kick_secondary_downloads: args.kick_secondary_downloads, }; diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 19bb0f8671..75ce7bc37b 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -476,7 +476,7 @@ pub struct Config { /// PostHog integration config pub posthog_config: Option, - #[cfg(feature = "testing")] + /// When set, actively checks and initiates heatmap downloads/uploads. 
pub kick_secondary_downloads: bool, } @@ -8364,7 +8364,6 @@ impl Service { "Skipping migration of {tenant_shard_id} to {node} because secondary isn't ready: {progress:?}" ); - #[cfg(feature = "testing")] if progress.heatmap_mtime.is_none() { // No heatmap might mean the attached location has never uploaded one, or that // the secondary download hasn't happened yet. This is relatively unusual in the field, @@ -8389,7 +8388,6 @@ impl Service { /// happens on multi-minute timescales in the field, which is fine because optimisation is meant /// to be a lazy background thing. However, when testing, it is not practical to wait around, so /// we have this helper to move things along faster. - #[cfg(feature = "testing")] async fn kick_secondary_download(&self, tenant_shard_id: TenantShardId) { if !self.config.kick_secondary_downloads { // No-op if kick_secondary_downloads functionaliuty is not configured diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 4eb85119ca..48c6597c7c 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -453,7 +453,7 @@ class NeonEnvBuilder: pageserver_get_vectored_concurrent_io: str | None = None, pageserver_tracing_config: PageserverTracingConfig | None = None, pageserver_import_config: PageserverImportConfig | None = None, - storcon_kick_secondary_downloads: bool | None = None, + storcon_kick_secondary_downloads: bool | None = True, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override From c746678bbc215937e5b3dbead0697386e0815780 Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Mon, 30 Jun 2025 12:30:05 +0400 Subject: [PATCH 43/50] storcon: implement safekeeper_migrate handler (#11849) This PR implements a safekeeper migration algorithm from RFC-035 https://github.com/neondatabase/neon/blob/main/docs/rfcs/035-safekeeper-dynamic-membership-change.md#change-algorithm - Closes: 
https://github.com/neondatabase/neon/issues/11823 It is not production-ready yet, but I think it's good enough to commit and start testing. There are some known issues which will be addressed in later PRs: - https://github.com/neondatabase/neon/issues/12186 - https://github.com/neondatabase/neon/issues/12187 - https://github.com/neondatabase/neon/issues/12188 - https://github.com/neondatabase/neon/issues/12189 - https://github.com/neondatabase/neon/issues/12190 - https://github.com/neondatabase/neon/issues/12191 - https://github.com/neondatabase/neon/issues/12192 ## Summary of changes - Implement `tenant_timeline_safekeeper_migrate` handler to drive the migration - Add possibility to specify number of safekeepers per timeline in tests (`timeline_safekeeper_count`) - Add `term` and `flush_lsn` to `TimelineMembershipSwitchResponse` - Implement compare-and-swap (CAS) operation over timeline in DB for updating membership configuration safely. - Write simple test to verify that migration code works --- libs/pageserver_api/src/controller_api.rs | 5 + libs/safekeeper_api/src/models.rs | 4 +- safekeeper/src/state.rs | 14 +- safekeeper/src/timeline.rs | 9 +- safekeeper/src/wal_storage.rs | 6 +- storage_controller/src/http.rs | 39 +- storage_controller/src/metrics.rs | 2 + storage_controller/src/persistence.rs | 54 ++ storage_controller/src/safekeeper.rs | 8 + storage_controller/src/safekeeper_client.rs | 4 + storage_controller/src/service.rs | 2 + .../src/service/safekeeper_reconciler.rs | 2 +- .../src/service/safekeeper_service.rs | 646 ++++++++++++++++-- test_runner/fixtures/neon_fixtures.py | 22 + .../regress/test_safekeeper_migration.py | 64 ++ 15 files changed, 802 insertions(+), 79 deletions(-) create mode 100644 test_runner/regress/test_safekeeper_migration.py diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index ff18d40bfe..a8080a57e9 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ 
b/libs/pageserver_api/src/controller_api.rs @@ -546,6 +546,11 @@ pub struct TimelineImportRequest { pub sk_set: Vec, } +#[derive(serde::Serialize, serde::Deserialize, Clone)] +pub struct TimelineSafekeeperMigrateRequest { + pub new_sk_set: Vec, +} + #[cfg(test)] mod test { use serde_json; diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 5c1ee41f7b..1774489c1c 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -210,7 +210,7 @@ pub struct TimelineStatus { } /// Request to switch membership configuration. -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] #[serde(transparent)] pub struct TimelineMembershipSwitchRequest { pub mconf: Configuration, @@ -221,6 +221,8 @@ pub struct TimelineMembershipSwitchRequest { pub struct TimelineMembershipSwitchResponse { pub previous_conf: Configuration, pub current_conf: Configuration, + pub term: Term, + pub flush_lsn: Lsn, } #[derive(Clone, Copy, Serialize, Deserialize)] diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index b6cf73be2e..32624d260d 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -9,7 +9,7 @@ use anyhow::{Result, bail}; use postgres_ffi::WAL_SEGMENT_SIZE; use postgres_versioninfo::{PgMajorVersion, PgVersionId}; use safekeeper_api::membership::Configuration; -use safekeeper_api::models::{TimelineMembershipSwitchResponse, TimelineTermBumpResponse}; +use safekeeper_api::models::TimelineTermBumpResponse; use safekeeper_api::{INITIAL_TERM, ServerInfo, Term}; use serde::{Deserialize, Serialize}; use tracing::info; @@ -83,6 +83,11 @@ pub enum EvictionState { Offloaded(Lsn), } +pub struct MembershipSwitchResult { + pub previous_conf: Configuration, + pub current_conf: Configuration, +} + impl TimelinePersistentState { /// commit_lsn is the same as start_lsn in the normal creaiton; see /// `TimelineCreateRequest` comments.` @@ -261,10 +266,7 @@ where /// Switch into membership 
configuration `to` if it is higher than the /// current one. - pub async fn membership_switch( - &mut self, - to: Configuration, - ) -> Result { + pub async fn membership_switch(&mut self, to: Configuration) -> Result { let before = self.mconf.clone(); // Is switch allowed? if to.generation <= self.mconf.generation { @@ -278,7 +280,7 @@ where self.finish_change(&state).await?; info!("switched membership conf to {} from {}", to, before); } - Ok(TimelineMembershipSwitchResponse { + Ok(MembershipSwitchResult { previous_conf: before, current_conf: self.mconf.clone(), }) diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 2bee41537f..0a27876862 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -190,7 +190,14 @@ impl StateSK { &mut self, to: Configuration, ) -> Result { - self.state_mut().membership_switch(to).await + let result = self.state_mut().membership_switch(to).await?; + + Ok(TimelineMembershipSwitchResponse { + previous_conf: result.previous_conf, + current_conf: result.current_conf, + term: self.state().acceptor_state.term, + flush_lsn: self.flush_lsn(), + }) } /// Close open WAL files to release FDs. diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 33310706be..70e53d86ee 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -154,8 +154,8 @@ pub struct PhysicalStorage { /// record /// /// Partial segment 002 has no WAL records, and it will be removed by the - /// next truncate_wal(). This flag will be set to true after the first - /// truncate_wal() call. + /// next truncate_wal(). This flag will be set to false after the first + /// successful truncate_wal() call. /// /// [`write_lsn`]: Self::write_lsn pending_wal_truncation: bool, @@ -202,6 +202,8 @@ impl PhysicalStorage { ttid.timeline_id, flush_lsn, state.commit_lsn, state.peer_horizon_lsn, ); if flush_lsn < state.commit_lsn { + // note: can never happen. 
find_end_of_wal returns provided start_lsn + // (state.commit_lsn in our case) if it doesn't find anything. bail!( "timeline {} potential data loss: flush_lsn {} by find_end_of_wal is less than commit_lsn {} from control file", ttid.timeline_id, diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index a7e86b5224..66c44b5674 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -22,7 +22,7 @@ use pageserver_api::controller_api::{ MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse, NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, SafekeeperSchedulingPolicyRequest, ShardsPreferredAzsRequest, TenantCreateRequest, TenantPolicyRequest, TenantShardMigrateRequest, - TimelineImportRequest, + TimelineImportRequest, TimelineSafekeeperMigrateRequest, }; use pageserver_api::models::{ DetachBehavior, LsnLeaseRequest, TenantConfigPatchRequest, TenantConfigRequest, @@ -34,6 +34,7 @@ use pageserver_api::upcall_api::{ PutTimelineImportStatusRequest, ReAttachRequest, TimelineImportStatusRequest, ValidateRequest, }; use pageserver_client::{BlockUnblock, mgmt_api}; + use routerify::Middleware; use tokio_util::sync::CancellationToken; use tracing::warn; @@ -635,6 +636,32 @@ async fn handle_tenant_timeline_download_heatmap_layers( json_response(StatusCode::OK, ()) } +async fn handle_tenant_timeline_safekeeper_migrate( + service: Arc, + req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; + + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let migrate_req = json_request::(&mut req).await?; + + service + .tenant_timeline_safekeeper_migrate(tenant_id, 
timeline_id, migrate_req) + .await?; + + json_response(StatusCode::OK, ()) +} + async fn handle_tenant_timeline_lsn_lease( service: Arc, req: Request, @@ -2458,6 +2485,16 @@ pub fn make_router( ) }, ) + .post( + "/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate", + |r| { + tenant_service_handler( + r, + handle_tenant_timeline_safekeeper_migrate, + RequestName("v1_tenant_timeline_safekeeper_migrate"), + ) + }, + ) // LSN lease passthrough to all shards .post( "/v1/tenant/:tenant_id/timeline/:timeline_id/lsn_lease", diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index 07713c3fbc..f7f77cdd23 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -333,6 +333,7 @@ pub(crate) enum DatabaseErrorLabel { ConnectionPool, Logical, Migration, + Cas, } impl DatabaseError { @@ -343,6 +344,7 @@ impl DatabaseError { Self::ConnectionPool(_) => DatabaseErrorLabel::ConnectionPool, Self::Logical(_) => DatabaseErrorLabel::Logical, Self::Migration(_) => DatabaseErrorLabel::Migration, + Self::Cas(_) => DatabaseErrorLabel::Cas, } } } diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 2948e9019f..56f4d03111 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -29,6 +29,7 @@ use pageserver_api::shard::{ use rustls::client::WebPkiServerVerifier; use rustls::client::danger::{ServerCertVerified, ServerCertVerifier}; use rustls::crypto::ring; +use safekeeper_api::membership::SafekeeperGeneration; use scoped_futures::ScopedBoxFuture; use serde::{Deserialize, Serialize}; use utils::generation::Generation; @@ -94,6 +95,8 @@ pub(crate) enum DatabaseError { Logical(String), #[error("Migration error: {0}")] Migration(String), + #[error("CAS error: {0}")] + Cas(String), } #[derive(measured::FixedCardinalityLabel, Copy, Clone)] @@ -126,6 +129,7 @@ pub(crate) enum DatabaseOperation { UpdateLeader, SetPreferredAzs, InsertTimeline, 
+ UpdateTimelineMembership, GetTimeline, InsertTimelineReconcile, RemoveTimelineReconcile, @@ -1410,6 +1414,56 @@ impl Persistence { .await } + /// Update timeline membership configuration in the database. + /// Perform a compare-and-swap (CAS) operation on the timeline's generation. + /// The `new_generation` must be the next (+1) generation after the one in the database. + pub(crate) async fn update_timeline_membership( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + new_generation: SafekeeperGeneration, + sk_set: &[NodeId], + new_sk_set: Option<&[NodeId]>, + ) -> DatabaseResult<()> { + use crate::schema::timelines::dsl; + + let prev_generation = new_generation.previous().unwrap(); + + let tenant_id = &tenant_id; + let timeline_id = &timeline_id; + self.with_measured_conn(DatabaseOperation::UpdateTimelineMembership, move |conn| { + Box::pin(async move { + let updated = diesel::update(dsl::timelines) + .filter(dsl::tenant_id.eq(&tenant_id.to_string())) + .filter(dsl::timeline_id.eq(&timeline_id.to_string())) + .filter(dsl::generation.eq(prev_generation.into_inner() as i32)) + .set(( + dsl::generation.eq(new_generation.into_inner() as i32), + dsl::sk_set.eq(sk_set.iter().map(|id| id.0 as i64).collect::>()), + dsl::new_sk_set.eq(new_sk_set + .map(|set| set.iter().map(|id| id.0 as i64).collect::>())), + )) + .execute(conn) + .await?; + + match updated { + 0 => { + // TODO(diko): It makes sense to select the current generation + // and include it in the error message for better debuggability. + Err(DatabaseError::Cas( + "Failed to update membership configuration".to_string(), + )) + } + 1 => Ok(()), + _ => Err(DatabaseError::Logical(format!( + "unexpected number of rows ({updated})" + ))), + } + }) + }) + .await + } + /// Load timeline from db. Returns `None` if not present. 
pub(crate) async fn get_timeline( &self, diff --git a/storage_controller/src/safekeeper.rs b/storage_controller/src/safekeeper.rs index 5a13ef750e..91154f4fa3 100644 --- a/storage_controller/src/safekeeper.rs +++ b/storage_controller/src/safekeeper.rs @@ -2,6 +2,7 @@ use std::time::Duration; use pageserver_api::controller_api::{SafekeeperDescribeResponse, SkSchedulingPolicy}; use reqwest::StatusCode; +use safekeeper_api::membership::SafekeeperId; use safekeeper_client::mgmt_api; use tokio_util::sync::CancellationToken; use utils::backoff; @@ -92,6 +93,13 @@ impl Safekeeper { pub(crate) fn has_https_port(&self) -> bool { self.listen_https_port.is_some() } + pub(crate) fn get_safekeeper_id(&self) -> SafekeeperId { + SafekeeperId { + id: self.id, + host: self.skp.host.clone(), + pg_port: self.skp.port as u16, + } + } /// Perform an operation (which is given a [`SafekeeperClient`]) with retries #[allow(clippy::too_many_arguments)] pub(crate) async fn with_client_retries( diff --git a/storage_controller/src/safekeeper_client.rs b/storage_controller/src/safekeeper_client.rs index bcf223c731..47a785e7d3 100644 --- a/storage_controller/src/safekeeper_client.rs +++ b/storage_controller/src/safekeeper_client.rs @@ -56,6 +56,10 @@ impl SafekeeperClient { } } + pub(crate) fn node_id_label(&self) -> &str { + &self.node_id_label + } + pub(crate) async fn create_timeline( &self, req: &TimelineCreateRequest, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 75ce7bc37b..bbf93fd751 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -161,6 +161,7 @@ enum TenantOperations { DropDetached, DownloadHeatmapLayers, TimelineLsnLease, + TimelineSafekeeperMigrate, } #[derive(Clone, strum_macros::Display)] @@ -491,6 +492,7 @@ impl From for ApiError { DatabaseError::Logical(reason) | DatabaseError::Migration(reason) => { ApiError::InternalServerError(anyhow::anyhow!(reason)) } + DatabaseError::Cas(reason) => 
ApiError::Conflict(reason), } } } diff --git a/storage_controller/src/service/safekeeper_reconciler.rs b/storage_controller/src/service/safekeeper_reconciler.rs index a3c5082be6..b67a679fad 100644 --- a/storage_controller/src/service/safekeeper_reconciler.rs +++ b/storage_controller/src/service/safekeeper_reconciler.rs @@ -145,7 +145,7 @@ pub(crate) async fn load_schedule_requests( } let Some(sk) = safekeepers.get(&other_node_id) else { tracing::warn!( - "couldnt find safekeeper with pending op id {other_node_id}, not pulling from it" + "couldn't find safekeeper with pending op id {other_node_id}, not pulling from it" ); return None; }; diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index 92d15f3fca..fc33a24198 100644 --- a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -5,20 +5,29 @@ use std::time::Duration; use super::safekeeper_reconciler::ScheduleRequest; use crate::heartbeater::SafekeeperState; +use crate::id_lock_map::trace_shared_lock; use crate::metrics; use crate::persistence::{ DatabaseError, SafekeeperTimelineOpKind, TimelinePendingOpPersistence, TimelinePersistence, }; use crate::safekeeper::Safekeeper; +use crate::safekeeper_client::SafekeeperClient; +use crate::service::TenantOperations; use crate::timeline_import::TimelineImportFinalizeError; use anyhow::Context; use http_utils::error::ApiError; use pageserver_api::controller_api::{ SafekeeperDescribeResponse, SkSchedulingPolicy, TimelineImportRequest, + TimelineSafekeeperMigrateRequest, }; use pageserver_api::models::{SafekeeperInfo, SafekeepersInfo, TimelineInfo}; use safekeeper_api::PgVersionId; -use safekeeper_api::membership::{MemberSet, SafekeeperGeneration, SafekeeperId}; +use safekeeper_api::membership::{self, MemberSet, SafekeeperGeneration}; +use safekeeper_api::models::{ + PullTimelineRequest, TimelineMembershipSwitchRequest, 
TimelineMembershipSwitchResponse, +}; +use safekeeper_api::{INITIAL_TERM, Term}; +use safekeeper_client::mgmt_api; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use utils::id::{NodeId, TenantId, TimelineId}; @@ -35,6 +44,33 @@ pub struct TimelineLocateResponse { } impl Service { + fn make_member_set(safekeepers: &[Safekeeper]) -> Result { + let members = safekeepers + .iter() + .map(|sk| sk.get_safekeeper_id()) + .collect::>(); + + MemberSet::new(members).map_err(ApiError::InternalServerError) + } + + fn get_safekeepers(&self, ids: &[i64]) -> Result, ApiError> { + let safekeepers = { + let locked = self.inner.read().unwrap(); + locked.safekeepers.clone() + }; + + ids.iter() + .map(|&id| { + let node_id = NodeId(id as u64); + safekeepers.get(&node_id).cloned().ok_or_else(|| { + ApiError::InternalServerError(anyhow::anyhow!( + "safekeeper {node_id} is not registered" + )) + }) + }) + .collect::, _>>() + } + /// Timeline creation on safekeepers /// /// Returns `Ok(left)` if the timeline has been created on a quorum of safekeepers, @@ -47,35 +83,9 @@ impl Service { pg_version: PgVersionId, timeline_persistence: &TimelinePersistence, ) -> Result, ApiError> { - // If quorum is reached, return if we are outside of a specified timeout - let jwt = self - .config - .safekeeper_jwt_token - .clone() - .map(SecretString::from); - let mut joinset = JoinSet::new(); + let safekeepers = self.get_safekeepers(&timeline_persistence.sk_set)?; - // Prepare membership::Configuration from choosen safekeepers. 
- let safekeepers = { - let locked = self.inner.read().unwrap(); - locked.safekeepers.clone() - }; - - let mut members = Vec::new(); - for sk_id in timeline_persistence.sk_set.iter() { - let sk_id = NodeId(*sk_id as u64); - let Some(safekeeper) = safekeepers.get(&sk_id) else { - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "couldn't find entry for safekeeper with id {sk_id}" - )))?; - }; - members.push(SafekeeperId { - id: sk_id, - host: safekeeper.skp.host.clone(), - pg_port: safekeeper.skp.port as u16, - }); - } - let mset = MemberSet::new(members).map_err(ApiError::InternalServerError)?; + let mset = Self::make_member_set(&safekeepers)?; let mconf = safekeeper_api::membership::Configuration::new(mset); let req = safekeeper_api::models::TimelineCreateRequest { @@ -88,79 +98,150 @@ impl Service { timeline_id, wal_seg_size: None, }; + const SK_CREATE_TIMELINE_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); - for sk in timeline_persistence.sk_set.iter() { - let sk_id = NodeId(*sk as u64); - let safekeepers = safekeepers.clone(); + + let results = self + .tenant_timeline_safekeeper_op_quorum( + &safekeepers, + move |client| { + let req = req.clone(); + async move { client.create_timeline(&req).await } + }, + SK_CREATE_TIMELINE_RECONCILE_TIMEOUT, + ) + .await?; + + Ok(results + .into_iter() + .enumerate() + .filter_map(|(idx, res)| { + if res.is_ok() { + None // Success, don't return this safekeeper + } else { + Some(safekeepers[idx].get_id()) // Failure, return this safekeeper + } + }) + .collect::>()) + } + + /// Perform an operation on a list of safekeepers in parallel with retries. + /// + /// Return the results of the operation on each safekeeper in the input order. 
+ async fn tenant_timeline_safekeeper_op( + &self, + safekeepers: &[Safekeeper], + op: O, + timeout: Duration, + ) -> Result>, ApiError> + where + O: FnMut(SafekeeperClient) -> F + Send + 'static, + O: Clone, + F: std::future::Future> + Send + 'static, + T: Sync + Send + 'static, + { + let jwt = self + .config + .safekeeper_jwt_token + .clone() + .map(SecretString::from); + let mut joinset = JoinSet::new(); + + for (idx, sk) in safekeepers.iter().enumerate() { + let sk = sk.clone(); let http_client = self.http_client.clone(); let jwt = jwt.clone(); - let req = req.clone(); + let op = op.clone(); joinset.spawn(async move { - // Unwrap is fine as we already would have returned error above - let sk_p = safekeepers.get(&sk_id).unwrap(); - let res = sk_p + let res = sk .with_client_retries( - |client| { - let req = req.clone(); - async move { client.create_timeline(&req).await } - }, + op, &http_client, &jwt, 3, 3, - SK_CREATE_TIMELINE_RECONCILE_TIMEOUT, + // TODO(diko): This is a wrong timeout. + // It should be scaled to the retry count. + timeout, &CancellationToken::new(), ) .await; - (sk_id, sk_p.skp.host.clone(), res) + (idx, res) }); } + + // Initialize results with timeout errors in case we never get a response. + let mut results: Vec> = safekeepers + .iter() + .map(|_| { + Err(mgmt_api::Error::Timeout( + "safekeeper operation timed out".to_string(), + )) + }) + .collect(); + // After we have built the joinset, we now wait for the tasks to complete, // but with a specified timeout to make sure we return swiftly, either with // a failure or success. - let reconcile_deadline = tokio::time::Instant::now() + SK_CREATE_TIMELINE_RECONCILE_TIMEOUT; + let reconcile_deadline = tokio::time::Instant::now() + timeout; // Wait until all tasks finish or timeout is hit, whichever occurs // first. 
- let mut reconcile_results = Vec::new(); + let mut result_count = 0; loop { if let Ok(res) = tokio::time::timeout_at(reconcile_deadline, joinset.join_next()).await { let Some(res) = res else { break }; match res { - Ok(res) => { + Ok((idx, res)) => { + let sk = &safekeepers[idx]; tracing::info!( "response from safekeeper id:{} at {}: {:?}", - res.0, - res.1, - res.2 + sk.get_id(), + sk.skp.host, + // Only print errors, as there is no Debug trait for T. + res.as_ref().map(|_| ()), ); - reconcile_results.push(res); + results[idx] = res; + result_count += 1; } Err(join_err) => { tracing::info!("join_err for task in joinset: {join_err}"); } } } else { - tracing::info!( - "timeout for creation call after {} responses", - reconcile_results.len() - ); + tracing::info!("timeout for operation call after {result_count} responses",); break; } } - // Now check now if quorum was reached in reconcile_results. - let total_result_count = reconcile_results.len(); - let remaining = reconcile_results - .into_iter() - .filter_map(|res| res.2.is_err().then_some(res.0)) - .collect::>(); - tracing::info!( - "Got {} non-successful responses from initial creation request of total {total_result_count} responses", - remaining.len() - ); - let target_sk_count = timeline_persistence.sk_set.len(); + Ok(results) + } + + /// Perform an operation on a list of safekeepers in parallel with retries, + /// and validates that we reach a quorum of successful responses. + /// + /// Return the results of the operation on each safekeeper in the input order. + /// It's guaranteed that at least a quorum of the responses are successful. 
+ async fn tenant_timeline_safekeeper_op_quorum( + &self, + safekeepers: &[Safekeeper], + op: O, + timeout: Duration, + ) -> Result>, ApiError> + where + O: FnMut(SafekeeperClient) -> F, + O: Clone + Send + 'static, + F: std::future::Future> + Send + 'static, + T: Sync + Send + 'static, + { + let results = self + .tenant_timeline_safekeeper_op(safekeepers, op, timeout) + .await?; + + // Now check if quorum was reached in results. + + let target_sk_count = safekeepers.len(); let quorum_size = match target_sk_count { 0 => { return Err(ApiError::InternalServerError(anyhow::anyhow!( @@ -179,7 +260,7 @@ impl Service { // in order to schedule work to them tracing::warn!( "couldn't find at least 3 safekeepers for timeline, found: {:?}", - timeline_persistence.sk_set + target_sk_count ); return Err(ApiError::InternalServerError(anyhow::anyhow!( "couldn't find at least 3 safekeepers to put timeline to" @@ -188,7 +269,7 @@ impl Service { } _ => target_sk_count / 2 + 1, }; - let success_count = target_sk_count - remaining.len(); + let success_count = results.iter().filter(|res| res.is_ok()).count(); if success_count < quorum_size { // Failure return Err(ApiError::InternalServerError(anyhow::anyhow!( @@ -196,7 +277,7 @@ impl Service { ))); } - Ok(remaining) + Ok(results) } /// Create timeline in controller database and on safekeepers. @@ -797,4 +878,435 @@ impl Service { } Ok(()) } + + /// Call `switch_timeline_membership` on all safekeepers with retries + /// till the quorum of successful responses is reached. + /// + /// If min_position is not None, validates that majority of safekeepers + /// reached at least min_position. + /// + /// Return responses from safekeepers in the input order. 
+ async fn tenant_timeline_set_membership_quorum( + self: &Arc, + tenant_id: TenantId, + timeline_id: TimelineId, + safekeepers: &[Safekeeper], + config: &membership::Configuration, + min_position: Option<(Term, Lsn)>, + ) -> Result>, ApiError> { + let req = TimelineMembershipSwitchRequest { + mconf: config.clone(), + }; + + const SK_SET_MEM_TIMELINE_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); + + let results = self + .tenant_timeline_safekeeper_op_quorum( + safekeepers, + move |client| { + let req = req.clone(); + async move { + let mut res = client + .switch_timeline_membership(tenant_id, timeline_id, &req) + .await; + + // If min_position is not reached, map the response to an error, + // so it isn't counted toward the quorum. + if let Some(min_position) = min_position { + if let Ok(ok_res) = &res { + if (ok_res.term, ok_res.flush_lsn) < min_position { + // Use Error::Timeout to make this error retriable. + res = Err(mgmt_api::Error::Timeout( + format!( + "safekeeper {} returned position {:?} which is less than minimum required position {:?}", + client.node_id_label(), + (ok_res.term, ok_res.flush_lsn), + min_position + ) + )); + } + } + } + + res + } + }, + SK_SET_MEM_TIMELINE_RECONCILE_TIMEOUT, + ) + .await?; + + for res in results.iter().flatten() { + if res.current_conf.generation > config.generation { + // Another switch_membership raced us. + return Err(ApiError::Conflict(format!( + "received configuration with generation {} from safekeeper, but expected {}", + res.current_conf.generation, config.generation + ))); + } else if res.current_conf.generation < config.generation { + // Note: should never happen. + // If we get a response, it should be at least the sent generation. 
+ tracing::error!( + "received configuration with generation {} from safekeeper, but expected {}", + res.current_conf.generation, + config.generation + ); + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "received configuration with generation {} from safekeeper, but expected {}", + res.current_conf.generation, + config.generation + ))); + } + } + + Ok(results) + } + + /// Pull timeline to to_safekeepers from from_safekeepers with retries. + /// + /// Returns Ok(()) only if all the pull_timeline requests were successful. + async fn tenant_timeline_pull_from_peers( + self: &Arc, + tenant_id: TenantId, + timeline_id: TimelineId, + to_safekeepers: &[Safekeeper], + from_safekeepers: &[Safekeeper], + ) -> Result<(), ApiError> { + let http_hosts = from_safekeepers + .iter() + .map(|sk| sk.base_url()) + .collect::>(); + + tracing::info!( + "pulling timeline to {:?} from {:?}", + to_safekeepers + .iter() + .map(|sk| sk.get_id()) + .collect::>(), + from_safekeepers + .iter() + .map(|sk| sk.get_id()) + .collect::>() + ); + + // TODO(diko): need to pass mconf/generation with the request + // to properly handle tombstones. Ignore tombstones for now. + // Worst case: we leave a timeline on a safekeeper which is not in the current set. 
+ let req = PullTimelineRequest { + tenant_id, + timeline_id, + http_hosts, + ignore_tombstone: Some(true), + }; + + const SK_PULL_TIMELINE_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); + + let responses = self + .tenant_timeline_safekeeper_op( + to_safekeepers, + move |client| { + let req = req.clone(); + async move { client.pull_timeline(&req).await } + }, + SK_PULL_TIMELINE_RECONCILE_TIMEOUT, + ) + .await?; + + if let Some((idx, err)) = responses + .iter() + .enumerate() + .find_map(|(idx, res)| Some((idx, res.as_ref().err()?))) + { + let sk_id = to_safekeepers[idx].get_id(); + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "pull_timeline to {sk_id} failed: {err}", + ))); + } + + Ok(()) + } + + /// Exclude a timeline from safekeepers in parallel with retries. + /// If an exclude request is unsuccessful, it will be added to + /// the reconciler, and after that the function will succeed. + async fn tenant_timeline_safekeeper_exclude( + self: &Arc, + tenant_id: TenantId, + timeline_id: TimelineId, + safekeepers: &[Safekeeper], + config: &membership::Configuration, + ) -> Result<(), ApiError> { + let req = TimelineMembershipSwitchRequest { + mconf: config.clone(), + }; + + const SK_EXCLUDE_TIMELINE_TIMEOUT: Duration = Duration::from_secs(30); + + let results = self + .tenant_timeline_safekeeper_op( + safekeepers, + move |client| { + let req = req.clone(); + async move { client.exclude_timeline(tenant_id, timeline_id, &req).await } + }, + SK_EXCLUDE_TIMELINE_TIMEOUT, + ) + .await?; + + let mut reconcile_requests = Vec::new(); + + for (idx, res) in results.iter().enumerate() { + if res.is_err() { + let sk_id = safekeepers[idx].skp.id; + let pending_op = TimelinePendingOpPersistence { + tenant_id: tenant_id.to_string(), + timeline_id: timeline_id.to_string(), + generation: config.generation.into_inner() as i32, + op_kind: SafekeeperTimelineOpKind::Exclude, + sk_id, + }; + tracing::info!("writing pending exclude op for sk id {sk_id}"); + 
self.persistence.insert_pending_op(pending_op).await?; + + let req = ScheduleRequest { + safekeeper: Box::new(safekeepers[idx].clone()), + host_list: Vec::new(), + tenant_id, + timeline_id: Some(timeline_id), + generation: config.generation.into_inner(), + kind: SafekeeperTimelineOpKind::Exclude, + }; + reconcile_requests.push(req); + } + } + + if !reconcile_requests.is_empty() { + let locked = self.inner.read().unwrap(); + for req in reconcile_requests { + locked.safekeeper_reconcilers.schedule_request(req); + } + } + + Ok(()) + } + + /// Migrate timeline safekeeper set to a new set. + /// + /// This function implements an algorithm from RFC-035. + /// + pub(crate) async fn tenant_timeline_safekeeper_migrate( + self: &Arc, + tenant_id: TenantId, + timeline_id: TimelineId, + req: TimelineSafekeeperMigrateRequest, + ) -> Result<(), ApiError> { + let all_safekeepers = self.inner.read().unwrap().safekeepers.clone(); + + let new_sk_set = req.new_sk_set; + + for sk_id in new_sk_set.iter() { + if !all_safekeepers.contains_key(sk_id) { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "safekeeper {sk_id} does not exist" + ))); + } + } + + // TODO(diko): per-tenant lock is too wide. Consider introducing per-timeline locks. + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimelineSafekeeperMigrate, + ) + .await; + + // 1. Fetch current timeline configuration from the configuration storage. 
+ + let timeline = self + .persistence + .get_timeline(tenant_id, timeline_id) + .await?; + + let Some(timeline) = timeline else { + return Err(ApiError::NotFound( + anyhow::anyhow!( + "timeline {tenant_id}/{timeline_id} doesn't exist in timelines table" + ) + .into(), + )); + }; + + let cur_sk_set = timeline + .sk_set + .iter() + .map(|&id| NodeId(id as u64)) + .collect::>(); + + tracing::info!( + ?cur_sk_set, + ?new_sk_set, + "Migrating timeline to new safekeeper set", + ); + + let mut generation = SafekeeperGeneration::new(timeline.generation as u32); + + if let Some(ref presistent_new_sk_set) = timeline.new_sk_set { + // 2. If it is already joint one and new_set is different from desired_set refuse to change. + if presistent_new_sk_set + .iter() + .map(|&id| NodeId(id as u64)) + .ne(new_sk_set.iter().cloned()) + { + tracing::info!( + ?presistent_new_sk_set, + ?new_sk_set, + "different new safekeeper set is already set in the database", + ); + return Err(ApiError::Conflict(format!( + "the timeline is already migrating to a different safekeeper set: {presistent_new_sk_set:?}" + ))); + } + // If it is the same new_sk_set, we can continue the migration (retry). + } else { + // 3. No active migration yet. + // Increment current generation and put desired_set to new_sk_set. + generation = generation.next(); + + self.persistence + .update_timeline_membership( + tenant_id, + timeline_id, + generation, + &cur_sk_set, + Some(&new_sk_set), + ) + .await?; + } + + let cur_safekeepers = self.get_safekeepers(&timeline.sk_set)?; + let cur_sk_member_set = Self::make_member_set(&cur_safekeepers)?; + + let new_sk_set_i64 = new_sk_set.iter().map(|id| id.0 as i64).collect::>(); + let new_safekeepers = self.get_safekeepers(&new_sk_set_i64)?; + let new_sk_member_set = Self::make_member_set(&new_safekeepers)?; + + let joint_config = membership::Configuration { + generation, + members: cur_sk_member_set, + new_members: Some(new_sk_member_set.clone()), + }; + + // 4. 
Call PUT configuration on safekeepers from the current set, + // delivering them joint_conf. + + // TODO(diko): need to notify cplane with an updated set of safekeepers. + + let results = self + .tenant_timeline_set_membership_quorum( + tenant_id, + timeline_id, + &cur_safekeepers, + &joint_config, + None, // no min position + ) + .await?; + + let mut sync_position = (INITIAL_TERM, Lsn::INVALID); + for res in results.into_iter().flatten() { + let sk_position = (res.term, res.flush_lsn); + if sync_position < sk_position { + sync_position = sk_position; + } + } + + tracing::info!( + %generation, + ?sync_position, + "safekeepers set membership updated", + ); + + // 5. Initialize timeline on safekeeper(s) from new_sk_set where it doesn't exist yet + // by doing pull_timeline from the majority of the current set. + + // Filter out safekeepers which are already in the current set. + let from_ids: HashSet = cur_safekeepers.iter().map(|sk| sk.get_id()).collect(); + let pull_to_safekeepers = new_safekeepers + .iter() + .filter(|sk| !from_ids.contains(&sk.get_id())) + .cloned() + .collect::>(); + + self.tenant_timeline_pull_from_peers( + tenant_id, + timeline_id, + &pull_to_safekeepers, + &cur_safekeepers, + ) + .await?; + + // 6. Call POST bump_term(sync_term) on safekeepers from the new set. Success on majority is enough. + + // TODO(diko): do we need to bump timeline term? + + // 7. Repeatedly call PUT configuration on safekeepers from the new set, + // delivering them joint_conf and collecting their positions. + + tracing::info!(?sync_position, "waiting for safekeepers to sync position"); + + self.tenant_timeline_set_membership_quorum( + tenant_id, + timeline_id, + &new_safekeepers, + &joint_config, + Some(sync_position), + ) + .await?; + + // 8. Create new_conf: Configuration incrementing joint_conf generation and + // having new safekeeper set as sk_set and None new_sk_set. 
+ + let generation = generation.next(); + + let new_conf = membership::Configuration { + generation, + members: new_sk_member_set, + new_members: None, + }; + + self.persistence + .update_timeline_membership(tenant_id, timeline_id, generation, &new_sk_set, None) + .await?; + + // TODO(diko): at this point we have already updated the timeline in the database, + // but we still need to notify safekeepers and cplane about the new configuration, + // and put deletion of the timeline from the old safekeepers into the reconciler. + // Ideally it should be done atomically, but now it's not. + // Worst case: the timeline is not deleted from old safekeepers, + // the compute may require both quorums till the migration is retried and completed. + + self.tenant_timeline_set_membership_quorum( + tenant_id, + timeline_id, + &new_safekeepers, + &new_conf, + None, // no min position + ) + .await?; + + let new_ids: HashSet = new_safekeepers.iter().map(|sk| sk.get_id()).collect(); + let exclude_safekeepers = cur_safekeepers + .into_iter() + .filter(|sk| !new_ids.contains(&sk.get_id())) + .collect::>(); + self.tenant_timeline_safekeeper_exclude( + tenant_id, + timeline_id, + &exclude_safekeepers, + &new_conf, + ) + .await?; + + // TODO(diko): need to notify cplane with an updated set of safekeepers. + + Ok(()) + } } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 48c6597c7c..508e3d8dd2 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1215,6 +1215,13 @@ class NeonEnv: storage_controller_config = storage_controller_config or {} storage_controller_config["use_https_safekeeper_api"] = True + # TODO(diko): uncomment when timeline_safekeeper_count option is in the release branch, + # so the compat tests will not fail because of its presence. 
+ # if config.num_safekeepers < 3: + # storage_controller_config = storage_controller_config or {} + # if "timeline_safekeeper_count" not in storage_controller_config: + # storage_controller_config["timeline_safekeeper_count"] = config.num_safekeepers + if storage_controller_config is not None: cfg["storage_controller"] = storage_controller_config @@ -2226,6 +2233,21 @@ class NeonStorageController(MetricsGetter, LogUtils): response.raise_for_status() log.info(f"timeline_create success: {response.json()}") + def migrate_safekeepers( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + new_sk_set: list[int], + ): + response = self.request( + "POST", + f"{self.api}/v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate", + json={"new_sk_set": new_sk_set}, + headers=self.headers(TokenScope.PAGE_SERVER_API), + ) + response.raise_for_status() + log.info(f"migrate_safekeepers success: {response.json()}") + def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]: """ :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr": str, "listen_http_port": int} diff --git a/test_runner/regress/test_safekeeper_migration.py b/test_runner/regress/test_safekeeper_migration.py new file mode 100644 index 0000000000..f67b6afc95 --- /dev/null +++ b/test_runner/regress/test_safekeeper_migration.py @@ -0,0 +1,64 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnvBuilder + + +def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder): + """ + Simple safekeeper migration test. + Creates 3 safekeepers. The timeline is configured to use only one safekeeper. + 1. Go through all safekeepers, migrate the timeline to it. + 2. Stop the other safekeepers. Validate that the insert is successful. + 3. Start the other safekeepers again and go to the next safekeeper. + 4. Validate that the table contains all inserted values. 
+ """ + neon_env_builder.num_safekeepers = 3 + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": True, + "timeline_safekeeper_count": 1, + } + env = neon_env_builder.init_start() + # TODO(diko): pageserver spams with various errors during safekeeper migration. + # Fix the code so it handles the migration better. + env.pageserver.allowed_errors.extend( + [ + ".*Timeline .* was cancelled and cannot be used anymore.*", + ".*Timeline .* has been deleted.*", + ".*wal receiver task finished with an error.*", + ] + ) + + ep = env.endpoints.create("main", tenant_id=env.initial_tenant) + # We specify all safekeepers, so compute will connect to all of them. + # Only those from the current membership configuration will be used. + # TODO(diko): set only current safekeepers when cplane notify is implemented. + ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3]) + ep.safe_psql("CREATE EXTENSION neon_test_utils;") + ep.safe_psql("CREATE TABLE t(a int)") + + for active_sk in range(1, 4): + env.storage_controller.migrate_safekeepers( + env.initial_tenant, env.initial_timeline, [active_sk] + ) + + other_sks = [sk for sk in range(1, 4) if sk != active_sk] + + for sk in other_sks: + env.safekeepers[sk - 1].stop() + + ep.safe_psql(f"INSERT INTO t VALUES ({active_sk})") + + for sk in other_sks: + env.safekeepers[sk - 1].start() + + ep.clear_buffers() + + assert ep.safe_psql("SELECT * FROM t") == [(i,) for i in range(1, 4)] + + ep.stop() + ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3]) + + assert ep.safe_psql("SELECT * FROM t") == [(i,) for i in range(1, 4)] From 1d43f3bee80f1b8ffd7e6c6576e7c280510996c7 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 30 Jun 2025 11:08:44 +0200 Subject: [PATCH 44/50] pageserver: fix stripe size persistence in legacy HTTP handlers (#12377) ## Problem Similarly to #12217, the following endpoints may result in a stripe size mismatch between the storage controller and Pageserver if an unsharded tenant has a 
different stripe size set than the default. This can lead to data corruption if the tenant is later manually split without specifying an explicit stripe size, since the storage controller and Pageserver will apply different defaults. This commonly happens with tenants that were created before the default stripe size was changed from 32k to 2k. * `PUT /v1/tenant/config` * `PATCH /v1/tenant/config` These endpoints are no longer in regular production use (they were used when cplane still managed Pageserver directly), but can still be called manually or by tests. ## Summary of changes Retain the current shard parameters when updating the location config in `PUT | PATCH /v1/tenant/config`. Also opportunistically derive `Copy` for `ShardParameters`. --- libs/pageserver_api/src/models.rs | 15 +++++++++++++-- libs/pageserver_api/src/shard.rs | 2 +- pageserver/src/http/routes.rs | 4 ++-- pageserver/src/pgdatadir_mapping.rs | 2 +- pageserver/src/tenant.rs | 6 +++++- pageserver/src/tenant/config.rs | 2 +- pageserver/src/tenant/timeline/handle.rs | 14 +++++++------- safekeeper/src/handler.rs | 2 +- storage_controller/src/service.rs | 4 ++-- 9 files changed, 33 insertions(+), 18 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 82a3ac0eb4..16545364c1 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -21,7 +21,9 @@ use utils::{completion, serde_system_time}; use crate::config::Ratio; use crate::key::{CompactKey, Key}; -use crate::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId}; +use crate::shard::{ + DEFAULT_STRIPE_SIZE, ShardCount, ShardIdentity, ShardStripeSize, TenantShardId, +}; /// The state of a tenant in this pageserver. /// @@ -475,7 +477,7 @@ pub struct TenantShardSplitResponse { } /// Parameters that apply to all shards in a tenant. Used during tenant creation. 
-#[derive(Serialize, Deserialize, Debug)] +#[derive(Clone, Copy, Serialize, Deserialize, Debug)] #[serde(deny_unknown_fields)] pub struct ShardParameters { pub count: ShardCount, @@ -497,6 +499,15 @@ impl Default for ShardParameters { } } +impl From for ShardParameters { + fn from(identity: ShardIdentity) -> Self { + Self { + count: identity.count, + stripe_size: identity.stripe_size, + } + } +} + #[derive(Debug, Default, Clone, Eq, PartialEq)] pub enum FieldPatch { Upsert(T), diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 9c16be93e8..a9fe3dac43 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -179,7 +179,7 @@ impl ShardIdentity { /// For use when creating ShardIdentity instances for new shards, where a creation request /// specifies the ShardParameters that apply to all shards. - pub fn from_params(number: ShardNumber, params: &ShardParameters) -> Self { + pub fn from_params(number: ShardNumber, params: ShardParameters) -> Self { Self { number, count: params.count, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index aa9bec657c..f770e420f0 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1893,7 +1893,7 @@ async fn update_tenant_config_handler( let location_conf = LocationConf::attached_single( new_tenant_conf.clone(), tenant.get_generation(), - &ShardParameters::default(), + ShardParameters::from(tenant.get_shard_identity()), ); crate::tenant::TenantShard::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf) @@ -1937,7 +1937,7 @@ async fn patch_tenant_config_handler( let location_conf = LocationConf::attached_single( updated, tenant.get_generation(), - &ShardParameters::default(), + ShardParameters::from(tenant.get_shard_identity()), ); crate::tenant::TenantShard::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf) diff --git a/pageserver/src/pgdatadir_mapping.rs 
b/pageserver/src/pgdatadir_mapping.rs index 09a7a8a651..31f38d485f 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -3015,7 +3015,7 @@ mod tests { // This shard will get the even blocks let shard = ShardIdentity::from_params( ShardNumber(0), - &ShardParameters { + ShardParameters { count: ShardCount(2), stripe_size: ShardStripeSize(1), }, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 2e9dbdc539..79bea4eb77 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3872,6 +3872,10 @@ impl TenantShard { &self.tenant_shard_id } + pub(crate) fn get_shard_identity(&self) -> ShardIdentity { + self.shard_identity + } + pub(crate) fn get_shard_stripe_size(&self) -> ShardStripeSize { self.shard_identity.stripe_size } @@ -6008,7 +6012,7 @@ pub(crate) mod harness { AttachedTenantConf::try_from(LocationConf::attached_single( self.tenant_conf.clone(), self.generation, - &ShardParameters::default(), + ShardParameters::default(), )) .unwrap(), self.shard_identity, diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index c5087f7e0f..46cc669400 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -136,7 +136,7 @@ impl LocationConf { pub(crate) fn attached_single( tenant_conf: pageserver_api::models::TenantConfig, generation: Generation, - shard_params: &models::ShardParameters, + shard_params: models::ShardParameters, ) -> Self { Self { mode: LocationMode::Attached(AttachedLocationConfig { diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs index 809b350f38..2dbff20ab2 100644 --- a/pageserver/src/tenant/timeline/handle.rs +++ b/pageserver/src/tenant/timeline/handle.rs @@ -887,7 +887,7 @@ mod tests { .expect("we still have it"); } - fn make_relation_key_for_shard(shard: ShardNumber, params: &ShardParameters) -> Key { + fn make_relation_key_for_shard(shard: ShardNumber, params: ShardParameters) -> 
Key { rel_block_to_key( RelTag { spcnode: 1663, @@ -917,14 +917,14 @@ mod tests { let child0 = Arc::new_cyclic(|myself| StubTimeline { gate: Default::default(), id: timeline_id, - shard: ShardIdentity::from_params(ShardNumber(0), &child_params), + shard: ShardIdentity::from_params(ShardNumber(0), child_params), per_timeline_state: PerTimelineState::default(), myself: myself.clone(), }); let child1 = Arc::new_cyclic(|myself| StubTimeline { gate: Default::default(), id: timeline_id, - shard: ShardIdentity::from_params(ShardNumber(1), &child_params), + shard: ShardIdentity::from_params(ShardNumber(1), child_params), per_timeline_state: PerTimelineState::default(), myself: myself.clone(), }); @@ -937,7 +937,7 @@ mod tests { let handle = cache .get( timeline_id, - ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)), + ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), child_params)), &StubManager { shards: vec![parent.clone()], }, @@ -961,7 +961,7 @@ mod tests { let handle = cache .get( timeline_id, - ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)), + ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), child_params)), &StubManager { shards: vec![], // doesn't matter what's in here, the cache is fully loaded }, @@ -978,7 +978,7 @@ mod tests { let parent_handle = cache .get( timeline_id, - ShardSelector::Page(make_relation_key_for_shard(ShardNumber(0), &child_params)), + ShardSelector::Page(make_relation_key_for_shard(ShardNumber(0), child_params)), &StubManager { shards: vec![parent.clone()], }, @@ -995,7 +995,7 @@ mod tests { let handle = cache .get( timeline_id, - ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)), + ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), child_params)), &StubManager { shards: vec![child0.clone(), child1.clone()], // <====== this changed compared to previous loop }, diff --git a/safekeeper/src/handler.rs 
b/safekeeper/src/handler.rs index 5e7f1d8758..373589a18e 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -220,7 +220,7 @@ impl postgres_backend::Handler stripe_size: ShardStripeSize(stripe_size), }; self.shard = - Some(ShardIdentity::from_params(ShardNumber(number), ¶ms)); + Some(ShardIdentity::from_params(ShardNumber(number), params)); } _ => { return Err(QueryError::Other(anyhow::anyhow!( diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index bbf93fd751..e0b13c4e63 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2584,7 +2584,7 @@ impl Service { .do_initial_shard_scheduling( tenant_shard_id, initial_generation, - &create_req.shard_parameters, + create_req.shard_parameters, create_req.config.clone(), placement_policy.clone(), preferred_az_id.as_ref(), @@ -2641,7 +2641,7 @@ impl Service { &self, tenant_shard_id: TenantShardId, initial_generation: Option, - shard_params: &ShardParameters, + shard_params: ShardParameters, config: TenantConfig, placement_policy: PlacementPolicy, preferred_az_id: Option<&AvailabilityZone>, From 620d50432cbadc240b834b65be3f221c462e29b0 Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Mon, 30 Jun 2025 11:33:57 +0200 Subject: [PATCH 45/50] Fix path issue in the proxy-bernch CI workflow (#12388) --- .github/workflows/proxy-benchmark.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/proxy-benchmark.yml b/.github/workflows/proxy-benchmark.yml index 75ecacaced..3a98ad4e8e 100644 --- a/.github/workflows/proxy-benchmark.yml +++ b/.github/workflows/proxy-benchmark.yml @@ -60,22 +60,23 @@ jobs: } >> "$GITHUB_ENV" - name: Run proxy-bench - run: ./${PROXY_BENCH_PATH}/run.sh + run: ${PROXY_BENCH_PATH}/run.sh - name: Ingest Bench Results # neon repo script - if: success() + if: always() run: | mkdir -p $TEST_OUTPUT python $NEON_DIR/scripts/proxy_bench_results_ingest.py --out $TEST_OUTPUT - name: Push 
Metrics to Proxy perf database - if: success() + if: always() env: PERF_TEST_RESULT_CONNSTR: "${{ secrets.PROXY_TEST_RESULT_CONNSTR }}" REPORT_FROM: $TEST_OUTPUT run: $NEON_DIR/scripts/generate_and_push_perf_report.sh - name: Docker cleanup + if: always() run: docker compose down - name: Notify Failure From 2af938096273cdd610cf58973c0a230ed2d894a9 Mon Sep 17 00:00:00 2001 From: Busra Kugler Date: Mon, 30 Jun 2025 12:15:10 +0200 Subject: [PATCH 46/50] Revert "Replace step-security maintained actions" (#12386) Reverts neondatabase/neon#11663 and https://github.com/neondatabase/neon/pull/11265/ Step Security is not yet approved by Databricks team, in order to prevent issues during Github org migration, I'll revert this PR to use the previous action instead of Step Security maintained action. --- .github/workflows/build_and_test.yml | 2 +- .github/workflows/neon_extra_builds.yml | 2 +- .github/workflows/pre-merge-checks.yml | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 94f768719f..456c7b8c92 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -69,7 +69,7 @@ jobs: submodules: true - name: Check for file changes - uses: step-security/paths-filter@v3 + uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 # v3.0.2 id: files-changed with: token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 3427a0eb49..3e81183687 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -53,7 +53,7 @@ jobs: submodules: true - name: Check for Postgres changes - uses: step-security/paths-filter@v3 + uses: dorny/paths-filter@1441771bbfdd59dcd748680ee64ebd8faab1a242 #v3 id: files_changed with: token: ${{ github.token }} diff --git a/.github/workflows/pre-merge-checks.yml 
b/.github/workflows/pre-merge-checks.yml index 6fb4753fc0..23b8573097 100644 --- a/.github/workflows/pre-merge-checks.yml +++ b/.github/workflows/pre-merge-checks.yml @@ -34,7 +34,7 @@ jobs: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - uses: step-security/changed-files@3dbe17c78367e7d60f00d78ae6781a35be47b4a1 # v45.0.1 + - uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5 id: python-src with: files: | @@ -45,7 +45,7 @@ jobs: poetry.lock pyproject.toml - - uses: step-security/changed-files@3dbe17c78367e7d60f00d78ae6781a35be47b4a1 # v45.0.1 + - uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5 id: rust-src with: files: | From 66f53d9d348e2d2c39d5b180798b58befde09f4a Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 30 Jun 2025 13:03:48 +0200 Subject: [PATCH 47/50] refactor(pageserver): force explicit mapping to `CreateImageLayersError::Other` (#12382) Implicit mapping to an `anyhow::Error` when we do `?` is discouraged because tooling to find those places isn't great. As a drive-by, also make SplitImageLayerWriter::new infallible and sync. I think we should also make ImageLayerWriter::new completely lazy, then `BatchLayerWriter:new` infallible and async. 
--- .../storage_layer/batch_split_writer.rs | 35 ++++++++----------- pageserver/src/tenant/timeline.rs | 12 ++++--- pageserver/src/tenant/timeline/compaction.rs | 21 ++++++----- 3 files changed, 35 insertions(+), 33 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/batch_split_writer.rs b/pageserver/src/tenant/storage_layer/batch_split_writer.rs index 1d50a5f3a0..9c2a177f3a 100644 --- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs +++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs @@ -55,11 +55,11 @@ pub struct BatchLayerWriter { } impl BatchLayerWriter { - pub async fn new(conf: &'static PageServerConf) -> anyhow::Result { - Ok(Self { + pub fn new(conf: &'static PageServerConf) -> Self { + Self { generated_layer_writers: Vec::new(), conf, - }) + } } pub fn add_unfinished_image_writer( @@ -209,6 +209,7 @@ impl<'a> SplitImageLayerWriter<'a> { ) -> anyhow::Result { Ok(Self { target_layer_size, + // XXX make this lazy like in SplitDeltaLayerWriter? inner: ImageLayerWriter::new( conf, timeline_id, @@ -223,7 +224,7 @@ impl<'a> SplitImageLayerWriter<'a> { conf, timeline_id, tenant_shard_id, - batches: BatchLayerWriter::new(conf).await?, + batches: BatchLayerWriter::new(conf), lsn, start_key, gate, @@ -319,7 +320,7 @@ pub struct SplitDeltaLayerWriter<'a> { } impl<'a> SplitDeltaLayerWriter<'a> { - pub async fn new( + pub fn new( conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, @@ -327,8 +328,8 @@ impl<'a> SplitDeltaLayerWriter<'a> { target_layer_size: u64, gate: &'a utils::sync::gate::Gate, cancel: CancellationToken, - ) -> anyhow::Result { - Ok(Self { + ) -> Self { + Self { target_layer_size, inner: None, conf, @@ -336,10 +337,10 @@ impl<'a> SplitDeltaLayerWriter<'a> { tenant_shard_id, lsn_range, last_key_written: Key::MIN, - batches: BatchLayerWriter::new(conf).await?, + batches: BatchLayerWriter::new(conf), gate, cancel, - }) + } } pub async fn put_value( @@ -510,9 +511,7 @@ mod tests { 4 * 
1024 * 1024, &tline.gate, tline.cancel.clone(), - ) - .await - .unwrap(); + ); image_writer .put_image(get_key(0), get_img(0), &ctx) @@ -590,9 +589,7 @@ mod tests { 4 * 1024 * 1024, &tline.gate, tline.cancel.clone(), - ) - .await - .unwrap(); + ); const N: usize = 2000; for i in 0..N { let i = i as u32; @@ -692,9 +689,7 @@ mod tests { 4 * 1024, &tline.gate, tline.cancel.clone(), - ) - .await - .unwrap(); + ); image_writer .put_image(get_key(0), get_img(0), &ctx) @@ -770,9 +765,7 @@ mod tests { 4 * 1024 * 1024, &tline.gate, tline.cancel.clone(), - ) - .await - .unwrap(); + ); for i in 0..N { let i = i as u32; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 7261ce783d..08bc6d4a59 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -763,7 +763,7 @@ pub(crate) enum CreateImageLayersError { PageReconstructError(#[source] PageReconstructError), #[error(transparent)] - Other(#[from] anyhow::Error), + Other(anyhow::Error), } impl From for CreateImageLayersError { @@ -5590,7 +5590,7 @@ impl Timeline { self.should_check_if_image_layers_required(lsn) }; - let mut batch_image_writer = BatchLayerWriter::new(self.conf).await?; + let mut batch_image_writer = BatchLayerWriter::new(self.conf); let mut all_generated = true; @@ -5694,7 +5694,8 @@ impl Timeline { self.cancel.clone(), ctx, ) - .await?; + .await + .map_err(CreateImageLayersError::Other)?; fail_point!("image-layer-writer-fail-before-finish", |_| { Err(CreateImageLayersError::Other(anyhow::anyhow!( @@ -5789,7 +5790,10 @@ impl Timeline { } } - let image_layers = batch_image_writer.finish(self, ctx).await?; + let image_layers = batch_image_writer + .finish(self, ctx) + .await + .map_err(CreateImageLayersError::Other)?; let mut guard = self.layers.write(LayerManagerLockHolder::Compaction).await; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 1b8e5f4b9c..02bc4f6bdf 100644 --- 
a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -3531,10 +3531,7 @@ impl Timeline { self.get_compaction_target_size(), &self.gate, self.cancel.clone(), - ) - .await - .context("failed to create delta layer writer") - .map_err(CompactionError::Other)?; + ); #[derive(Default)] struct RewritingLayers { @@ -4330,7 +4327,8 @@ impl TimelineAdaptor { self.timeline.cancel.clone(), ctx, ) - .await?; + .await + .map_err(CreateImageLayersError::Other)?; fail_point!("image-layer-writer-fail-before-finish", |_| { Err(CreateImageLayersError::Other(anyhow::anyhow!( @@ -4339,7 +4337,10 @@ impl TimelineAdaptor { }); let keyspace = KeySpace { - ranges: self.get_keyspace(key_range, lsn, ctx).await?, + ranges: self + .get_keyspace(key_range, lsn, ctx) + .await + .map_err(CreateImageLayersError::Other)?, }; // TODO set proper (stateful) start. The create_image_layer_for_rel_blocks function mostly let outcome = self @@ -4358,9 +4359,13 @@ impl TimelineAdaptor { unfinished_image_layer, } = outcome { - let (desc, path) = unfinished_image_layer.finish(ctx).await?; + let (desc, path) = unfinished_image_layer + .finish(ctx) + .await + .map_err(CreateImageLayersError::Other)?; let image_layer = - Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?; + Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path) + .map_err(CreateImageLayersError::Other)?; self.new_images.push(image_layer); } From a384d7d501077e6f3cbe32f6ee69454f51f4b2e5 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 30 Jun 2025 14:36:45 +0200 Subject: [PATCH 48/50] pageserver: assert no changes to shard identity (#12379) ## Problem Location config changes can currently result in changes to the shard identity. Such changes will cause data corruption, as seen with #12217. Resolves #12227. Requires #12377. ## Summary of changes Assert that the shard identity does not change on location config updates and on (re)attach. 
This is currently asserted with `critical!`, in case it misfires in production. Later, we should reject such requests with an error and turn this into a proper assertion. --- Cargo.lock | 1 + libs/pageserver_api/Cargo.toml | 5 +++-- libs/pageserver_api/src/shard.rs | 12 ++++++++++++ pageserver/src/http/routes.rs | 8 ++++++++ pageserver/src/tenant.rs | 4 ++++ pageserver/src/tenant/config.rs | 11 +++++++++++ pageserver/src/tenant/mgr.rs | 20 +++++++++++++++++--- pageserver/src/tenant/secondary.rs | 2 +- 8 files changed, 57 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 71e78243a6..9ef0a0ae0a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4421,6 +4421,7 @@ dependencies = [ "strum", "strum_macros", "thiserror 1.0.69", + "tracing", "tracing-utils", "utils", ] diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 6dc17b670b..7accbdabca 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -30,12 +30,13 @@ humantime-serde.workspace = true chrono = { workspace = true, features = ["serde"] } itertools.workspace = true storage_broker.workspace = true -camino = {workspace = true, features = ["serde1"]} +camino = { workspace = true, features = ["serde1"] } remote_storage.workspace = true postgres_backend.workspace = true -nix = {workspace = true, optional = true} +nix = { workspace = true, optional = true } reqwest.workspace = true rand.workspace = true +tracing.workspace = true tracing-utils.workspace = true once_cell.workspace = true diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index a9fe3dac43..5a13aace64 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -37,6 +37,7 @@ use std::hash::{Hash, Hasher}; pub use ::utils::shard::*; use postgres_ffi_types::forknum::INIT_FORKNUM; use serde::{Deserialize, Serialize}; +use utils::critical; use crate::key::Key; use crate::models::ShardParameters; @@ -188,6 +189,17 @@ impl 
ShardIdentity { } } + /// Asserts that the given shard identities are equal. Changes to shard parameters will likely + /// result in data corruption. + pub fn assert_equal(&self, other: ShardIdentity) { + if self != &other { + // TODO: for now, we're conservative and just log errors in production. Turn this into a + // real assertion when we're confident it doesn't misfire, and also reject requests that + // attempt to change it with an error response. + critical!("shard identity mismatch: {self:?} != {other:?}"); + } + } + fn is_broken(&self) -> bool { self.layout == LAYOUT_BROKEN } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index f770e420f0..119275f885 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1896,6 +1896,10 @@ async fn update_tenant_config_handler( ShardParameters::from(tenant.get_shard_identity()), ); + tenant + .get_shard_identity() + .assert_equal(location_conf.shard); // not strictly necessary since we construct it above + crate::tenant::TenantShard::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf) .await .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; @@ -1940,6 +1944,10 @@ async fn patch_tenant_config_handler( ShardParameters::from(tenant.get_shard_identity()), ); + tenant + .get_shard_identity() + .assert_equal(location_conf.shard); // not strictly necessary since we construct it above + crate::tenant::TenantShard::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf) .await .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 79bea4eb77..fcb18e8553 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4529,6 +4529,10 @@ impl TenantShard { Ok(toml_edit::de::from_str::(&config)?) } + /// Stores a tenant location config to disk. 
+ /// + /// NB: make sure to call `ShardIdentity::assert_equal` before persisting a new config, to avoid + /// changes to shard parameters that may result in data corruption. #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))] pub(super) async fn persist_tenant_config( conf: &'static PageServerConf, diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 46cc669400..67df767abd 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -12,6 +12,7 @@ use pageserver_api::models; use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}; use serde::{Deserialize, Serialize}; +use utils::critical; use utils::generation::Generation; #[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)] @@ -171,6 +172,16 @@ impl LocationConf { } } + // This should never happen. + // TODO: turn this into a proper assertion. + if stripe_size != self.shard.stripe_size { + critical!( + "stripe size mismatch: {} != {}", + self.shard.stripe_size, + stripe_size, + ); + } + self.shard.stripe_size = stripe_size; } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 248d92622e..95f5c60170 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -880,6 +880,9 @@ impl TenantManager { // phase of writing config and/or waiting for flush, before returning. 
match fast_path_taken { Some(FastPathModified::Attached(tenant)) => { + tenant + .shard_identity + .assert_equal(new_location_config.shard); TenantShard::persist_tenant_config( self.conf, &tenant_shard_id, @@ -914,7 +917,10 @@ impl TenantManager { return Ok(Some(tenant)); } - Some(FastPathModified::Secondary(_secondary_tenant)) => { + Some(FastPathModified::Secondary(secondary_tenant)) => { + secondary_tenant + .shard_identity + .assert_equal(new_location_config.shard); TenantShard::persist_tenant_config( self.conf, &tenant_shard_id, @@ -948,6 +954,10 @@ impl TenantManager { match slot_guard.get_old_value() { Some(TenantSlot::Attached(tenant)) => { + tenant + .shard_identity + .assert_equal(new_location_config.shard); + // The case where we keep a Tenant alive was covered above in the special case // for Attached->Attached transitions in the same generation. By this point, // if we see an attached tenant we know it will be discarded and should be @@ -981,9 +991,13 @@ impl TenantManager { // rather than assuming it to be empty. spawn_mode = SpawnMode::Eager; } - Some(TenantSlot::Secondary(state)) => { + Some(TenantSlot::Secondary(secondary_tenant)) => { + secondary_tenant + .shard_identity + .assert_equal(new_location_config.shard); + info!("Shutting down secondary tenant"); - state.shutdown().await; + secondary_tenant.shutdown().await; } Some(TenantSlot::InProgress(_)) => { // This should never happen: acquire_slot should error out diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 2fa0ed9be9..e06788543a 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -101,7 +101,7 @@ pub(crate) struct SecondaryTenant { // Secondary mode does not need the full shard identity or the pageserver_api::models::TenantConfig. 
However, // storing these enables us to report our full LocationConf, enabling convenient reconciliation // by the control plane (see [`Self::get_location_conf`]) - shard_identity: ShardIdentity, + pub(crate) shard_identity: ShardIdentity, tenant_conf: std::sync::Mutex, // Internal state used by the Downloader. From d0a4ae3e8f40c76eb475978aaba4d374525d5ee8 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 30 Jun 2025 14:44:17 +0200 Subject: [PATCH 49/50] pageserver: add gRPC LSN lease support (#12384) ## Problem The gRPC API does not provide LSN leases. ## Summary of changes * Add LSN lease support to the gRPC API. * Use gRPC LSN leases for static computes with `grpc://` connstrings. * Move `PageserverProtocol` into the `compute_api::spec` module and reuse it. --- Cargo.lock | 13 +- Cargo.toml | 1 + compute_tools/src/compute.rs | 19 +-- compute_tools/src/lsn_lease.rs | 164 +++++++++++-------- control_plane/src/bin/neon_local.rs | 4 +- control_plane/src/endpoint.rs | 27 +-- libs/compute_api/Cargo.toml | 1 + libs/compute_api/src/spec.rs | 44 +++++ pageserver/page_api/Cargo.toml | 1 + pageserver/page_api/proto/page_service.proto | 20 +++ pageserver/page_api/src/client.rs | 13 ++ pageserver/page_api/src/model.rs | 52 ++++++ pageserver/src/page_service.rs | 31 ++++ storage_controller/Cargo.toml | 1 + storage_controller/src/compute_hook.rs | 3 +- 15 files changed, 281 insertions(+), 113 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9ef0a0ae0a..e640e62909 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1279,6 +1279,7 @@ dependencies = [ "remote_storage", "serde", "serde_json", + "url", "utils", ] @@ -4480,6 +4481,7 @@ dependencies = [ "pageserver_api", "postgres_ffi_types", "prost 0.13.5", + "prost-types 0.13.5", "strum", "strum_macros", "thiserror 1.0.69", @@ -5157,7 +5159,7 @@ dependencies = [ "petgraph", "prettyplease", "prost 0.13.5", - "prost-types 0.13.3", + "prost-types 0.13.5", "regex", "syn 2.0.100", "tempfile", @@ -5200,9 +5202,9 @@ dependencies = [ 
[[package]] name = "prost-types" -version = "0.13.3" +version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4759aa0d3a6232fb8dbdb97b61de2c20047c68aca932c7ed76da9d788508d670" +checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16" dependencies = [ "prost 0.13.5", ] @@ -6809,6 +6811,7 @@ dependencies = [ "chrono", "clap", "clashmap", + "compute_api", "control_plane", "cron", "diesel", @@ -7642,7 +7645,7 @@ dependencies = [ "prettyplease", "proc-macro2", "prost-build 0.13.3", - "prost-types 0.13.3", + "prost-types 0.13.5", "quote", "syn 2.0.100", ] @@ -7654,7 +7657,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f9687bd5bfeafebdded2356950f278bba8226f0b32109537c4253406e09aafe1" dependencies = [ "prost 0.13.5", - "prost-types 0.13.3", + "prost-types 0.13.5", "tokio", "tokio-stream", "tonic 0.13.1", diff --git a/Cargo.toml b/Cargo.toml index aeb7976b6c..7728f6d8fe 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -152,6 +152,7 @@ pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointe procfs = "0.16" prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency prost = "0.13.5" +prost-types = "0.13.5" rand = "0.8" redis = { version = "0.29.2", features = ["tokio-rustls-comp", "keep-alive"] } regex = "1.10.2" diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 7566626d57..381f2d45ba 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1,4 +1,4 @@ -use anyhow::{Context, Result, anyhow}; +use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; use compute_api::privilege::Privilege; use compute_api::responses::{ @@ -6,7 +6,7 @@ use compute_api::responses::{ LfcPrewarmState, TlsConfig, }; use compute_api::spec::{ - ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent, + ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, 
ExtVersion, PageserverProtocol, PgIdent, }; use futures::StreamExt; use futures::future::join_all; @@ -1003,19 +1003,12 @@ impl ComputeNode { fn try_get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> { let spec = compute_state.pspec.as_ref().expect("spec must be set"); - // Detect the protocol scheme. If the URL doesn't have a scheme, assume libpq. let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap(); - let scheme = match Url::parse(shard0_connstr) { - Ok(url) => url.scheme().to_lowercase().to_string(), - Err(url::ParseError::RelativeUrlWithoutBase) => "postgresql".to_string(), - Err(err) => return Err(anyhow!("invalid connstring URL: {err}")), - }; - let started = Instant::now(); - let (connected, size) = match scheme.as_str() { - "postgresql" | "postgres" => self.try_get_basebackup_libpq(spec, lsn)?, - "grpc" => self.try_get_basebackup_grpc(spec, lsn)?, - scheme => return Err(anyhow!("unknown URL scheme {scheme}")), + + let (connected, size) = match PageserverProtocol::from_connstring(shard0_connstr)? { + PageserverProtocol::Libpq => self.try_get_basebackup_libpq(spec, lsn)?, + PageserverProtocol::Grpc => self.try_get_basebackup_grpc(spec, lsn)?, }; let mut state = self.state.lock().unwrap(); diff --git a/compute_tools/src/lsn_lease.rs b/compute_tools/src/lsn_lease.rs index 930dbc52b4..3346c18c0d 100644 --- a/compute_tools/src/lsn_lease.rs +++ b/compute_tools/src/lsn_lease.rs @@ -4,7 +4,9 @@ use std::thread; use std::time::{Duration, SystemTime}; use anyhow::{Result, bail}; -use compute_api::spec::ComputeMode; +use compute_api::spec::{ComputeMode, PageserverProtocol}; +use itertools::Itertools as _; +use pageserver_page_api as page_api; use postgres::{NoTls, SimpleQueryMessage}; use tracing::{info, warn}; use utils::id::{TenantId, TimelineId}; @@ -76,25 +78,17 @@ fn acquire_lsn_lease_with_retry( loop { // Note: List of pageservers is dynamic, need to re-read configs before each attempt. 
- let configs = { + let (connstrings, auth) = { let state = compute.state.lock().unwrap(); - let spec = state.pspec.as_ref().expect("spec must be set"); - - let conn_strings = spec.pageserver_connstr.split(','); - - conn_strings - .map(|connstr| { - let mut config = postgres::Config::from_str(connstr).expect("Invalid connstr"); - if let Some(storage_auth_token) = &spec.storage_auth_token { - config.password(storage_auth_token.clone()); - } - config - }) - .collect::>() + ( + spec.pageserver_connstr.clone(), + spec.storage_auth_token.clone(), + ) }; - let result = try_acquire_lsn_lease(tenant_id, timeline_id, lsn, &configs); + let result = + try_acquire_lsn_lease(&connstrings, auth.as_deref(), tenant_id, timeline_id, lsn); match result { Ok(Some(res)) => { return Ok(res); @@ -116,68 +110,104 @@ fn acquire_lsn_lease_with_retry( } } -/// Tries to acquire an LSN lease through PS page_service API. +/// Tries to acquire LSN leases on all Pageserver shards. fn try_acquire_lsn_lease( + connstrings: &str, + auth: Option<&str>, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn, - configs: &[postgres::Config], ) -> Result> { - fn get_valid_until( - config: &postgres::Config, - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, - lsn: Lsn, - ) -> Result> { - let mut client = config.connect(NoTls)?; - let cmd = format!("lease lsn {tenant_shard_id} {timeline_id} {lsn} "); - let res = client.simple_query(&cmd)?; - let msg = match res.first() { - Some(msg) => msg, - None => bail!("empty response"), - }; - let row = match msg { - SimpleQueryMessage::Row(row) => row, - _ => bail!("error parsing lsn lease response"), + let connstrings = connstrings.split(',').collect_vec(); + let shard_count = connstrings.len(); + let mut leases = Vec::new(); + + for (shard_number, &connstring) in connstrings.iter().enumerate() { + let tenant_shard_id = match shard_count { + 0 | 1 => TenantShardId::unsharded(tenant_id), + shard_count => TenantShardId { + tenant_id, + shard_number: 
ShardNumber(shard_number as u8), + shard_count: ShardCount::new(shard_count as u8), + }, }; - // Note: this will be None if a lease is explicitly not granted. - let valid_until_str = row.get("valid_until"); - - let valid_until = valid_until_str.map(|s| { - SystemTime::UNIX_EPOCH - .checked_add(Duration::from_millis(u128::from_str(s).unwrap() as u64)) - .expect("Time larger than max SystemTime could handle") - }); - Ok(valid_until) + let lease = match PageserverProtocol::from_connstring(connstring)? { + PageserverProtocol::Libpq => { + acquire_lsn_lease_libpq(connstring, auth, tenant_shard_id, timeline_id, lsn)? + } + PageserverProtocol::Grpc => { + acquire_lsn_lease_grpc(connstring, auth, tenant_shard_id, timeline_id, lsn)? + } + }; + leases.push(lease); } - let shard_count = configs.len(); + Ok(leases.into_iter().min().flatten()) +} - let valid_until = if shard_count > 1 { - configs - .iter() - .enumerate() - .map(|(shard_number, config)| { - let tenant_shard_id = TenantShardId { - tenant_id, - shard_count: ShardCount::new(shard_count as u8), - shard_number: ShardNumber(shard_number as u8), - }; - get_valid_until(config, tenant_shard_id, timeline_id, lsn) - }) - .collect::>>>()? - .into_iter() - .min() - .unwrap() - } else { - get_valid_until( - &configs[0], - TenantShardId::unsharded(tenant_id), - timeline_id, - lsn, - )? +/// Acquires an LSN lease on a single shard, using the libpq API. The connstring must use a +/// postgresql:// scheme. 
+fn acquire_lsn_lease_libpq( + connstring: &str, + auth: Option<&str>, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + lsn: Lsn, +) -> Result> { + let mut config = postgres::Config::from_str(connstring)?; + if let Some(auth) = auth { + config.password(auth); + } + let mut client = config.connect(NoTls)?; + let cmd = format!("lease lsn {tenant_shard_id} {timeline_id} {lsn} "); + let res = client.simple_query(&cmd)?; + let msg = match res.first() { + Some(msg) => msg, + None => bail!("empty response"), + }; + let row = match msg { + SimpleQueryMessage::Row(row) => row, + _ => bail!("error parsing lsn lease response"), }; + // Note: this will be None if a lease is explicitly not granted. + let valid_until_str = row.get("valid_until"); + + let valid_until = valid_until_str.map(|s| { + SystemTime::UNIX_EPOCH + .checked_add(Duration::from_millis(u128::from_str(s).unwrap() as u64)) + .expect("Time larger than max SystemTime could handle") + }); Ok(valid_until) } + +/// Acquires an LSN lease on a single shard, using the gRPC API. The connstring must use a +/// grpc:// scheme. +fn acquire_lsn_lease_grpc( + connstring: &str, + auth: Option<&str>, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + lsn: Lsn, +) -> Result> { + tokio::runtime::Handle::current().block_on(async move { + let mut client = page_api::Client::new( + connstring.to_string(), + tenant_shard_id.tenant_id, + timeline_id, + tenant_shard_id.to_index(), + auth.map(String::from), + None, + ) + .await?; + + let req = page_api::LeaseLsnRequest { lsn }; + match client.lease_lsn(req).await { + Ok(expires) => Ok(Some(expires)), + // Lease couldn't be acquired because the LSN has been garbage collected. 
+ Err(err) if err.code() == tonic::Code::FailedPrecondition => Ok(None), + Err(err) => Err(err.into()), + } + }) +} diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index c818d07fef..de98d46a55 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -16,9 +16,9 @@ use std::time::Duration; use anyhow::{Context, Result, anyhow, bail}; use clap::Parser; use compute_api::requests::ComputeClaimsScope; -use compute_api::spec::ComputeMode; +use compute_api::spec::{ComputeMode, PageserverProtocol}; use control_plane::broker::StorageBroker; -use control_plane::endpoint::{ComputeControlPlane, EndpointTerminateMode, PageserverProtocol}; +use control_plane::endpoint::{ComputeControlPlane, EndpointTerminateMode}; use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_ADDR, EndpointStorage}; use control_plane::local_env; use control_plane::local_env::{ diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index e3faa082db..5ea55b28ef 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -56,8 +56,8 @@ use compute_api::responses::{ TlsConfig, }; use compute_api::spec::{ - Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent, - RemoteExtSpec, Role, + Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PageserverProtocol, + PgIdent, RemoteExtSpec, Role, }; use jsonwebtoken::jwk::{ AlgorithmParameters, CommonParameters, EllipticCurve, Jwk, JwkSet, KeyAlgorithm, KeyOperations, @@ -373,29 +373,6 @@ impl std::fmt::Display for EndpointTerminateMode { } } -/// Protocol used to connect to a Pageserver. -#[derive(Clone, Copy, Debug)] -pub enum PageserverProtocol { - Libpq, - Grpc, -} - -impl PageserverProtocol { - /// Returns the URL scheme for the protocol, used in connstrings. 
- pub fn scheme(&self) -> &'static str { - match self { - Self::Libpq => "postgresql", - Self::Grpc => "grpc", - } - } -} - -impl Display for PageserverProtocol { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.write_str(self.scheme()) - } -} - impl Endpoint { fn from_dir_entry(entry: std::fs::DirEntry, env: &LocalEnv) -> Result { if !entry.file_type()?.is_dir() { diff --git a/libs/compute_api/Cargo.toml b/libs/compute_api/Cargo.toml index 81b0cd19a1..83cb639f0a 100644 --- a/libs/compute_api/Cargo.toml +++ b/libs/compute_api/Cargo.toml @@ -12,6 +12,7 @@ jsonwebtoken.workspace = true serde.workspace = true serde_json.workspace = true regex.workspace = true +url.workspace = true utils = { path = "../utils" } remote_storage = { version = "0.1", path = "../remote_storage/" } diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 0e23b70265..508040c5e5 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -4,11 +4,14 @@ //! provide it by calling the compute_ctl's `/compute_ctl` endpoint, or //! compute_ctl can fetch it by calling the control plane's API. use std::collections::HashMap; +use std::fmt::Display; +use anyhow::anyhow; use indexmap::IndexMap; use regex::Regex; use remote_storage::RemotePath; use serde::{Deserialize, Serialize}; +use url::Url; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; @@ -429,6 +432,47 @@ pub struct JwksSettings { pub jwt_audience: Option, } +/// Protocol used to connect to a Pageserver. Parsed from the connstring scheme. +#[derive(Clone, Copy, Debug, Default)] +pub enum PageserverProtocol { + /// The original protocol based on libpq and COPY. Uses postgresql:// or postgres:// scheme. + #[default] + Libpq, + /// A newer, gRPC-based protocol. Uses grpc:// scheme. + Grpc, +} + +impl PageserverProtocol { + /// Parses the protocol from a connstring scheme. Defaults to Libpq if no scheme is given. + /// Errors if the connstring is an invalid URL. 
+ pub fn from_connstring(connstring: &str) -> anyhow::Result { + let scheme = match Url::parse(connstring) { + Ok(url) => url.scheme().to_lowercase(), + Err(url::ParseError::RelativeUrlWithoutBase) => return Ok(Self::default()), + Err(err) => return Err(anyhow!("invalid connstring URL: {err}")), + }; + match scheme.as_str() { + "postgresql" | "postgres" => Ok(Self::Libpq), + "grpc" => Ok(Self::Grpc), + scheme => Err(anyhow!("invalid protocol scheme: {scheme}")), + } + } + + /// Returns the URL scheme for the protocol, for use in connstrings. + pub fn scheme(&self) -> &'static str { + match self { + Self::Libpq => "postgresql", + Self::Grpc => "grpc", + } + } +} + +impl Display for PageserverProtocol { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(self.scheme()) + } +} + #[cfg(test)] mod tests { use std::fs::File; diff --git a/pageserver/page_api/Cargo.toml b/pageserver/page_api/Cargo.toml index 42ee9b50e9..fbad8cf9d0 100644 --- a/pageserver/page_api/Cargo.toml +++ b/pageserver/page_api/Cargo.toml @@ -11,6 +11,7 @@ futures.workspace = true pageserver_api.workspace = true postgres_ffi_types.workspace = true prost.workspace = true +prost-types.workspace = true strum.workspace = true strum_macros.workspace = true thiserror.workspace = true diff --git a/pageserver/page_api/proto/page_service.proto b/pageserver/page_api/proto/page_service.proto index d06b2cfca5..1d6c230916 100644 --- a/pageserver/page_api/proto/page_service.proto +++ b/pageserver/page_api/proto/page_service.proto @@ -35,6 +35,8 @@ syntax = "proto3"; package page_api; +import "google/protobuf/timestamp.proto"; + service PageService { // Returns whether a relation exists. rpc CheckRelExists(CheckRelExistsRequest) returns (CheckRelExistsResponse); @@ -64,6 +66,10 @@ service PageService { // Fetches an SLRU segment. rpc GetSlruSegment (GetSlruSegmentRequest) returns (GetSlruSegmentResponse); + + // Acquires or extends a lease on the given LSN. 
This guarantees that the Pageserver won't garbage + // collect the LSN until the lease expires. Must be acquired on all relevant shards. + rpc LeaseLsn (LeaseLsnRequest) returns (LeaseLsnResponse); } // The LSN a request should read at. @@ -252,3 +258,17 @@ message GetSlruSegmentRequest { message GetSlruSegmentResponse { bytes segment = 1; } + +// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't garbage +// collect the LSN until the lease expires. Must be acquired on all relevant shards. +message LeaseLsnRequest { + // The LSN to lease. Can't be 0 or below the current GC cutoff. + uint64 lsn = 1; +} + +// Lease acquisition response. If the lease could not be granted because the LSN has already been +// garbage collected, a FailedPrecondition status will be returned instead. +message LeaseLsnResponse { + // The lease expiration time. + google.protobuf.Timestamp expires = 1; +} diff --git a/pageserver/page_api/src/client.rs b/pageserver/page_api/src/client.rs index 4b456787d2..65e41540b8 100644 --- a/pageserver/page_api/src/client.rs +++ b/pageserver/page_api/src/client.rs @@ -187,4 +187,17 @@ impl Client { let response = self.client.get_slru_segment(proto_req).await?; Ok(response.into_inner().try_into()?) } + + /// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't + /// garbage collect the LSN until the lease expires. Must be acquired on all relevant shards. + /// + /// Returns the lease expiration time, or a FailedPrecondition status if the lease could not be + /// acquired because the LSN has already been garbage collected. + pub async fn lease_lsn( + &mut self, + req: model::LeaseLsnRequest, + ) -> Result { + let req = proto::LeaseLsnRequest::from(req); + Ok(self.client.lease_lsn(req).await?.into_inner().try_into()?) 
+ } } diff --git a/pageserver/page_api/src/model.rs b/pageserver/page_api/src/model.rs index 0493f79781..4497fc6fc7 100644 --- a/pageserver/page_api/src/model.rs +++ b/pageserver/page_api/src/model.rs @@ -16,6 +16,7 @@ //! stream combinators without dealing with errors, and avoids validating the same message twice. use std::fmt::Display; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; use bytes::Bytes; use postgres_ffi_types::Oid; @@ -703,3 +704,54 @@ impl From for proto::GetSlruSegmentResponse { // SlruKind is defined in pageserver_api::reltag. pub type SlruKind = pageserver_api::reltag::SlruKind; + +/// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't garbage +/// collect the LSN until the lease expires. +pub struct LeaseLsnRequest { + /// The LSN to lease. + pub lsn: Lsn, +} + +impl TryFrom for LeaseLsnRequest { + type Error = ProtocolError; + + fn try_from(pb: proto::LeaseLsnRequest) -> Result { + if pb.lsn == 0 { + return Err(ProtocolError::Missing("lsn")); + } + Ok(Self { lsn: Lsn(pb.lsn) }) + } +} + +impl From for proto::LeaseLsnRequest { + fn from(request: LeaseLsnRequest) -> Self { + Self { lsn: request.lsn.0 } + } +} + +/// Lease expiration time. If the lease could not be granted because the LSN has already been +/// garbage collected, a FailedPrecondition status will be returned instead. 
+pub type LeaseLsnResponse = SystemTime; + +impl TryFrom for LeaseLsnResponse { + type Error = ProtocolError; + + fn try_from(pb: proto::LeaseLsnResponse) -> Result { + let expires = pb.expires.ok_or(ProtocolError::Missing("expires"))?; + UNIX_EPOCH + .checked_add(Duration::new(expires.seconds as u64, expires.nanos as u32)) + .ok_or_else(|| ProtocolError::invalid("expires", expires)) + } +} + +impl From for proto::LeaseLsnResponse { + fn from(response: LeaseLsnResponse) -> Self { + let expires = response.duration_since(UNIX_EPOCH).unwrap_or_default(); + Self { + expires: Some(prost_types::Timestamp { + seconds: expires.as_secs() as i64, + nanos: expires.subsec_nanos() as i32, + }), + } + } +} diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index c04f6e2b47..1d824ac846 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -14,6 +14,7 @@ use std::{io, str}; use anyhow::{Context as _, bail}; use bytes::{Buf as _, BufMut as _, BytesMut}; +use chrono::Utc; use futures::future::BoxFuture; use futures::{FutureExt, Stream}; use itertools::Itertools; @@ -3760,6 +3761,36 @@ impl proto::PageService for GrpcPageServiceHandler { let resp: page_api::GetSlruSegmentResponse = resp.segment; Ok(tonic::Response::new(resp.into())) } + + #[instrument(skip_all, fields(lsn))] + async fn lease_lsn( + &self, + req: tonic::Request, + ) -> Result, tonic::Status> { + let timeline = self.get_request_timeline(&req).await?; + let ctx = self.ctx.with_scope_timeline(&timeline); + + // Validate and convert the request, and decorate the span. + let req: page_api::LeaseLsnRequest = req.into_inner().try_into()?; + + span_record!(lsn=%req.lsn); + + // Attempt to acquire a lease. Return FailedPrecondition if the lease could not be granted. 
+ let lease_length = timeline.get_lsn_lease_length(); + let expires = match timeline.renew_lsn_lease(req.lsn, lease_length, &ctx) { + Ok(lease) => lease.valid_until, + Err(err) => return Err(tonic::Status::failed_precondition(format!("{err}"))), + }; + + // TODO: is this spammy? Move it compute-side? + info!( + "acquired lease for {} until {}", + req.lsn, + chrono::DateTime::::from(expires).to_rfc3339() + ); + + Ok(tonic::Response::new(expires.into())) + } } /// gRPC middleware layer that handles observability concerns: diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 3a0806b3b2..143f4241f4 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -20,6 +20,7 @@ camino.workspace = true chrono.workspace = true clap.workspace = true clashmap.workspace = true +compute_api.workspace = true cron.workspace = true fail.workspace = true futures.workspace = true diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index 0b5569b3d6..4f0837548f 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -5,7 +5,8 @@ use std::sync::Arc; use std::time::Duration; use anyhow::Context; -use control_plane::endpoint::{ComputeControlPlane, EndpointStatus, PageserverProtocol}; +use compute_api::spec::PageserverProtocol; +use control_plane::endpoint::{ComputeControlPlane, EndpointStatus}; use control_plane::local_env::LocalEnv; use futures::StreamExt; use hyper::StatusCode; From 8e216a3a59f23b45db635eb9a97a76e71c4f4e3b Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Mon, 30 Jun 2025 18:09:50 +0400 Subject: [PATCH 50/50] storcon: notify cplane on safekeeper membership change (#12390) ## Problem We don't notify cplane about safekeeper membership change yet. Without the notification the compute needs to know all the safekeepers on the cluster to be able to speak to them. 
Change notifications will allow us to avoid this.

- Closes: https://github.com/neondatabase/neon/issues/12188

## Summary of changes

- Implement `notify_safekeepers` method in `ComputeHook`
- Notify cplane about safekeepers in `safekeeper_migrate` handler.
- Update the test to make sure notifications work.

## Out of scope

- There is a `cplane_notified_generation` field in the `timelines` table in storcon's database. It's not needed now, so it's not updated in the PR. Probably we can remove it.
let safekeepers = parse_safekeepers(&args.safekeepers)?; - endpoint.reconfigure(pageservers, None, safekeepers).await?; + endpoint + .reconfigure(Some(pageservers), None, safekeepers, None) + .await?; } EndpointCmd::Stop(args) => { let endpoint_id = &args.endpoint_id; diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 5ea55b28ef..e6fe7d90a2 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -974,12 +974,11 @@ impl Endpoint { pub async fn reconfigure( &self, - pageservers: Vec<(PageserverProtocol, Host, u16)>, + pageservers: Option>, stripe_size: Option, safekeepers: Option>, + safekeeper_generation: Option, ) -> Result<()> { - anyhow::ensure!(!pageservers.is_empty(), "no pageservers provided"); - let (mut spec, compute_ctl_config) = { let config_path = self.endpoint_path().join("config.json"); let file = std::fs::File::open(config_path)?; @@ -991,16 +990,24 @@ impl Endpoint { let postgresql_conf = self.read_postgresql_conf()?; spec.cluster.postgresql_conf = Some(postgresql_conf); - let pageserver_connstr = Self::build_pageserver_connstr(&pageservers); - spec.pageserver_connstring = Some(pageserver_connstr); - if stripe_size.is_some() { - spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize); + // If pageservers are not specified, don't change them. + if let Some(pageservers) = pageservers { + anyhow::ensure!(!pageservers.is_empty(), "no pageservers provided"); + + let pageserver_connstr = Self::build_pageserver_connstr(&pageservers); + spec.pageserver_connstring = Some(pageserver_connstr); + if stripe_size.is_some() { + spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize); + } } // If safekeepers are not specified, don't change them. 
if let Some(safekeepers) = safekeepers { let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?; spec.safekeeper_connstrings = safekeeper_connstrings; + if let Some(g) = safekeeper_generation { + spec.safekeepers_generation = Some(g.into_inner()); + } } let client = reqwest::Client::builder() @@ -1038,6 +1045,24 @@ impl Endpoint { } } + pub async fn reconfigure_pageservers( + &self, + pageservers: Vec<(PageserverProtocol, Host, u16)>, + stripe_size: Option, + ) -> Result<()> { + self.reconfigure(Some(pageservers), stripe_size, None, None) + .await + } + + pub async fn reconfigure_safekeepers( + &self, + safekeepers: Vec, + generation: SafekeeperGeneration, + ) -> Result<()> { + self.reconfigure(None, None, Some(safekeepers), Some(generation)) + .await + } + pub async fn stop( &self, mode: EndpointTerminateMode, diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index 4f0837548f..ab37a207e4 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -14,11 +14,12 @@ use pageserver_api::config::DEFAULT_GRPC_LISTEN_PORT; use pageserver_api::controller_api::AvailabilityZone; use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId}; use postgres_connection::parse_host_port; +use safekeeper_api::membership::SafekeeperGeneration; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; use tracing::{Instrument, info_span}; use utils::backoff::{self}; -use utils::id::{NodeId, TenantId}; +use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; use crate::service::Config; @@ -36,7 +37,7 @@ struct UnshardedComputeHookTenant { preferred_az: Option, // Must hold this lock to send a notification. - send_lock: Arc>>, + send_lock: Arc>>, } struct ShardedComputeHookTenant { stripe_size: ShardStripeSize, @@ -49,7 +50,7 @@ struct ShardedComputeHookTenant { // Must hold this lock to send a notification. 
The contents represent // the last successfully sent notification, and are used to coalesce multiple // updates by only sending when there is a chance since our last successful send. - send_lock: Arc>>, + send_lock: Arc>>, } /// Represents our knowledge of the compute's state: we can update this when we get a @@ -57,9 +58,9 @@ struct ShardedComputeHookTenant { /// /// Should be wrapped in an Option<>, as we cannot always know the remote state. #[derive(PartialEq, Eq, Debug)] -struct ComputeRemoteState { +struct ComputeRemoteState { // The request body which was acked by the compute - request: ComputeHookNotifyRequest, + request: R, // Whether the cplane indicated that the state was applied to running computes, or just // persisted. In the Neon control plane, this is the difference between a 423 response (meaning @@ -67,6 +68,36 @@ struct ComputeRemoteState { applied: bool, } +type ComputeRemoteTenantState = ComputeRemoteState; +type ComputeRemoteTimelineState = ComputeRemoteState; + +/// The trait which define the handler-specific types and methods. +/// We have two implementations of this trait so far: +/// - [`ComputeHookTenant`] for tenant attach notifications ("/notify-attach") +/// - [`ComputeHookTimeline`] for safekeeper change notifications ("/notify-safekeepers") +trait ApiMethod { + /// Type of the key which identifies the resource. + /// It's either TenantId for tenant attach notifications, + /// or TenantTimelineId for safekeeper change notifications. 
+ type Key: std::cmp::Eq + std::hash::Hash + Clone; + + type Request: serde::Serialize + std::fmt::Debug; + + const API_PATH: &'static str; + + fn maybe_send( + &self, + key: Self::Key, + lock: Option>>>, + ) -> MaybeSendResult; + + async fn notify_local( + env: &LocalEnv, + cplane: &ComputeControlPlane, + req: &Self::Request, + ) -> Result<(), NotifyError>; +} + enum ComputeHookTenant { Unsharded(UnshardedComputeHookTenant), Sharded(ShardedComputeHookTenant), @@ -97,7 +128,7 @@ impl ComputeHookTenant { } } - fn get_send_lock(&self) -> &Arc>> { + fn get_send_lock(&self) -> &Arc>> { match self { Self::Unsharded(unsharded_tenant) => &unsharded_tenant.send_lock, Self::Sharded(sharded_tenant) => &sharded_tenant.send_lock, @@ -191,19 +222,136 @@ impl ComputeHookTenant { } } +/// The state of a timeline we need to notify the compute about. +struct ComputeHookTimeline { + generation: SafekeeperGeneration, + safekeepers: Vec, + + send_lock: Arc>>, +} + +impl ComputeHookTimeline { + /// Construct a new ComputeHookTimeline with the given safekeepers and generation. + fn new(generation: SafekeeperGeneration, safekeepers: Vec) -> Self { + Self { + generation, + safekeepers, + send_lock: Arc::default(), + } + } + + /// Update the state with a new SafekeepersUpdate. + /// Noop if the update generation is not greater than the current generation. 
+ fn update(&mut self, sk_update: SafekeepersUpdate) { + if sk_update.generation > self.generation { + self.generation = sk_update.generation; + self.safekeepers = sk_update.safekeepers; + } + } +} + +impl ApiMethod for ComputeHookTimeline { + type Key = TenantTimelineId; + type Request = NotifySafekeepersRequest; + + const API_PATH: &'static str = "notify-safekeepers"; + + fn maybe_send( + &self, + ttid: TenantTimelineId, + lock: Option>>, + ) -> MaybeSendNotifySafekeepersResult { + let locked = match lock { + Some(already_locked) => already_locked, + None => { + // Lock order: this _must_ be only a try_lock, because we are called inside of the [`ComputeHook::timelines`] lock. + let Ok(locked) = self.send_lock.clone().try_lock_owned() else { + return MaybeSendResult::AwaitLock((ttid, self.send_lock.clone())); + }; + locked + } + }; + + if locked + .as_ref() + .is_some_and(|s| s.request.generation >= self.generation) + { + return MaybeSendResult::Noop; + } + + MaybeSendResult::Transmit(( + NotifySafekeepersRequest { + tenant_id: ttid.tenant_id, + timeline_id: ttid.timeline_id, + generation: self.generation, + safekeepers: self.safekeepers.clone(), + }, + locked, + )) + } + + async fn notify_local( + _env: &LocalEnv, + cplane: &ComputeControlPlane, + req: &NotifySafekeepersRequest, + ) -> Result<(), NotifyError> { + let NotifySafekeepersRequest { + tenant_id, + timeline_id, + generation, + safekeepers, + } = req; + + for (endpoint_name, endpoint) in &cplane.endpoints { + if endpoint.tenant_id == *tenant_id + && endpoint.timeline_id == *timeline_id + && endpoint.status() == EndpointStatus::Running + { + tracing::info!("Reconfiguring safekeepers for endpoint {endpoint_name}"); + + let safekeepers = safekeepers.iter().map(|sk| sk.id).collect::>(); + + endpoint + .reconfigure_safekeepers(safekeepers, *generation) + .await + .map_err(NotifyError::NeonLocal)?; + } + } + + Ok(()) + } +} + #[derive(Serialize, Deserialize, Debug, Eq, PartialEq)] -struct 
ComputeHookNotifyRequestShard { +struct NotifyAttachRequestShard { node_id: NodeId, shard_number: ShardNumber, } /// Request body that we send to the control plane to notify it of where a tenant is attached #[derive(Serialize, Deserialize, Debug, Eq, PartialEq)] -struct ComputeHookNotifyRequest { +struct NotifyAttachRequest { tenant_id: TenantId, preferred_az: Option, stripe_size: Option, - shards: Vec, + shards: Vec, +} + +#[derive(Serialize, Deserialize, Debug, Eq, PartialEq, Clone)] +pub(crate) struct SafekeeperInfo { + pub id: NodeId, + /// Hostname of the safekeeper. + /// It exists for better debuggability. Might be missing. + /// Should not be used for anything else. + pub hostname: Option, +} + +#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)] +struct NotifySafekeepersRequest { + tenant_id: TenantId, + timeline_id: TimelineId, + generation: SafekeeperGeneration, + safekeepers: Vec, } /// Error type for attempts to call into the control plane compute notification hook @@ -235,42 +383,50 @@ pub(crate) enum NotifyError { NeonLocal(anyhow::Error), } -enum MaybeSendResult { +enum MaybeSendResult { // Please send this request while holding the lock, and if you succeed then write // the request into the lock. 
Transmit( ( - ComputeHookNotifyRequest, - tokio::sync::OwnedMutexGuard>, + R, + tokio::sync::OwnedMutexGuard>>, ), ), // Something requires sending, but you must wait for a current sender then call again - AwaitLock(Arc>>), + AwaitLock((K, Arc>>>)), // Nothing requires sending Noop, } -impl ComputeHookTenant { +type MaybeSendNotifyAttachResult = MaybeSendResult; +type MaybeSendNotifySafekeepersResult = MaybeSendResult; + +impl ApiMethod for ComputeHookTenant { + type Key = TenantId; + type Request = NotifyAttachRequest; + + const API_PATH: &'static str = "notify-attach"; + fn maybe_send( &self, tenant_id: TenantId, - lock: Option>>, - ) -> MaybeSendResult { + lock: Option>>, + ) -> MaybeSendNotifyAttachResult { let locked = match lock { Some(already_locked) => already_locked, None => { - // Lock order: this _must_ be only a try_lock, because we are called inside of the [`ComputeHook::state`] lock. + // Lock order: this _must_ be only a try_lock, because we are called inside of the [`ComputeHook::tenants`] lock. 
let Ok(locked) = self.get_send_lock().clone().try_lock_owned() else { - return MaybeSendResult::AwaitLock(self.get_send_lock().clone()); + return MaybeSendResult::AwaitLock((tenant_id, self.get_send_lock().clone())); }; locked } }; let request = match self { - Self::Unsharded(unsharded_tenant) => Some(ComputeHookNotifyRequest { + Self::Unsharded(unsharded_tenant) => Some(NotifyAttachRequest { tenant_id, - shards: vec![ComputeHookNotifyRequestShard { + shards: vec![NotifyAttachRequestShard { shard_number: ShardNumber(0), node_id: unsharded_tenant.node_id, }], @@ -283,12 +439,12 @@ impl ComputeHookTenant { Self::Sharded(sharded_tenant) if sharded_tenant.shards.len() == sharded_tenant.shard_count.count() as usize => { - Some(ComputeHookNotifyRequest { + Some(NotifyAttachRequest { tenant_id, shards: sharded_tenant .shards .iter() - .map(|(shard_number, node_id)| ComputeHookNotifyRequestShard { + .map(|(shard_number, node_id)| NotifyAttachRequestShard { shard_number: *shard_number, node_id: *node_id, }) @@ -333,98 +489,22 @@ impl ComputeHookTenant { } } } -} -/// The compute hook is a destination for notifications about changes to tenant:pageserver -/// mapping. It aggregates updates for the shards in a tenant, and when appropriate reconfigures -/// the compute connection string. -pub(super) struct ComputeHook { - config: Config, - state: std::sync::Mutex>, - authorization_header: Option, - - // Concurrency limiter, so that we do not overload the cloud control plane when updating - // large numbers of tenants (e.g. 
when failing over after a node failure) - api_concurrency: tokio::sync::Semaphore, - - // This lock is only used in testing enviroments, to serialize calls into neon_lock - neon_local_lock: tokio::sync::Mutex<()>, - - // We share a client across all notifications to enable connection re-use etc when - // sending large numbers of notifications - client: reqwest::Client, -} - -/// Callers may give us a list of these when asking us to send a bulk batch -/// of notifications in the background. This is a 'notification' in the sense of -/// other code notifying us of a shard's status, rather than being the final notification -/// that we send upwards to the control plane for the whole tenant. -pub(crate) struct ShardUpdate<'a> { - pub(crate) tenant_shard_id: TenantShardId, - pub(crate) node_id: NodeId, - pub(crate) stripe_size: ShardStripeSize, - pub(crate) preferred_az: Option>, -} - -impl ComputeHook { - pub(super) fn new(config: Config) -> anyhow::Result { - let authorization_header = config - .control_plane_jwt_token - .clone() - .map(|jwt| format!("Bearer {jwt}")); - - let mut client = reqwest::ClientBuilder::new().timeout(NOTIFY_REQUEST_TIMEOUT); - for cert in &config.ssl_ca_certs { - client = client.add_root_certificate(cert.clone()); - } - let client = client - .build() - .context("Failed to build http client for compute hook")?; - - Ok(Self { - state: Default::default(), - config, - authorization_header, - neon_local_lock: Default::default(), - api_concurrency: tokio::sync::Semaphore::new(API_CONCURRENCY), - client, - }) - } - - /// For test environments: use neon_local's LocalEnv to update compute - async fn do_notify_local( - &self, - reconfigure_request: &ComputeHookNotifyRequest, + async fn notify_local( + env: &LocalEnv, + cplane: &ComputeControlPlane, + req: &NotifyAttachRequest, ) -> Result<(), NotifyError> { - // neon_local updates are not safe to call concurrently, use a lock to serialize - // all calls to this function - let _locked = 
self.neon_local_lock.lock().await; - - let Some(repo_dir) = self.config.neon_local_repo_dir.as_deref() else { - tracing::warn!( - "neon_local_repo_dir not set, likely a bug in neon_local; skipping compute update" - ); - return Ok(()); - }; - let env = match LocalEnv::load_config(repo_dir) { - Ok(e) => e, - Err(e) => { - tracing::warn!("Couldn't load neon_local config, skipping compute update ({e})"); - return Ok(()); - } - }; - let cplane = - ComputeControlPlane::load(env.clone()).expect("Error loading compute control plane"); - let ComputeHookNotifyRequest { + let NotifyAttachRequest { tenant_id, shards, stripe_size, preferred_az: _preferred_az, - } = reconfigure_request; + } = req; for (endpoint_name, endpoint) in &cplane.endpoints { if endpoint.tenant_id == *tenant_id && endpoint.status() == EndpointStatus::Running { - tracing::info!("Reconfiguring endpoint {endpoint_name}"); + tracing::info!("Reconfiguring pageservers for endpoint {endpoint_name}"); let pageservers = shards .iter() @@ -446,7 +526,7 @@ impl ComputeHook { .collect::>(); endpoint - .reconfigure(pageservers, *stripe_size, None) + .reconfigure_pageservers(pageservers, *stripe_size) .await .map_err(NotifyError::NeonLocal)?; } @@ -454,11 +534,102 @@ impl ComputeHook { Ok(()) } +} - async fn do_notify_iteration( +/// The compute hook is a destination for notifications about changes to tenant:pageserver +/// mapping. It aggregates updates for the shards in a tenant, and when appropriate reconfigures +/// the compute connection string. +pub(super) struct ComputeHook { + config: Config, + tenants: std::sync::Mutex>, + timelines: std::sync::Mutex>, + authorization_header: Option, + + // Concurrency limiter, so that we do not overload the cloud control plane when updating + // large numbers of tenants (e.g. 
when failing over after a node failure) + api_concurrency: tokio::sync::Semaphore, + + // This lock is only used in testing enviroments, to serialize calls into neon_local + neon_local_lock: tokio::sync::Mutex<()>, + + // We share a client across all notifications to enable connection re-use etc when + // sending large numbers of notifications + client: reqwest::Client, +} + +/// Callers may give us a list of these when asking us to send a bulk batch +/// of notifications in the background. This is a 'notification' in the sense of +/// other code notifying us of a shard's status, rather than being the final notification +/// that we send upwards to the control plane for the whole tenant. +pub(crate) struct ShardUpdate<'a> { + pub(crate) tenant_shard_id: TenantShardId, + pub(crate) node_id: NodeId, + pub(crate) stripe_size: ShardStripeSize, + pub(crate) preferred_az: Option>, +} + +pub(crate) struct SafekeepersUpdate { + pub(crate) tenant_id: TenantId, + pub(crate) timeline_id: TimelineId, + pub(crate) generation: SafekeeperGeneration, + pub(crate) safekeepers: Vec, +} + +impl ComputeHook { + pub(super) fn new(config: Config) -> anyhow::Result { + let authorization_header = config + .control_plane_jwt_token + .clone() + .map(|jwt| format!("Bearer {jwt}")); + + let mut client = reqwest::ClientBuilder::new().timeout(NOTIFY_REQUEST_TIMEOUT); + for cert in &config.ssl_ca_certs { + client = client.add_root_certificate(cert.clone()); + } + let client = client + .build() + .context("Failed to build http client for compute hook")?; + + Ok(Self { + tenants: Default::default(), + timelines: Default::default(), + config, + authorization_header, + neon_local_lock: Default::default(), + api_concurrency: tokio::sync::Semaphore::new(API_CONCURRENCY), + client, + }) + } + + /// For test environments: use neon_local's LocalEnv to update compute + async fn do_notify_local(&self, req: &M::Request) -> Result<(), NotifyError> { + // neon_local updates are not safe to call concurrently, 
use a lock to serialize + // all calls to this function + let _locked = self.neon_local_lock.lock().await; + + let Some(repo_dir) = self.config.neon_local_repo_dir.as_deref() else { + tracing::warn!( + "neon_local_repo_dir not set, likely a bug in neon_local; skipping compute update" + ); + return Ok(()); + }; + let env = match LocalEnv::load_config(repo_dir) { + Ok(e) => e, + Err(e) => { + tracing::warn!("Couldn't load neon_local config, skipping compute update ({e})"); + return Ok(()); + } + }; + let cplane = + ComputeControlPlane::load(env.clone()).expect("Error loading compute control plane"); + + M::notify_local(&env, &cplane, req).await + } + + async fn do_notify_iteration( &self, url: &String, - reconfigure_request: &ComputeHookNotifyRequest, + reconfigure_request: &Req, cancel: &CancellationToken, ) -> Result<(), NotifyError> { let req = self.client.request(reqwest::Method::PUT, url); @@ -480,9 +651,7 @@ impl ComputeHook { }; // Treat all 2xx responses as success - if response.status() >= reqwest::StatusCode::OK - && response.status() < reqwest::StatusCode::MULTIPLE_CHOICES - { + if response.status().is_success() { if response.status() != reqwest::StatusCode::OK { // Non-200 2xx response: it doesn't make sense to retry, but this is unexpected, so // log a warning. 
@@ -533,10 +702,10 @@ impl ComputeHook { } } - async fn do_notify( + async fn do_notify( &self, url: &String, - reconfigure_request: &ComputeHookNotifyRequest, + reconfigure_request: &R, cancel: &CancellationToken, ) -> Result<(), NotifyError> { // We hold these semaphore units across all retries, rather than only across each @@ -568,13 +737,13 @@ impl ComputeHook { } /// Synchronous phase: update the per-tenant state for the next intended notification - fn notify_prepare(&self, shard_update: ShardUpdate) -> MaybeSendResult { - let mut state_locked = self.state.lock().unwrap(); + fn notify_attach_prepare(&self, shard_update: ShardUpdate) -> MaybeSendNotifyAttachResult { + let mut tenants_locked = self.tenants.lock().unwrap(); use std::collections::hash_map::Entry; let tenant_shard_id = shard_update.tenant_shard_id; - let tenant = match state_locked.entry(tenant_shard_id.tenant_id) { + let tenant = match tenants_locked.entry(tenant_shard_id.tenant_id) { Entry::Vacant(e) => { let ShardUpdate { tenant_shard_id, @@ -598,10 +767,37 @@ impl ComputeHook { tenant.maybe_send(tenant_shard_id.tenant_id, None) } - async fn notify_execute( + fn notify_safekeepers_prepare( &self, - maybe_send_result: MaybeSendResult, - tenant_shard_id: TenantShardId, + safekeepers_update: SafekeepersUpdate, + ) -> MaybeSendNotifySafekeepersResult { + let mut timelines_locked = self.timelines.lock().unwrap(); + + let ttid = TenantTimelineId { + tenant_id: safekeepers_update.tenant_id, + timeline_id: safekeepers_update.timeline_id, + }; + + use std::collections::hash_map::Entry; + let timeline = match timelines_locked.entry(ttid) { + Entry::Vacant(e) => e.insert(ComputeHookTimeline::new( + safekeepers_update.generation, + safekeepers_update.safekeepers, + )), + Entry::Occupied(e) => { + let timeline = e.into_mut(); + timeline.update(safekeepers_update); + timeline + } + }; + + timeline.maybe_send(ttid, None) + } + + async fn notify_execute( + &self, + state: &std::sync::Mutex>, + 
maybe_send_result: MaybeSendResult, cancel: &CancellationToken, ) -> Result<(), NotifyError> { // Process result: we may get an update to send, or we may have to wait for a lock @@ -610,7 +806,7 @@ impl ComputeHook { MaybeSendResult::Noop => { return Ok(()); } - MaybeSendResult::AwaitLock(send_lock) => { + MaybeSendResult::AwaitLock((key, send_lock)) => { let send_locked = tokio::select! { guard = send_lock.lock_owned() => {guard}, _ = cancel.cancelled() => { @@ -621,11 +817,11 @@ impl ComputeHook { // Lock order: maybe_send is called within the `[Self::state]` lock, and takes the send lock, but here // we have acquired the send lock and take `[Self::state]` lock. This is safe because maybe_send only uses // try_lock. - let state_locked = self.state.lock().unwrap(); - let Some(tenant) = state_locked.get(&tenant_shard_id.tenant_id) else { + let state_locked = state.lock().unwrap(); + let Some(resource_state) = state_locked.get(&key) else { return Ok(()); }; - match tenant.maybe_send(tenant_shard_id.tenant_id, Some(send_locked)) { + match resource_state.maybe_send(key, Some(send_locked)) { MaybeSendResult::AwaitLock(_) => { unreachable!("We supplied lock guard") } @@ -644,14 +840,18 @@ impl ComputeHook { .control_plane_url .as_ref() .map(|control_plane_url| { - format!("{}/notify-attach", control_plane_url.trim_end_matches('/')) + format!( + "{}/{}", + control_plane_url.trim_end_matches('/'), + M::API_PATH + ) }); // We validate this at startup let notify_url = compute_hook_url.as_ref().unwrap(); self.do_notify(notify_url, &request, cancel).await } else { - self.do_notify_local(&request).await.map_err(|e| { + self.do_notify_local::(&request).await.map_err(|e| { // This path is for testing only, so munge the error into our prod-style error type. 
tracing::error!("neon_local notification hook failed: {e}"); NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR) @@ -687,7 +887,7 @@ impl ComputeHook { /// Infallible synchronous fire-and-forget version of notify(), that sends its results to /// a channel. Something should consume the channel and arrange to try notifying again /// if something failed. - pub(super) fn notify_background( + pub(super) fn notify_attach_background( self: &Arc, notifications: Vec, result_tx: tokio::sync::mpsc::Sender>, @@ -696,7 +896,7 @@ impl ComputeHook { let mut maybe_sends = Vec::new(); for shard_update in notifications { let tenant_shard_id = shard_update.tenant_shard_id; - let maybe_send_result = self.notify_prepare(shard_update); + let maybe_send_result = self.notify_attach_prepare(shard_update); maybe_sends.push((tenant_shard_id, maybe_send_result)) } @@ -715,10 +915,10 @@ impl ComputeHook { async move { this - .notify_execute(maybe_send_result, tenant_shard_id, &cancel) + .notify_execute(&this.tenants, maybe_send_result, &cancel) .await.map_err(|e| (tenant_shard_id, e)) }.instrument(info_span!( - "notify_background", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug() + "notify_attach_background", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug() )) }) .buffered(API_CONCURRENCY); @@ -761,14 +961,23 @@ impl ComputeHook { /// ensuring that they eventually call again to ensure that the compute is eventually notified of /// the proper pageserver nodes for a tenant. 
#[tracing::instrument(skip_all, fields(tenant_id=%shard_update.tenant_shard_id.tenant_id, shard_id=%shard_update.tenant_shard_id.shard_slug(), node_id))] - pub(super) async fn notify<'a>( + pub(super) async fn notify_attach<'a>( &self, shard_update: ShardUpdate<'a>, cancel: &CancellationToken, ) -> Result<(), NotifyError> { - let tenant_shard_id = shard_update.tenant_shard_id; - let maybe_send_result = self.notify_prepare(shard_update); - self.notify_execute(maybe_send_result, tenant_shard_id, cancel) + let maybe_send_result = self.notify_attach_prepare(shard_update); + self.notify_execute(&self.tenants, maybe_send_result, cancel) + .await + } + + pub(super) async fn notify_safekeepers( + &self, + safekeepers_update: SafekeepersUpdate, + cancel: &CancellationToken, + ) -> Result<(), NotifyError> { + let maybe_send_result = self.notify_safekeepers_prepare(safekeepers_update); + self.notify_execute(&self.timelines, maybe_send_result, cancel) .await } @@ -784,8 +993,8 @@ impl ComputeHook { ) { use std::collections::hash_map::Entry; - let mut state_locked = self.state.lock().unwrap(); - match state_locked.entry(tenant_shard_id.tenant_id) { + let mut tenants_locked = self.tenants.lock().unwrap(); + match tenants_locked.entry(tenant_shard_id.tenant_id) { Entry::Vacant(_) => { // This is a valid but niche case, where the tenant was previously attached // as a Secondary location and then detached, so has no previously notified diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 92844c9c7b..a2fba0fa56 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -65,7 +65,7 @@ pub(super) struct Reconciler { pub(crate) compute_hook: Arc, /// To avoid stalling if the cloud control plane is unavailable, we may proceed - /// past failures in [`ComputeHook::notify`], but we _must_ remember that we failed + /// past failures in [`ComputeHook::notify_attach`], but we _must_ remember that we failed /// so 
that we can set [`crate::tenant_shard::TenantShard::pending_compute_notification`] to ensure a later retry. pub(crate) compute_notify_failure: bool, @@ -1023,7 +1023,7 @@ impl Reconciler { if let Some(node) = &self.intent.attached { let result = self .compute_hook - .notify( + .notify_attach( compute_hook::ShardUpdate { tenant_shard_id: self.tenant_shard_id, node_id: node.get_id(), diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index e0b13c4e63..e4c494db8f 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -878,18 +878,18 @@ impl Service { // Emit compute hook notifications for all tenants which are already stably attached. Other tenants // will emit compute hook notifications when they reconcile. // - // Ordering: our calls to notify_background synchronously establish a relative order for these notifications vs. any later + // Ordering: our calls to notify_attach_background synchronously establish a relative order for these notifications vs. any later // calls into the ComputeHook for the same tenant: we can leave these to run to completion in the background and any later // calls will be correctly ordered wrt these. // - // Concurrency: we call notify_background for all tenants, which will create O(N) tokio tasks, but almost all of them + // Concurrency: we call notify_attach_background for all tenants, which will create O(N) tokio tasks, but almost all of them // will just wait on the ComputeHook::API_CONCURRENCY semaphore immediately, so very cheap until they get that semaphore // unit and start doing I/O. 
tracing::info!( "Sending {} compute notifications", compute_notifications.len() ); - self.compute_hook.notify_background( + self.compute_hook.notify_attach_background( compute_notifications, bg_compute_notify_result_tx.clone(), &self.cancel, @@ -6281,7 +6281,7 @@ impl Service { for (child_id, child_ps, stripe_size) in child_locations { if let Err(e) = self .compute_hook - .notify( + .notify_attach( compute_hook::ShardUpdate { tenant_shard_id: child_id, node_id: child_ps, diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index fc33a24198..cf48b007b2 100644 --- a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -4,6 +4,7 @@ use std::sync::Arc; use std::time::Duration; use super::safekeeper_reconciler::ScheduleRequest; +use crate::compute_hook; use crate::heartbeater::SafekeeperState; use crate::id_lock_map::trace_shared_lock; use crate::metrics; @@ -1198,7 +1199,11 @@ impl Service { // 4. Call PUT configuration on safekeepers from the current set, // delivering them joint_conf. - // TODO(diko): need to notify cplane with an updated set of safekeepers. + // Notify cplane/compute about the membership change BEFORE changing the membership on safekeepers. + // This way the compute will know about new safekeepers from joint_config before we require to + // collect a quorum from them. + self.cplane_notify_safekeepers(tenant_id, timeline_id, &joint_config) + .await?; let results = self .tenant_timeline_set_membership_quorum( @@ -1305,8 +1310,55 @@ impl Service { ) .await?; - // TODO(diko): need to notify cplane with an updated set of safekeepers. + // Notify cplane/compute about the membership change AFTER changing the membership on safekeepers. + // This way the compute will stop talking to excluded safekeepers only after we stop requiring to + // collect a quorum from them. 
+ self.cplane_notify_safekeepers(tenant_id, timeline_id, &new_conf) + .await?; Ok(()) } + + /// Notify cplane about safekeeper membership change. + /// The cplane will receive a joint set of safekeepers as a safekeeper list. + async fn cplane_notify_safekeepers( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + mconf: &membership::Configuration, + ) -> Result<(), ApiError> { + let mut safekeepers = Vec::new(); + let mut ids: HashSet<_> = HashSet::new(); + + for member in mconf + .members + .m + .iter() + .chain(mconf.new_members.iter().flat_map(|m| m.m.iter())) + { + if ids.insert(member.id) { + safekeepers.push(compute_hook::SafekeeperInfo { + id: member.id, + hostname: Some(member.host.clone()), + }); + } + } + + self.compute_hook + .notify_safekeepers( + compute_hook::SafekeepersUpdate { + tenant_id, + timeline_id, + generation: mconf.generation, + safekeepers, + }, + &self.cancel, + ) + .await + .map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!( + "failed to notify cplane about safekeeper membership change: {err}" + )) + }) + } } diff --git a/test_runner/regress/test_safekeeper_migration.py b/test_runner/regress/test_safekeeper_migration.py index f67b6afc95..057371175c 100644 --- a/test_runner/regress/test_safekeeper_migration.py +++ b/test_runner/regress/test_safekeeper_migration.py @@ -32,10 +32,13 @@ def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder): ) ep = env.endpoints.create("main", tenant_id=env.initial_tenant) - # We specify all safekeepers, so compute will connect to all of them. - # Only those from the current membership configuration will be used. - # TODO(diko): set only current safekeepers when cplane notify is implemented. 
- ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3]) + + mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline) + assert mconf["new_sk_set"] is None + assert len(mconf["sk_set"]) == 1 + assert mconf["generation"] == 1 + + ep.start(safekeeper_generation=1, safekeepers=mconf["sk_set"]) ep.safe_psql("CREATE EXTENSION neon_test_utils;") ep.safe_psql("CREATE TABLE t(a int)") @@ -58,7 +61,16 @@ def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder): assert ep.safe_psql("SELECT * FROM t") == [(i,) for i in range(1, 4)] + # 1 initial generation + 2 migrations on each loop iteration. + expected_gen = 1 + 2 * 3 + + mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline) + assert mconf["generation"] == expected_gen + + assert ep.safe_psql("SHOW neon.safekeepers")[0][0].startswith(f"g#{expected_gen}:") + + # Restart and check again to make sure data is persistent. ep.stop() - ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3]) + ep.start(safekeeper_generation=1, safekeepers=[3]) assert ep.safe_psql("SELECT * FROM t") == [(i,) for i in range(1, 4)]